dockerize app

phact 2025-07-11 02:02:18 -04:00
parent 39efea8612
commit 84c070181c
6 changed files with 131 additions and 9 deletions

Dockerfile.app (new file, 58 lines)
@@ -0,0 +1,58 @@
FROM node:18-slim

# Install Python, uv, and curl
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install uv
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy Python dependencies
COPY pyproject.toml uv.lock ./
RUN uv sync

# Copy Python source
COPY src/ ./src/

# Copy sample document and warmup script
COPY documents/2506.08231v1.pdf ./
COPY warm_up_docling.py ./
RUN uv run python warm_up_docling.py && rm warm_up_docling.py 2506.08231v1.pdf

# Copy frontend dependencies
COPY frontend/package*.json ./frontend/
RUN cd frontend && npm install

# Copy frontend source
COPY frontend/ ./frontend/

# Build frontend
RUN cd frontend && npm run build

# Create startup script
RUN echo '#!/bin/bash\n\
set -e\n\
echo "Starting Python backend..."\n\
uv run python src/app.py &\n\
BACKEND_PID=$!\n\
echo "Waiting for backend to be ready..."\n\
until curl -f http://localhost:8000/search -X POST -H "Content-Type: application/json" -d "{\"query\":\"test\"}" > /dev/null 2>&1; do\n\
echo "Backend not ready yet, waiting..."\n\
sleep 2\n\
done\n\
echo "Backend is ready! Starting Frontend..."\n\
cd frontend && npm start &\n\
wait' > /app/start.sh && chmod +x /app/start.sh

# Expose only frontend port
EXPOSE 3000

# Start both services
CMD ["/app/start.sh"]
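The readiness probe buried in the echo-built start.sh is awkward to iterate on. The same check, sketched in Python as a hypothetical standalone helper (not part of this commit; it assumes the backend listens on localhost:8000 and that /search accepts the JSON body the script above posts):

# wait_for_backend.py - a sketch of the readiness probe start.sh performs
# with curl; assumes POST /search on port 8000 accepts {"query": "test"}.
import json
import time
import urllib.error
import urllib.request

def wait_for_backend(url: str = "http://localhost:8000/search",
                     delay: float = 2.0, max_attempts: int = 150) -> None:
    payload = json.dumps({"query": "test"}).encode()
    req = urllib.request.Request(
        url, data=payload, headers={"Content-Type": "application/json"}
    )
    for attempt in range(1, max_attempts + 1):
        try:
            # urlopen raises on connection errors and non-2xx responses,
            # so falling through means the backend answered successfully
            with urllib.request.urlopen(req, timeout=5):
                print("Backend is ready!")
                return
        except (urllib.error.URLError, OSError):
            print(f"Backend not ready yet (attempt {attempt}), waiting...")
            time.sleep(delay)
    raise RuntimeError("backend never became ready")

if __name__ == "__main__":
    wait_for_backend()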

@@ -23,3 +23,25 @@ services:
    ports:
      - "5601:5601"
+  gendb:
+    build:
+      context: .
+      dockerfile: Dockerfile.app
+    container_name: gendb-app
+    depends_on:
+      - opensearch
+    environment:
+      - OPENSEARCH_HOST=opensearch
+      - OPENSEARCH_PORT=9200
+      - OPENSEARCH_USERNAME=admin
+      - OPENSEARCH_PASSWORD=OSisgendb1!
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./src:/app/src
+      - ./frontend/src:/app/frontend/src
+      - ./pyproject.toml:/app/pyproject.toml
+      - ./uv.lock:/app/uv.lock
+      - ./documents:/app/documents
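Note that depends_on only orders container startup; it does not wait for OpenSearch to actually accept connections. That gap is covered by the wait_for_opensearch retry loop added to the backend further down in this commit.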

@@ -9,6 +9,10 @@ const nextConfig: NextConfig = {
      },
    ];
  },
+  // Increase timeout for API routes
+  experimental: {
+    proxyTimeout: 300000, // 5 minutes
+  },
};

export default nextConfig;
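The raised proxyTimeout presumably keeps Next.js rewrites to the Python backend from timing out during long docling conversions, which can run for minutes on large PDFs.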

@@ -11,7 +11,7 @@ export default function AdminPage() {
  const [fileUploadLoading, setFileUploadLoading] = useState(false)
  const [pathUploadLoading, setPathUploadLoading] = useState(false)
  const [selectedFile, setSelectedFile] = useState<File | null>(null)
-  const [folderPath, setFolderPath] = useState("")
+  const [folderPath, setFolderPath] = useState("/app/documents/")
  const [uploadStatus, setUploadStatus] = useState<string>("")

  const handleFileUpload = async (e: React.FormEvent) => {
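The new default lines up with the ./documents bind mount (/app/documents) in docker-compose above, so path-based uploads work inside the container without typing a path.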

@@ -8,6 +8,7 @@ os.environ['USE_CPU_ONLY'] = 'true'
import hashlib
import tempfile
import asyncio
+import time
from starlette.applications import Starlette
from starlette.requests import Request
@@ -25,14 +26,19 @@ from openai import OpenAI
converter = DocumentConverter()  # basic converter; tweak via PipelineOptions if you need OCR, etc.

# Initialize Async OpenSearch (adjust hosts/auth as needed)
+opensearch_host = os.getenv("OPENSEARCH_HOST", "localhost")
+opensearch_port = int(os.getenv("OPENSEARCH_PORT", "9200"))
+opensearch_username = os.getenv("OPENSEARCH_USERNAME", "admin")
+opensearch_password = os.getenv("OPENSEARCH_PASSWORD", "OSisgendb1!")
es = AsyncOpenSearch(
-    hosts=[{"host": "localhost", "port": 9200}],
+    hosts=[{"host": opensearch_host, "port": opensearch_port}],
    connection_class=AIOHttpConnection,
    scheme="https",
    use_ssl=True,
    verify_certs=False,
    ssl_assert_fingerprint=None,
-    http_auth=("admin","OSisgendb1!"),
+    http_auth=(opensearch_username, opensearch_password),
    http_compress=True,
)
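A standalone connectivity check is handy for verifying this env-driven wiring before the full app boots. A minimal sketch (not part of this commit) mirroring the client configuration above:

# check_opensearch.py - a sketch of an env-driven connectivity check,
# using the same opensearch-py async client settings as app.py.
import asyncio
import os

from opensearchpy import AIOHttpConnection, AsyncOpenSearch

async def main() -> None:
    es = AsyncOpenSearch(
        hosts=[{
            "host": os.getenv("OPENSEARCH_HOST", "localhost"),
            "port": int(os.getenv("OPENSEARCH_PORT", "9200")),
        }],
        connection_class=AIOHttpConnection,
        use_ssl=True,
        verify_certs=False,  # dev-only: the compose OpenSearch uses a self-signed cert
        http_auth=(
            os.getenv("OPENSEARCH_USERNAME", "admin"),
            os.getenv("OPENSEARCH_PASSWORD", "OSisgendb1!"),
        ),
    )
    try:
        info = await es.info()
        print("OpenSearch is up, version:", info["version"]["number"])
    finally:
        await es.close()

asyncio.run(main())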
@@ -71,7 +77,26 @@ index_body = {
client = patch_openai_with_mcp(OpenAI())  # Get the patched client back

+async def wait_for_opensearch():
+    """Wait for OpenSearch to be ready with retries"""
+    max_retries = 30
+    retry_delay = 2
+    for attempt in range(max_retries):
+        try:
+            await es.info()
+            print("OpenSearch is ready!")
+            return
+        except Exception as e:
+            print(f"Attempt {attempt + 1}/{max_retries}: OpenSearch not ready yet ({e})")
+            if attempt < max_retries - 1:
+                await asyncio.sleep(retry_delay)
+            else:
+                raise Exception("OpenSearch failed to become ready")
+
async def init_index():
+    await wait_for_opensearch()
    if not await es.indices.exists(index=INDEX_NAME):
        await es.indices.create(index=INDEX_NAME, body=index_body)
        print(f"Created index '{INDEX_NAME}'")
@@ -133,9 +158,9 @@ async def process_file_common(file_path: str, file_hash: str = None):
                sha256.update(chunk)
        file_hash = sha256.hexdigest()

-    #exists = await es.exists(index=INDEX_NAME, id=file_hash)
-    #if exists:
-    #    return {"status": "unchanged", "id": file_hash}
+    exists = await es.exists(index=INDEX_NAME, id=file_hash)
+    if exists:
+        return {"status": "unchanged", "id": file_hash}

    # convert and extract
    # TODO: Check if docling can handle in-memory bytes instead of file path
@@ -186,9 +211,9 @@ async def upload(request: Request):
            tmp.flush()
            file_hash = sha256.hexdigest()

-            #exists = await es.exists(index=INDEX_NAME, id=file_hash)
-            #if exists:
-            #    return JSONResponse({"status": "unchanged", "id": file_hash})
+            exists = await es.exists(index=INDEX_NAME, id=file_hash)
+            if exists:
+                return JSONResponse({"status": "unchanged", "id": file_hash})

            result = await process_file_common(tmp.name, file_hash)
            return JSONResponse(result)
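Both the path-based and upload endpoints now dedupe on a SHA-256 of the file bytes, reusing the digest as the OpenSearch document id. A minimal sketch of that hashing step (hypothetical helper name; chunked reads as in the handlers above):

# a sketch of the content hash used as the OpenSearch document id;
# hashing in fixed-size chunks keeps memory flat for large PDFs.
import hashlib

def file_sha256(path: str, chunk_size: int = 1 << 20) -> str:
    sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            sha256.update(chunk)
    return sha256.hexdigest()

# Identical bytes always map to the same id, so re-uploading an
# unchanged document short-circuits with {"status": "unchanged"}.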

warm_up_docling.py (new file, 13 lines)

@@ -0,0 +1,13 @@
from docling.document_converter import DocumentConverter

print('Warming up docling models...')

try:
    # Use the sample document to warm up docling
    test_file = "/app/2506.08231v1.pdf"
    print(f'Using {test_file} to warm up docling...')
    DocumentConverter().convert(test_file)
    print('Docling models warmed up successfully')
except Exception as e:
    print(f'Docling warm-up completed with: {e}')
    # This is expected - we just want to trigger the model downloads