From 84c070181c8f4c3c42b6cfcfa6cbb621aa87e1a1 Mon Sep 17 00:00:00 2001
From: phact
Date: Fri, 11 Jul 2025 02:02:18 -0400
Subject: [PATCH] dockerize app

---
 Dockerfile.app                  | 58 +++++++++++++++++++++++++++++++++
 docker-compose.yml              | 22 +++++++++++++
 frontend/next.config.ts         |  4 +++
 frontend/src/app/admin/page.tsx |  2 +-
 src/app.py                      | 41 ++++++++++++++++++-----
 warm_up_docling.py              | 13 ++++++++
 6 files changed, 131 insertions(+), 9 deletions(-)
 create mode 100644 Dockerfile.app
 create mode 100644 warm_up_docling.py

diff --git a/Dockerfile.app b/Dockerfile.app
new file mode 100644
index 00000000..1aefd66e
--- /dev/null
+++ b/Dockerfile.app
@@ -0,0 +1,58 @@
+FROM node:18-slim
+
+# Install Python, uv, and curl
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install uv
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+# Set working directory
+WORKDIR /app
+
+# Copy Python dependencies
+COPY pyproject.toml uv.lock ./
+RUN uv sync
+
+# Copy Python source
+COPY src/ ./src/
+
+# Copy sample document and warmup script
+COPY documents/2506.08231v1.pdf ./
+COPY warm_up_docling.py ./
+RUN uv run python warm_up_docling.py && rm warm_up_docling.py 2506.08231v1.pdf
+
+# Copy frontend dependencies
+COPY frontend/package*.json ./frontend/
+RUN cd frontend && npm install
+
+# Copy frontend source
+COPY frontend/ ./frontend/
+
+# Build frontend
+RUN cd frontend && npm run build
+
+# Create startup script
+RUN echo '#!/bin/bash\n\
+set -e\n\
+echo "Starting Python backend..."\n\
+uv run python src/app.py &\n\
+BACKEND_PID=$!\n\
+echo "Waiting for backend to be ready..."\n\
+until curl -f http://localhost:8000/search -X POST -H "Content-Type: application/json" -d "{\"query\":\"test\"}" > /dev/null 2>&1; do\n\
+  echo "Backend not ready yet, waiting..."\n\
+  sleep 2\n\
+done\n\
+echo "Backend is ready! Starting Frontend..."\n\
+cd frontend && npm start &\n\
+wait' > /app/start.sh && chmod +x /app/start.sh
+
+# Expose only frontend port
+EXPOSE 3000
+
+# Start both services
+CMD ["/app/start.sh"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 9eae53ed..596f4432 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -23,3 +23,25 @@ services:
     ports:
       - "5601:5601"
 
+  gendb:
+    build:
+      context: .
+      dockerfile: Dockerfile.app
+    container_name: gendb-app
+    depends_on:
+      - opensearch
+    environment:
+      - OPENSEARCH_HOST=opensearch
+      - OPENSEARCH_PORT=9200
+      - OPENSEARCH_USERNAME=admin
+      - OPENSEARCH_PASSWORD=OSisgendb1!
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./src:/app/src
+      - ./frontend/src:/app/frontend/src
+      - ./pyproject.toml:/app/pyproject.toml
+      - ./uv.lock:/app/uv.lock
+      - ./documents:/app/documents
+
diff --git a/frontend/next.config.ts b/frontend/next.config.ts
index 5bbfdb2c..ab7b4aa6 100644
--- a/frontend/next.config.ts
+++ b/frontend/next.config.ts
@@ -9,6 +9,10 @@ const nextConfig: NextConfig = {
       },
     ];
   },
+  // Increase timeout for API routes
+  experimental: {
+    proxyTimeout: 300000, // 5 minutes
+  },
 };
 
 export default nextConfig;
diff --git a/frontend/src/app/admin/page.tsx b/frontend/src/app/admin/page.tsx
index 09aa2f4d..2038fd43 100644
--- a/frontend/src/app/admin/page.tsx
+++ b/frontend/src/app/admin/page.tsx
@@ -11,7 +11,7 @@ export default function AdminPage() {
   const [fileUploadLoading, setFileUploadLoading] = useState(false)
   const [pathUploadLoading, setPathUploadLoading] = useState(false)
   const [selectedFile, setSelectedFile] = useState(null)
-  const [folderPath, setFolderPath] = useState("")
+  const [folderPath, setFolderPath] = useState("/app/documents/")
   const [uploadStatus, setUploadStatus] = useState("")
 
   const handleFileUpload = async (e: React.FormEvent) => {
diff --git a/src/app.py b/src/app.py
index f92d52b0..3f68a171 100644
--- a/src/app.py
+++ b/src/app.py
@@ -8,6 +8,7 @@ os.environ['USE_CPU_ONLY'] = 'true'
 import hashlib
 import tempfile
 import asyncio
+import time
 
 from starlette.applications import Starlette
 from starlette.requests import Request
@@ -25,14 +26,19 @@ from openai import OpenAI
 converter = DocumentConverter()  # basic converter; tweak via PipelineOptions if you need OCR, etc.
 
 # Initialize Async OpenSearch (adjust hosts/auth as needed)
+opensearch_host = os.getenv("OPENSEARCH_HOST", "localhost")
+opensearch_port = int(os.getenv("OPENSEARCH_PORT", "9200"))
+opensearch_username = os.getenv("OPENSEARCH_USERNAME", "admin")
+opensearch_password = os.getenv("OPENSEARCH_PASSWORD", "OSisgendb1!")
+
 es = AsyncOpenSearch(
-    hosts=[{"host": "localhost", "port": 9200}],
+    hosts=[{"host": opensearch_host, "port": opensearch_port}],
     connection_class=AIOHttpConnection,
     scheme="https",
     use_ssl=True,
     verify_certs=False,
     ssl_assert_fingerprint=None,
-    http_auth=("admin","OSisgendb1!"),
+    http_auth=(opensearch_username, opensearch_password),
     http_compress=True,
 )
 
@@ -71,7 +77,26 @@ index_body = {
 
 client = patch_openai_with_mcp(OpenAI())  # Get the patched client back
 
+async def wait_for_opensearch():
+    """Wait for OpenSearch to be ready with retries"""
+    max_retries = 30
+    retry_delay = 2
+
+    for attempt in range(max_retries):
+        try:
+            await es.info()
+            print("OpenSearch is ready!")
+            return
+        except Exception as e:
+            print(f"Attempt {attempt + 1}/{max_retries}: OpenSearch not ready yet ({e})")
+            if attempt < max_retries - 1:
+                await asyncio.sleep(retry_delay)
+            else:
+                raise Exception("OpenSearch failed to become ready")
+
 async def init_index():
+    await wait_for_opensearch()
+
     if not await es.indices.exists(index=INDEX_NAME):
         await es.indices.create(index=INDEX_NAME, body=index_body)
         print(f"Created index '{INDEX_NAME}'")
@@ -133,9 +158,9 @@ async def process_file_common(file_path: str, file_hash: str = None):
             sha256.update(chunk)
         file_hash = sha256.hexdigest()
 
-    #exists = await es.exists(index=INDEX_NAME, id=file_hash)
-    #if exists:
-    #    return {"status": "unchanged", "id": file_hash}
+    exists = await es.exists(index=INDEX_NAME, id=file_hash)
+    if exists:
+        return {"status": "unchanged", "id": file_hash}
 
     # convert and extract
     # TODO: Check if docling can handle in-memory bytes instead of file path
@@ -186,9 +211,9 @@ async def upload(request: Request):
                 tmp.flush()
         file_hash = sha256.hexdigest()
 
-        #exists = await es.exists(index=INDEX_NAME, id=file_hash)
-        #if exists:
-        #    return JSONResponse({"status": "unchanged", "id": file_hash})
+        exists = await es.exists(index=INDEX_NAME, id=file_hash)
+        if exists:
+            return JSONResponse({"status": "unchanged", "id": file_hash})
 
         result = await process_file_common(tmp.name, file_hash)
         return JSONResponse(result)
diff --git a/warm_up_docling.py b/warm_up_docling.py
new file mode 100644
index 00000000..30c7489f
--- /dev/null
+++ b/warm_up_docling.py
@@ -0,0 +1,13 @@
+from docling.document_converter import DocumentConverter
+
+print('Warming up docling models...')
+
+try:
+    # Use the sample document to warm up docling
+    test_file = "/app/2506.08231v1.pdf"
+    print(f'Using {test_file} to warm up docling...')
+    DocumentConverter().convert(test_file)
+    print('Docling models warmed up successfully')
+except Exception as e:
+    print(f'Docling warm-up completed with: {e}')
+    # This is expected - we just want to trigger the model downloads
\ No newline at end of file