From 84c070181c8f4c3c42b6cfcfa6cbb621aa87e1a1 Mon Sep 17 00:00:00 2001
From: phact
Date: Fri, 11 Jul 2025 02:02:18 -0400
Subject: [PATCH] dockerize app

---
 Dockerfile.app                  | 58 +++++++++++++++++++++++++++++++++
 docker-compose.yml              | 22 +++++++++++++
 frontend/next.config.ts         |  4 +++
 frontend/src/app/admin/page.tsx |  2 +-
 src/app.py                      | 41 ++++++++++++++++++-----
 warm_up_docling.py              | 13 ++++++++
 6 files changed, 131 insertions(+), 9 deletions(-)
 create mode 100644 Dockerfile.app
 create mode 100644 warm_up_docling.py

diff --git a/Dockerfile.app b/Dockerfile.app
new file mode 100644
index 00000000..1aefd66e
--- /dev/null
+++ b/Dockerfile.app
@@ -0,0 +1,58 @@
+FROM node:18-slim
+
+# Install Python, uv, and curl
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install uv
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+# Set working directory
+WORKDIR /app
+
+# Copy Python dependencies
+COPY pyproject.toml uv.lock ./
+RUN uv sync
+
+# Copy Python source
+COPY src/ ./src/
+
+# Copy sample document and warmup script
+COPY documents/2506.08231v1.pdf ./
+COPY warm_up_docling.py ./
+RUN uv run python warm_up_docling.py && rm warm_up_docling.py 2506.08231v1.pdf
+
+# Copy frontend dependencies
+COPY frontend/package*.json ./frontend/
+RUN cd frontend && npm install
+
+# Copy frontend source
+COPY frontend/ ./frontend/
+
+# Build frontend
+RUN cd frontend && npm run build
+
+# Create startup script
+RUN echo '#!/bin/bash\n\
+set -e\n\
+echo "Starting Python backend..."\n\
+uv run python src/app.py &\n\
+BACKEND_PID=$!\n\
+echo "Waiting for backend to be ready..."\n\
+until curl -f http://localhost:8000/search -X POST -H "Content-Type: application/json" -d "{\"query\":\"test\"}" > /dev/null 2>&1; do\n\
+  echo "Backend not ready yet, waiting..."\n\
+  sleep 2\n\
+done\n\
+echo "Backend is ready! Starting Frontend..."\n\
+cd frontend && npm start &\n\
+wait' > /app/start.sh && chmod +x /app/start.sh
+
+# Expose only frontend port
+EXPOSE 3000
+
+# Start both services
+CMD ["/app/start.sh"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 9eae53ed..596f4432 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -23,3 +23,25 @@ services:
     ports:
       - "5601:5601"
 
+  gendb:
+    build:
+      context: .
+      dockerfile: Dockerfile.app
+    container_name: gendb-app
+    depends_on:
+      - opensearch
+    environment:
+      - OPENSEARCH_HOST=opensearch
+      - OPENSEARCH_PORT=9200
+      - OPENSEARCH_USERNAME=admin
+      - OPENSEARCH_PASSWORD=OSisgendb1!
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./src:/app/src
+      - ./frontend/src:/app/frontend/src
+      - ./pyproject.toml:/app/pyproject.toml
+      - ./uv.lock:/app/uv.lock
+      - ./documents:/app/documents
+
diff --git a/frontend/next.config.ts b/frontend/next.config.ts
index 5bbfdb2c..ab7b4aa6 100644
--- a/frontend/next.config.ts
+++ b/frontend/next.config.ts
@@ -9,6 +9,10 @@ const nextConfig: NextConfig = {
       },
     ];
   },
+  // Increase timeout for API routes
+  experimental: {
+    proxyTimeout: 300000, // 5 minutes
+  },
 };
 
 export default nextConfig;
diff --git a/frontend/src/app/admin/page.tsx b/frontend/src/app/admin/page.tsx
index 09aa2f4d..2038fd43 100644
--- a/frontend/src/app/admin/page.tsx
+++ b/frontend/src/app/admin/page.tsx
@@ -11,7 +11,7 @@ export default function AdminPage() {
   const [fileUploadLoading, setFileUploadLoading] = useState(false)
   const [pathUploadLoading, setPathUploadLoading] = useState(false)
   const [selectedFile, setSelectedFile] = useState(null)
-  const [folderPath, setFolderPath] = useState("")
+  const [folderPath, setFolderPath] = useState("/app/documents/")
   const [uploadStatus, setUploadStatus] = useState("")
 
   const handleFileUpload = async (e: React.FormEvent) => {
diff --git a/src/app.py b/src/app.py
index f92d52b0..3f68a171 100644
--- a/src/app.py
+++ b/src/app.py
@@ -8,6 +8,7 @@ os.environ['USE_CPU_ONLY'] = 'true'
 import hashlib
 import tempfile
 import asyncio
+import time
 
 from starlette.applications import Starlette
 from starlette.requests import Request
@@ -25,14 +26,19 @@ from openai import OpenAI
 converter = DocumentConverter()  # basic converter; tweak via PipelineOptions if you need OCR, etc.
 
 # Initialize Async OpenSearch (adjust hosts/auth as needed)
+opensearch_host = os.getenv("OPENSEARCH_HOST", "localhost")
+opensearch_port = int(os.getenv("OPENSEARCH_PORT", "9200"))
+opensearch_username = os.getenv("OPENSEARCH_USERNAME", "admin")
+opensearch_password = os.getenv("OPENSEARCH_PASSWORD", "OSisgendb1!")
+
 es = AsyncOpenSearch(
-    hosts=[{"host": "localhost", "port": 9200}],
+    hosts=[{"host": opensearch_host, "port": opensearch_port}],
     connection_class=AIOHttpConnection,
     scheme="https",
     use_ssl=True,
     verify_certs=False,
     ssl_assert_fingerprint=None,
-    http_auth=("admin","OSisgendb1!"),
+    http_auth=(opensearch_username, opensearch_password),
     http_compress=True,
 )
 
@@ -71,7 +77,26 @@ index_body = {
 
 client = patch_openai_with_mcp(OpenAI())  # Get the patched client back
 
+async def wait_for_opensearch():
+    """Wait for OpenSearch to be ready with retries"""
+    max_retries = 30
+    retry_delay = 2
+
+    for attempt in range(max_retries):
+        try:
+            await es.info()
+            print("OpenSearch is ready!")
+            return
+        except Exception as e:
+            print(f"Attempt {attempt + 1}/{max_retries}: OpenSearch not ready yet ({e})")
+            if attempt < max_retries - 1:
+                await asyncio.sleep(retry_delay)
+            else:
+                raise Exception("OpenSearch failed to become ready")
+
 async def init_index():
+    await wait_for_opensearch()
+
     if not await es.indices.exists(index=INDEX_NAME):
         await es.indices.create(index=INDEX_NAME, body=index_body)
         print(f"Created index '{INDEX_NAME}'")
@@ -133,9 +158,9 @@ async def process_file_common(file_path: str, file_hash: str = None):
             sha256.update(chunk)
         file_hash = sha256.hexdigest()
 
-    #exists = await es.exists(index=INDEX_NAME, id=file_hash)
-    #if exists:
-    #    return {"status": "unchanged", "id": file_hash}
+    exists = await es.exists(index=INDEX_NAME, id=file_hash)
+    if exists:
+        return {"status": "unchanged", "id": file_hash}
 
     # convert and extract
     # TODO: Check if docling can handle in-memory bytes instead of file path
@@ -186,9 +211,9 @@ async def upload(request: Request):
                 tmp.flush()
         file_hash = sha256.hexdigest()
 
-        #exists = await es.exists(index=INDEX_NAME, id=file_hash)
-        #if exists:
-        #    return JSONResponse({"status": "unchanged", "id": file_hash})
+        exists = await es.exists(index=INDEX_NAME, id=file_hash)
+        if exists:
+            return JSONResponse({"status": "unchanged", "id": file_hash})
 
         result = await process_file_common(tmp.name, file_hash)
         return JSONResponse(result)
diff --git a/warm_up_docling.py b/warm_up_docling.py
new file mode 100644
index 00000000..30c7489f
--- /dev/null
+++ b/warm_up_docling.py
@@ -0,0 +1,13 @@
+from docling.document_converter import DocumentConverter
+
+print('Warming up docling models...')
+
+try:
+    # Use the sample document to warm up docling
+    test_file = "/app/2506.08231v1.pdf"
+    print(f'Using {test_file} to warm up docling...')
+    DocumentConverter().convert(test_file)
+    print('Docling models warmed up successfully')
+except Exception as e:
+    print(f'Docling warm-up completed with: {e}')
+    # This is expected - we just want to trigger the model downloads
\ No newline at end of file