From 38e3007964d2f8a026ea0ffe5fb4b06d64734cc9 Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 11 Sep 2025 11:48:43 -0400 Subject: [PATCH 001/113] dead method --- src/services/task_service.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/src/services/task_service.py b/src/services/task_service.py index 8e69d4ae..c0d7ffad 100644 --- a/src/services/task_service.py +++ b/src/services/task_service.py @@ -109,33 +109,6 @@ class TaskService: return task_id - async def background_upload_processor(self, user_id: str, task_id: str) -> None: - """Background task to process all files in an upload job with concurrency control""" - try: - upload_task = self.task_store[user_id][task_id] - upload_task.status = TaskStatus.RUNNING - upload_task.updated_at = time.time() - - # Process files with limited concurrency to avoid overwhelming the system - max_workers = get_worker_count() - semaphore = asyncio.Semaphore(max_workers * 2) # Allow 2x process pool size for async I/O - - async def process_with_semaphore(file_path: str): - async with semaphore: - await self.document_service.process_single_file_task(upload_task, file_path) - - tasks = [process_with_semaphore(file_path) for file_path in upload_task.file_tasks.keys()] - - await asyncio.gather(*tasks, return_exceptions=True) - - except Exception as e: - logger.error("Background upload processor failed", task_id=task_id, error=str(e)) - import traceback - - traceback.print_exc() - if user_id in self.task_store and task_id in self.task_store[user_id]: - self.task_store[user_id][task_id].status = TaskStatus.FAILED - self.task_store[user_id][task_id].updated_at = time.time() async def background_custom_processor(self, user_id: str, task_id: str, items: list) -> None: """Background task to process items using custom processor""" From f761eab1b481df73b9c8bcb53f1360df2e1f43f0 Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 11 Sep 2025 11:58:35 -0400 Subject: [PATCH 002/113] upload dir should respect langflow flag --- src/api/upload.py | 32 ++++++++++++++++++++++++-------- src/main.py | 1 + 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/api/upload.py b/src/api/upload.py index 373b2948..d845e978 100644 --- a/src/api/upload.py +++ b/src/api/upload.py @@ -45,7 +45,7 @@ async def upload(request: Request, document_service, session_manager): return JSONResponse({"error": error_msg}, status_code=500) -async def upload_path(request: Request, task_service, session_manager): +async def upload_path(request: Request, task_service, session_manager, langflow_file_service): """Upload all files from a directory path""" payload = await request.json() base_dir = payload.get("path") @@ -74,13 +74,29 @@ async def upload_path(request: Request, task_service, session_manager): owner_name = user.name owner_email = user.email - task_id = await task_service.create_upload_task( - owner_user_id, - file_paths, - jwt_token=jwt_token, - owner_name=owner_name, - owner_email=owner_email, - ) + from config.settings import DISABLE_INGEST_WITH_LANGFLOW + + # Use same logic as single file uploads - respect the Langflow setting + if DISABLE_INGEST_WITH_LANGFLOW: + # Use direct DocumentFileProcessor (no Langflow) + task_id = await task_service.create_upload_task( + owner_user_id, + file_paths, + jwt_token=jwt_token, + owner_name=owner_name, + owner_email=owner_email, + ) + else: + # Use Langflow pipeline for processing + task_id = await task_service.create_langflow_upload_task( + user_id=owner_user_id, + file_paths=file_paths, + 
langflow_file_service=langflow_file_service, + session_manager=session_manager, + jwt_token=jwt_token, + owner_name=owner_name, + owner_email=owner_email, + ) return JSONResponse( {"task_id": task_id, "total_files": len(file_paths), "status": "accepted"}, diff --git a/src/main.py b/src/main.py index 1c0dc09f..a0f00268 100644 --- a/src/main.py +++ b/src/main.py @@ -558,6 +558,7 @@ async def create_app(): upload.upload_path, task_service=services["task_service"], session_manager=services["session_manager"], + langflow_file_service=services["langflow_file_service"], ) ), methods=["POST"], From 6533367fa0fb561abc926436b0aa5b38b8040022 Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 11 Sep 2025 12:16:36 -0400 Subject: [PATCH 003/113] fix process count bug --- src/services/document_service.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/services/document_service.py b/src/services/document_service.py index 70a70942..22f61411 100644 --- a/src/services/document_service.py +++ b/src/services/document_service.py @@ -430,8 +430,4 @@ class DocumentService: upload_task.failed_files += 1 finally: file_task.updated_at = time.time() - upload_task.processed_files += 1 upload_task.updated_at = time.time() - - if upload_task.processed_files >= upload_task.total_files: - upload_task.status = TaskStatus.COMPLETED From 219f9da4e09acc80c547aa1c6a918e6b20459c38 Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 11 Sep 2025 13:25:26 -0400 Subject: [PATCH 004/113] disable startup ingest flag --- src/config/settings.py | 1 + src/main.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/config/settings.py b/src/config/settings.py index 715146fb..ace9d5cb 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -48,6 +48,7 @@ GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") # Ingestion configuration DISABLE_INGEST_WITH_LANGFLOW = os.getenv("DISABLE_INGEST_WITH_LANGFLOW", "false").lower() in ("true", "1", "yes") +DISABLE_STARTUP_INGEST = os.getenv("DISABLE_STARTUP_INGEST", "false").lower() in ("true", "1", "yes") def is_no_auth_mode(): diff --git a/src/main.py b/src/main.py index a0f00268..1912f7df 100644 --- a/src/main.py +++ b/src/main.py @@ -386,9 +386,14 @@ async def _ingest_default_documents_openrag(services, file_paths): async def startup_tasks(services): """Startup tasks""" + from config.settings import DISABLE_STARTUP_INGEST + logger.info("Starting startup tasks") await init_index() - await ingest_default_documents_when_ready(services) + if DISABLE_STARTUP_INGEST: + logger.info("Startup ingest disabled via DISABLE_STARTUP_INGEST; skipping default documents ingestion") + else: + await ingest_default_documents_when_ready(services) async def initialize_services(): From 0866b5218e49b37cb40a1596311eb651b48745fe Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 11 Sep 2025 16:02:56 -0400 Subject: [PATCH 005/113] docker compose not docker-compose --- Makefile | 72 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index fe76467a..6ac03b93 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # OpenRAG Development Makefile # Provides easy commands for development workflow -.PHONY: help dev dev-cpu dev-local infra stop clean build logs shell-backend shell-frontend install test backend frontend install-be install-fe build-be build-fe logs-be logs-fe logs-lf logs-os shell-be shell-lf shell-os restart status health db-reset flow-upload quick setup +.PHONY: help dev dev-cpu 
dev-local infra stop clean build logs shell-backend shell-frontend install test test-integration test-unit test-ingest test-search test-coverage backend frontend install-be install-fe build-be build-fe logs-be logs-fe logs-lf logs-os shell-be shell-lf shell-os restart status health db-reset flow-upload quick setup # Default target help: @@ -32,14 +32,19 @@ help: @echo " shell-lf - Shell into langflow container" @echo "" @echo "Testing:" - @echo " test - Run backend tests" + @echo " test - Run all backend tests" + @echo " test-integration - Run integration tests (requires infra)" + @echo " test-unit - Run unit tests only" + @echo " test-ingest - Test file ingestion flows" + @echo " test-search - Test search functionality" + @echo " test-coverage - Run tests with coverage report" @echo " lint - Run linting checks" @echo "" # Development environments dev: @echo "๐Ÿš€ Starting OpenRAG with GPU support..." - docker-compose up -d + docker compose up -d @echo "โœ… Services started!" @echo " Backend: http://localhost:8000" @echo " Frontend: http://localhost:3000" @@ -49,7 +54,7 @@ dev: dev-cpu: @echo "๐Ÿš€ Starting OpenRAG with CPU only..." - docker-compose -f docker-compose-cpu.yml up -d + docker compose -f docker-compose-cpu.yml up -d @echo "โœ… Services started!" @echo " Backend: http://localhost:8000" @echo " Frontend: http://localhost:3000" @@ -59,7 +64,7 @@ dev-cpu: dev-local: @echo "๐Ÿ”ง Starting infrastructure only (for local development)..." - docker-compose up -d opensearch dashboards langflow + docker compose up -d opensearch dashboards langflow @echo "โœ… Infrastructure started!" @echo " Langflow: http://localhost:7860" @echo " OpenSearch: http://localhost:9200" @@ -69,7 +74,7 @@ dev-local: infra: @echo "๐Ÿ”ง Starting infrastructure services only..." - docker-compose up -d opensearch dashboards langflow + docker compose up -d opensearch dashboards langflow @echo "โœ… Infrastructure services started!" @echo " Langflow: http://localhost:7860" @echo " OpenSearch: http://localhost:9200" @@ -78,15 +83,15 @@ infra: # Container management stop: @echo "๐Ÿ›‘ Stopping all containers..." - docker-compose down - docker-compose -f docker-compose-cpu.yml down 2>/dev/null || true + docker compose down + docker compose -f docker-compose-cpu.yml down 2>/dev/null || true restart: stop dev clean: stop @echo "๐Ÿงน Cleaning up containers and volumes..." - docker-compose down -v --remove-orphans - docker-compose -f docker-compose-cpu.yml down -v --remove-orphans 2>/dev/null || true + docker compose down -v --remove-orphans + docker compose -f docker-compose-cpu.yml down -v --remove-orphans 2>/dev/null || true docker system prune -f # Local development @@ -115,7 +120,7 @@ install-fe: # Building build: @echo "๐Ÿ”จ Building Docker images..." - docker-compose build + docker compose build build-be: @echo "๐Ÿ”จ Building backend image..." @@ -128,41 +133,62 @@ build-fe: # Logging and debugging logs: @echo "๐Ÿ“‹ Showing all container logs..." - docker-compose logs -f + docker compose logs -f logs-be: @echo "๐Ÿ“‹ Showing backend logs..." - docker-compose logs -f openrag-backend + docker compose logs -f openrag-backend logs-fe: @echo "๐Ÿ“‹ Showing frontend logs..." - docker-compose logs -f openrag-frontend + docker compose logs -f openrag-frontend logs-lf: @echo "๐Ÿ“‹ Showing langflow logs..." - docker-compose logs -f langflow + docker compose logs -f langflow logs-os: @echo "๐Ÿ“‹ Showing opensearch logs..." 
- docker-compose logs -f opensearch + docker compose logs -f opensearch # Shell access shell-be: @echo "๐Ÿš Opening shell in backend container..." - docker-compose exec openrag-backend /bin/bash + docker compose exec openrag-backend /bin/bash shell-lf: @echo "๐Ÿš Opening shell in langflow container..." - docker-compose exec langflow /bin/bash + docker compose exec langflow /bin/bash shell-os: @echo "๐Ÿš Opening shell in opensearch container..." - docker-compose exec opensearch /bin/bash + docker compose exec opensearch /bin/bash # Testing and quality test: - @echo "๐Ÿงช Running backend tests..." - uv run pytest + @echo "๐Ÿงช Running all backend tests..." + uv run pytest tests/ -v + +test-integration: + @echo "๐Ÿงช Running integration tests (requires infrastructure)..." + @echo "๐Ÿ’ก Make sure to run 'make infra' first!" + uv run pytest tests/integration/ -v + +test-unit: + @echo "๐Ÿงช Running unit tests..." + uv run pytest tests/unit/ -v + +test-ingest: + @echo "๐Ÿงช Testing file ingestion flows..." + uv run pytest tests/integration/test_file_ingest.py -v + +test-search: + @echo "๐Ÿงช Testing search functionality..." + uv run pytest tests/integration/test_search_flow.py -v + +test-coverage: + @echo "๐Ÿงช Running tests with coverage report..." + uv run pytest tests/ --cov=src --cov-report=term-missing --cov-report=html:htmlcov lint: @echo "๐Ÿ” Running linting checks..." @@ -172,7 +198,7 @@ lint: # Service status status: @echo "๐Ÿ“Š Container status:" - @docker-compose ps 2>/dev/null || echo "No containers running" + @docker compose ps 2>/dev/null || echo "No containers running" health: @echo "๐Ÿฅ Health check:" @@ -207,4 +233,4 @@ setup: @echo "โš™๏ธ Setting up development environment..." @if [ ! -f .env ]; then cp .env.example .env && echo "๐Ÿ“ Created .env from template"; fi @$(MAKE) install - @echo "โœ… Setup complete! Run 'make dev' to start." \ No newline at end of file + @echo "โœ… Setup complete! Run 'make dev' to start." 
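
The ingestion switches introduced above (DISABLE_INGEST_WITH_LANGFLOW in patch 002, DISABLE_STARTUP_INGEST in patch 004) both rely on the same truthy-string parsing idiom in src/config/settings.py. A minimal sketch of that pattern, assuming only the two flag names shown in the patches (the env_flag helper name is illustrative, not part of the series):

    import os

    def env_flag(name: str, default: str = "false") -> bool:
        # Mirrors the idiom in src/config/settings.py: only "true", "1",
        # or "yes" (case-insensitive) count as enabled; anything else is off.
        return os.getenv(name, default).lower() in ("true", "1", "yes")

    # The two switches added in this series:
    DISABLE_INGEST_WITH_LANGFLOW = env_flag("DISABLE_INGEST_WITH_LANGFLOW")
    DISABLE_STARTUP_INGEST = env_flag("DISABLE_STARTUP_INGEST")

    if DISABLE_STARTUP_INGEST:
        print("Startup ingest disabled; skipping default documents ingestion")

Two consequences of this parsing: a value such as "no" or "off" still evaluates to disabled, since only the three listed strings are treated as true; and the module-level constants are fixed at import time, which is why the router in patch 006 re-reads the setting at request time ("avoid stale module-level values") rather than relying on the cached constant.
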
From c6ba47d11887fdccef0b2a6c27b9123b024aacb9 Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 11 Sep 2025 16:05:46 -0400 Subject: [PATCH 006/113] ingsest refactor --- frontend/components/knowledge-dropdown.tsx | 4 +- frontend/src/app/admin/page.tsx | 4 +- src/api/langflow_files.py | 243 --------------------- src/api/router.py | 98 +++++---- src/api/upload.py | 82 ++----- src/api/upload_utils.py | 47 ++++ src/main.py | 195 ++++++----------- src/services/document_service.py | 7 +- 8 files changed, 197 insertions(+), 483 deletions(-) create mode 100644 src/api/upload_utils.py diff --git a/frontend/components/knowledge-dropdown.tsx b/frontend/components/knowledge-dropdown.tsx index 481a45b1..31cdea31 100644 --- a/frontend/components/knowledge-dropdown.tsx +++ b/frontend/components/knowledge-dropdown.tsx @@ -134,7 +134,7 @@ export function KnowledgeDropdown({ active, variant = 'navigation' }: KnowledgeD formData.append('file', files[0]) // Use router upload and ingest endpoint (automatically routes based on configuration) - const uploadIngestRes = await fetch('/api/router/upload_ingest', { + const uploadIngestRes = await fetch('/api/upload', { method: 'POST', body: formData, }) @@ -463,4 +463,4 @@ export function KnowledgeDropdown({ active, variant = 'navigation' }: KnowledgeD ) -} \ No newline at end of file +} diff --git a/frontend/src/app/admin/page.tsx b/frontend/src/app/admin/page.tsx index 6cb8aa96..c8c9ecf8 100644 --- a/frontend/src/app/admin/page.tsx +++ b/frontend/src/app/admin/page.tsx @@ -51,7 +51,7 @@ function AdminPage() { const formData = new FormData() formData.append("file", selectedFile) - const response = await fetch("/api/router/upload_ingest", { + const response = await fetch("/api/upload", { method: "POST", body: formData, }) @@ -326,4 +326,4 @@ export default function ProtectedAdminPage() { ) -} \ No newline at end of file +} diff --git a/src/api/langflow_files.py b/src/api/langflow_files.py index a5595813..41d3ac08 100644 --- a/src/api/langflow_files.py +++ b/src/api/langflow_files.py @@ -6,249 +6,6 @@ from utils.logging_config import get_logger logger = get_logger(__name__) - -async def upload_user_file( - request: Request, langflow_file_service: LangflowFileService, session_manager -): - try: - logger.debug("upload_user_file endpoint called") - form = await request.form() - upload_file = form.get("file") - if upload_file is None: - logger.error("No file provided in upload request") - return JSONResponse({"error": "Missing file"}, status_code=400) - - logger.debug( - "Processing file", filename=upload_file.filename, size=upload_file.size - ) - - # starlette UploadFile provides file-like; httpx needs (filename, file, content_type) - content = await upload_file.read() - file_tuple = ( - upload_file.filename, - content, - upload_file.content_type or "application/octet-stream", - ) - - jwt_token = getattr(request.state, "jwt_token", None) - logger.debug("JWT token status", jwt_present=jwt_token is not None) - - logger.debug("Calling langflow_file_service.upload_user_file") - result = await langflow_file_service.upload_user_file(file_tuple, jwt_token) - logger.debug("Upload successful", result=result) - return JSONResponse(result, status_code=201) - except Exception as e: - logger.error( - "upload_user_file endpoint failed", - error_type=type(e).__name__, - error=str(e), - ) - import traceback - - logger.error("Full traceback", traceback=traceback.format_exc()) - return JSONResponse({"error": str(e)}, status_code=500) - - -async def run_ingestion( - request: Request, 
langflow_file_service: LangflowFileService, session_manager -): - try: - payload = await request.json() - file_ids = payload.get("file_ids") - file_paths = payload.get("file_paths") or [] - session_id = payload.get("session_id") - tweaks = payload.get("tweaks") or {} - settings = payload.get("settings", {}) - - # We assume file_paths is provided. If only file_ids are provided, client would need to resolve to paths via Files API (not implemented here). - if not file_paths and not file_ids: - return JSONResponse( - {"error": "Provide file_paths or file_ids"}, status_code=400 - ) - - # Convert UI settings to component tweaks using exact component IDs - if settings: - logger.debug("Applying ingestion settings", settings=settings) - - # Split Text component tweaks (SplitText-QIKhg) - if ( - settings.get("chunkSize") - or settings.get("chunkOverlap") - or settings.get("separator") - ): - if "SplitText-QIKhg" not in tweaks: - tweaks["SplitText-QIKhg"] = {} - if settings.get("chunkSize"): - tweaks["SplitText-QIKhg"]["chunk_size"] = settings["chunkSize"] - if settings.get("chunkOverlap"): - tweaks["SplitText-QIKhg"]["chunk_overlap"] = settings[ - "chunkOverlap" - ] - if settings.get("separator"): - tweaks["SplitText-QIKhg"]["separator"] = settings["separator"] - - # OpenAI Embeddings component tweaks (OpenAIEmbeddings-joRJ6) - if settings.get("embeddingModel"): - if "OpenAIEmbeddings-joRJ6" not in tweaks: - tweaks["OpenAIEmbeddings-joRJ6"] = {} - tweaks["OpenAIEmbeddings-joRJ6"]["model"] = settings["embeddingModel"] - - # Note: OpenSearch component tweaks not needed for ingestion - # (search parameters are for retrieval, not document processing) - - logger.debug("Final tweaks with settings applied", tweaks=tweaks) - # Include user JWT if available - jwt_token = getattr(request.state, "jwt_token", None) - - # Extract user info from User object - user = getattr(request.state, "user", None) - user_id = user.user_id if user else None - user_name = user.name if user else None - user_email = user.email if user else None - - if jwt_token: - # Set auth context for downstream services - from auth_context import set_auth_context - - set_auth_context(user_id, jwt_token) - - result = await langflow_file_service.run_ingestion_flow( - file_paths=file_paths or [], - jwt_token=jwt_token, - session_id=session_id, - tweaks=tweaks, - owner=user_id, - owner_name=user_name, - owner_email=user_email, - connector_type="local", - ) - return JSONResponse(result) - except Exception as e: - return JSONResponse({"error": str(e)}, status_code=500) - - -async def upload_and_ingest_user_file( - request: Request, langflow_file_service: LangflowFileService, session_manager, task_service -): - """Combined upload and ingest endpoint - uses task service for tracking and cancellation""" - try: - logger.debug("upload_and_ingest_user_file endpoint called - using task service") - form = await request.form() - upload_file = form.get("file") - if upload_file is None: - logger.error("No file provided in upload_and_ingest request") - return JSONResponse({"error": "Missing file"}, status_code=400) - - # Extract optional parameters - session_id = form.get("session_id") - settings_json = form.get("settings") - tweaks_json = form.get("tweaks") - delete_after_ingest = form.get("delete_after_ingest", "true").lower() == "true" - - # Parse JSON fields if provided - settings = None - tweaks = None - - if settings_json: - try: - import json - settings = json.loads(settings_json) - except json.JSONDecodeError as e: - logger.error("Invalid settings 
JSON", error=str(e)) - return JSONResponse({"error": "Invalid settings JSON"}, status_code=400) - - if tweaks_json: - try: - import json - tweaks = json.loads(tweaks_json) - except json.JSONDecodeError as e: - logger.error("Invalid tweaks JSON", error=str(e)) - return JSONResponse({"error": "Invalid tweaks JSON"}, status_code=400) - - # Get user info from request state - user = getattr(request.state, "user", None) - user_id = user.user_id if user else None - user_name = user.name if user else None - user_email = user.email if user else None - jwt_token = getattr(request.state, "jwt_token", None) - - if not user_id: - return JSONResponse({"error": "User authentication required"}, status_code=401) - - logger.debug( - "Processing file for task-based upload and ingest", - filename=upload_file.filename, - size=upload_file.size, - session_id=session_id, - has_settings=bool(settings), - has_tweaks=bool(tweaks), - delete_after_ingest=delete_after_ingest, - user_id=user_id - ) - - # Create temporary file for task processing - import tempfile - import os - - # Read file content - content = await upload_file.read() - - # Create temporary file - safe_filename = upload_file.filename.replace(" ", "_").replace("/", "_") - temp_fd, temp_path = tempfile.mkstemp( - suffix=f"_{safe_filename}" - ) - - try: - # Write content to temp file - with os.fdopen(temp_fd, 'wb') as temp_file: - temp_file.write(content) - - logger.debug("Created temporary file for task processing", temp_path=temp_path) - - # Create langflow upload task for single file - task_id = await task_service.create_langflow_upload_task( - user_id=user_id, - file_paths=[temp_path], - langflow_file_service=langflow_file_service, - session_manager=session_manager, - jwt_token=jwt_token, - owner_name=user_name, - owner_email=user_email, - session_id=session_id, - tweaks=tweaks, - settings=settings, - delete_after_ingest=delete_after_ingest, - ) - - logger.debug("Langflow upload task created successfully", task_id=task_id) - - return JSONResponse({ - "task_id": task_id, - "message": f"Langflow upload task created for file '{upload_file.filename}'", - "filename": upload_file.filename - }, status_code=202) # 202 Accepted for async processing - - except Exception: - # Clean up temp file on error - try: - if os.path.exists(temp_path): - os.unlink(temp_path) - except Exception: - pass # Ignore cleanup errors - raise - - except Exception as e: - logger.error( - "upload_and_ingest_user_file endpoint failed", - error_type=type(e).__name__, - error=str(e), - ) - import traceback - logger.error("Full traceback", traceback=traceback.format_exc()) - return JSONResponse({"error": str(e)}, status_code=500) - - async def delete_user_files( request: Request, langflow_file_service: LangflowFileService, session_manager ): diff --git a/src/api/router.py b/src/api/router.py index 154757a5..620b0d55 100644 --- a/src/api/router.py +++ b/src/api/router.py @@ -3,11 +3,8 @@ from starlette.requests import Request from starlette.responses import JSONResponse -from config.settings import DISABLE_INGEST_WITH_LANGFLOW from utils.logging_config import get_logger - -# Import the actual endpoint implementations -from .upload import upload as traditional_upload +from .upload_utils import extract_user_context, create_temp_files_from_form_files logger = get_logger(__name__) @@ -29,20 +26,57 @@ async def upload_ingest_router( All langflow uploads are processed as background tasks for better scalability. 
""" try: - logger.debug( - "Router upload_ingest endpoint called", - disable_langflow_ingest=DISABLE_INGEST_WITH_LANGFLOW - ) + # Read setting at request time to avoid stale module-level values + from config import settings as cfg + disable_langflow_ingest = cfg.DISABLE_INGEST_WITH_LANGFLOW + logger.debug("Router upload_ingest endpoint called", disable_langflow_ingest=disable_langflow_ingest) # Route based on configuration - if DISABLE_INGEST_WITH_LANGFLOW: - # Route to traditional OpenRAG upload - logger.debug("Routing to traditional OpenRAG upload") - return await traditional_upload(request, document_service, session_manager) + if disable_langflow_ingest: + # Traditional OpenRAG path: create a background task via TaskService + logger.debug("Routing to traditional OpenRAG upload via task service (async)") + form = await request.form() + upload_files = form.getlist("file") + if not upload_files: + return JSONResponse({"error": "Missing file"}, status_code=400) + # Extract user context + ctx = await extract_user_context(request) + + # Create temporary files + temp_file_paths = await create_temp_files_from_form_files(upload_files) + try: + # Create traditional upload task for all files + task_id = await task_service.create_upload_task( + ctx["owner_user_id"], + temp_file_paths, + jwt_token=ctx["jwt_token"], + owner_name=ctx["owner_name"], + owner_email=ctx["owner_email"], + ) + return JSONResponse( + { + "task_id": task_id, + "message": f"Traditional upload task created for {len(upload_files)} file(s)", + "file_count": len(upload_files), + }, + status_code=201, + ) + except Exception: + # Clean up temp files on error + import os + for p in temp_file_paths: + try: + if os.path.exists(p): + os.unlink(p) + except Exception: + pass + raise else: - # Route to Langflow upload and ingest using task service - logger.debug("Routing to Langflow upload-ingest pipeline via task service") - return await langflow_upload_ingest_task(request, langflow_file_service, session_manager, task_service) + # Route to Langflow upload-ingest via task service for async processing (202 + task_id) + logger.debug("Routing to Langflow upload-ingest pipeline via task service (async)") + return await langflow_upload_ingest_task( + request, langflow_file_service, session_manager, task_service + ) except Exception as e: logger.error("Error in upload_ingest_router", error=str(e)) @@ -98,37 +132,19 @@ async def langflow_upload_ingest_task( logger.error("Invalid tweaks JSON", error=str(e)) return JSONResponse({"error": "Invalid tweaks JSON"}, status_code=400) - # Get user info from request state - user = getattr(request.state, "user", None) - user_id = user.user_id if user else None - user_name = user.name if user else None - user_email = user.email if user else None - jwt_token = getattr(request.state, "jwt_token", None) - - if not user_id: - return JSONResponse({"error": "User authentication required"}, status_code=401) + # Get user/auth context (allows no-auth mode) + ctx = await extract_user_context(request) + user_id = ctx["owner_user_id"] + user_name = ctx["owner_name"] + user_email = ctx["owner_email"] + jwt_token = ctx["jwt_token"] # Create temporary files for task processing - import tempfile import os temp_file_paths = [] try: - for upload_file in upload_files: - # Read file content - content = await upload_file.read() - - # Create temporary file - safe_filename = upload_file.filename.replace(" ", "_").replace("/", "_") - temp_fd, temp_path = tempfile.mkstemp( - suffix=f"_{safe_filename}" - ) - - # Write content to temp 
file - with os.fdopen(temp_fd, 'wb') as temp_file: - temp_file.write(content) - - temp_file_paths.append(temp_path) + temp_file_paths = await create_temp_files_from_form_files(upload_files) logger.debug( "Created temporary files for task-based processing", @@ -160,7 +176,7 @@ async def langflow_upload_ingest_task( "task_id": task_id, "message": f"Langflow upload task created for {len(upload_files)} file(s)", "file_count": len(upload_files) - }, status_code=202) # 202 Accepted for async processing + }, status_code=201) except Exception: # Clean up temp files on error diff --git a/src/api/upload.py b/src/api/upload.py index d845e978..bd820d40 100644 --- a/src/api/upload.py +++ b/src/api/upload.py @@ -3,46 +3,7 @@ from urllib.parse import urlparse import boto3 from starlette.requests import Request from starlette.responses import JSONResponse - - -async def upload(request: Request, document_service, session_manager): - """Upload a single file""" - try: - form = await request.form() - upload_file = form["file"] - user = request.state.user - jwt_token = request.state.jwt_token - - from config.settings import is_no_auth_mode - - # In no-auth mode, pass None for owner fields so documents have no owner - # This allows all users to see them when switching to auth mode - if is_no_auth_mode(): - owner_user_id = None - owner_name = None - owner_email = None - else: - owner_user_id = user.user_id - owner_name = user.name - owner_email = user.email - - result = await document_service.process_upload_file( - upload_file, - owner_user_id=owner_user_id, - jwt_token=jwt_token, - owner_name=owner_name, - owner_email=owner_email, - ) - return JSONResponse(result, status_code=201) # Created - except Exception as e: - error_msg = str(e) - if ( - "AuthenticationException" in error_msg - or "access denied" in error_msg.lower() - ): - return JSONResponse({"error": error_msg}, status_code=403) - else: - return JSONResponse({"error": error_msg}, status_code=500) +from .upload_utils import extract_user_context async def upload_path(request: Request, task_service, session_manager, langflow_file_service): @@ -59,20 +20,11 @@ async def upload_path(request: Request, task_service, session_manager, langflow_ if not file_paths: return JSONResponse({"error": "No files found in directory"}, status_code=400) - user = request.state.user - jwt_token = request.state.jwt_token - - from config.settings import is_no_auth_mode - - # In no-auth mode, pass None for owner fields so documents have no owner - if is_no_auth_mode(): - owner_user_id = None - owner_name = None - owner_email = None - else: - owner_user_id = user.user_id - owner_name = user.name - owner_email = user.email + ctx = await extract_user_context(request) + owner_user_id = ctx["owner_user_id"] + owner_name = ctx["owner_name"] + owner_email = ctx["owner_email"] + jwt_token = ctx["jwt_token"] from config.settings import DISABLE_INGEST_WITH_LANGFLOW @@ -184,23 +136,15 @@ async def upload_bucket(request: Request, task_service, session_manager): if not keys: return JSONResponse({"error": "No files found in bucket"}, status_code=400) - user = request.state.user - jwt_token = request.state.jwt_token - from models.processors import S3FileProcessor - from config.settings import is_no_auth_mode + from .upload_utils import extract_user_context - # In no-auth mode, pass None for owner fields so documents have no owner - if is_no_auth_mode(): - owner_user_id = None - owner_name = None - owner_email = None - task_user_id = None - else: - owner_user_id = user.user_id - owner_name = 
user.name - owner_email = user.email - task_user_id = user.user_id + ctx = await extract_user_context(request) + owner_user_id = ctx["owner_user_id"] + owner_name = ctx["owner_name"] + owner_email = ctx["owner_email"] + jwt_token = ctx["jwt_token"] + task_user_id = owner_user_id processor = S3FileProcessor( task_service.document_service, diff --git a/src/api/upload_utils.py b/src/api/upload_utils.py new file mode 100644 index 00000000..f2479107 --- /dev/null +++ b/src/api/upload_utils.py @@ -0,0 +1,47 @@ +from typing import List + +from starlette.requests import Request + + +async def extract_user_context(request: Request) -> dict: + """Extract user/auth context from request.state. Honors no-auth mode.""" + from config.settings import is_no_auth_mode + + user = getattr(request.state, "user", None) + jwt_token = getattr(request.state, "jwt_token", None) + + if is_no_auth_mode(): + return { + "owner_user_id": None, + "owner_name": None, + "owner_email": None, + "jwt_token": None, + } + + return { + "owner_user_id": getattr(user, "user_id", None), + "owner_name": getattr(user, "name", None), + "owner_email": getattr(user, "email", None), + "jwt_token": jwt_token, + } + + +async def create_temp_files_from_form_files(upload_files: List) -> list[str]: + """Persist UploadFile items to temp files; return list of paths.""" + import tempfile + import os + + temp_file_paths: list[str] = [] + for upload_file in upload_files: + content = await upload_file.read() + safe_filename = ( + upload_file.filename.replace(" ", "_").replace("/", "_") + if getattr(upload_file, "filename", None) + else "uploaded" + ) + fd, temp_path = tempfile.mkstemp(suffix=f"_{safe_filename}") + with os.fdopen(fd, "wb") as temp_file: + temp_file.write(content) + temp_file_paths.append(temp_path) + return temp_file_paths + diff --git a/src/main.py b/src/main.py index 1912f7df..bb745451 100644 --- a/src/main.py +++ b/src/main.py @@ -263,96 +263,60 @@ async def ingest_default_documents_when_ready(services): async def _ingest_default_documents_langflow(services, file_paths): - """Ingest default documents using Langflow upload-ingest-delete pipeline.""" + """Ingest default documents using Langflow via a single background task (aligned with router semantics).""" langflow_file_service = services["langflow_file_service"] session_manager = services["session_manager"] logger.info( - "Using Langflow ingestion pipeline for default documents", + "Using Langflow ingestion pipeline for default documents (task-based)", file_count=len(file_paths), ) - success_count = 0 - error_count = 0 + # Use AnonymousUser for default documents + from session_manager import AnonymousUser - for file_path in file_paths: - try: - logger.debug("Processing file with Langflow pipeline", file_path=file_path) + anonymous_user = AnonymousUser() - # Read file content - with open(file_path, "rb") as f: - content = f.read() + # Ensure an (anonymous) JWT is available for OpenSearch/flow auth + effective_jwt = None + try: + session_manager.get_user_opensearch_client(anonymous_user.user_id, None) + if hasattr(session_manager, "_anonymous_jwt"): + effective_jwt = session_manager._anonymous_jwt + except Exception: + pass - # Create file tuple for upload - filename = os.path.basename(file_path) - # Determine content type based on file extension - content_type, _ = mimetypes.guess_type(filename) - if not content_type: - content_type = "application/octet-stream" + # Prepare tweaks with anonymous metadata for OpenSearch component + default_tweaks = { + "OpenSearchHybrid-Ve6bS": { + 
"docs_metadata": [ + {"key": "owner", "value": None}, + {"key": "owner_name", "value": anonymous_user.name}, + {"key": "owner_email", "value": anonymous_user.email}, + {"key": "connector_type", "value": "system_default"}, + ] + } + } - file_tuple = (filename, content, content_type) - - # Use AnonymousUser details for default documents - from session_manager import AnonymousUser - - anonymous_user = AnonymousUser() - - # Get JWT token using same logic as DocumentFileProcessor - # This will handle anonymous JWT creation if needed for anonymous user - effective_jwt = None - - # Let session manager handle anonymous JWT creation if needed - if session_manager: - # This call will create anonymous JWT if needed (same as DocumentFileProcessor) - session_manager.get_user_opensearch_client( - anonymous_user.user_id, effective_jwt - ) - # Get the JWT that was created by session manager - if hasattr(session_manager, "_anonymous_jwt"): - effective_jwt = session_manager._anonymous_jwt - - # Prepare tweaks for default documents with anonymous user metadata - default_tweaks = { - "OpenSearchHybrid-Ve6bS": { - "docs_metadata": [ - {"key": "owner", "value": None}, - {"key": "owner_name", "value": anonymous_user.name}, - {"key": "owner_email", "value": anonymous_user.email}, - {"key": "connector_type", "value": "system_default"}, - ] - } - } - - # Use langflow upload_and_ingest_file method with JWT token - result = await langflow_file_service.upload_and_ingest_file( - file_tuple=file_tuple, - session_id=None, # No session for default documents - tweaks=default_tweaks, # Add anonymous user metadata - settings=None, # Use default ingestion settings - jwt_token=effective_jwt, # Use JWT token (anonymous if needed) - delete_after_ingest=True, # Clean up after ingestion - ) - - logger.info( - "Successfully ingested file via Langflow", - file_path=file_path, - result_status=result.get("status"), - ) - success_count += 1 - - except Exception as e: - logger.error( - "Failed to ingest file via Langflow", - file_path=file_path, - error=str(e), - ) - error_count += 1 + # Create a single task to process all default documents through Langflow + task_id = await services["task_service"].create_langflow_upload_task( + user_id=anonymous_user.user_id, + file_paths=file_paths, + langflow_file_service=langflow_file_service, + session_manager=session_manager, + jwt_token=effective_jwt, + owner_name=anonymous_user.name, + owner_email=anonymous_user.email, + session_id=None, + tweaks=default_tweaks, + settings=None, + delete_after_ingest=True, + ) logger.info( - "Langflow ingestion completed", - success_count=success_count, - error_count=error_count, - total_files=len(file_paths), + "Started Langflow ingestion task for default documents", + task_id=task_id, + file_count=len(file_paths), ) @@ -486,41 +450,7 @@ async def create_app(): # Create route handlers with service dependencies injected routes = [ - # Upload endpoints - Route( - "/upload", - require_auth(services["session_manager"])( - partial( - upload.upload, - document_service=services["document_service"], - session_manager=services["session_manager"], - ) - ), - methods=["POST"], - ), - # Langflow Files endpoints - Route( - "/langflow/files/upload", - optional_auth(services["session_manager"])( - partial( - langflow_files.upload_user_file, - langflow_file_service=services["langflow_file_service"], - session_manager=services["session_manager"], - ) - ), - methods=["POST"], - ), - Route( - "/langflow/ingest", - require_auth(services["session_manager"])( - partial( - 
langflow_files.run_ingestion, - langflow_file_service=services["langflow_file_service"], - session_manager=services["session_manager"], - ) - ), - methods=["POST"], - ), + # Langflow direct upload/ingest endpoints removed in favor of router (/router/upload_ingest) Route( "/langflow/files", require_auth(services["session_manager"])( @@ -532,18 +462,6 @@ async def create_app(): ), methods=["DELETE"], ), - Route( - "/langflow/upload_ingest", - require_auth(services["session_manager"])( - partial( - langflow_files.upload_and_ingest_user_file, - langflow_file_service=services["langflow_file_service"], - session_manager=services["session_manager"], - task_service=services["task_service"], - ) - ), - methods=["POST"], - ), Route( "/upload_context", require_auth(services["session_manager"])( @@ -939,7 +857,7 @@ async def create_app(): methods=["POST"], ), Route( - "/router/upload_ingest", + "/upload", require_auth(services["session_manager"])( partial( router.upload_ingest_router, @@ -969,6 +887,33 @@ async def create_app(): @app.on_event("shutdown") async def shutdown_event(): await cleanup_subscriptions_proper(services) + # Close HTTP/OpenSearch clients cleanly + try: + from config.settings import clients as _clients + + if getattr(_clients, "langflow_http_client", None): + try: + await _clients.langflow_http_client.aclose() + except Exception: + pass + if getattr(_clients, "opensearch", None): + try: + await _clients.opensearch.close() + except Exception: + pass + except Exception: + pass + # Close any per-user OpenSearch clients + try: + sm = services.get("session_manager") + if sm and getattr(sm, "user_opensearch_clients", None): + for oc in sm.user_opensearch_clients.values(): + try: + await oc.close() + except Exception: + pass + except Exception: + pass return app diff --git a/src/services/document_service.py b/src/services/document_service.py index 22f61411..98e2c2a1 100644 --- a/src/services/document_service.py +++ b/src/services/document_service.py @@ -215,7 +215,12 @@ class DocumentService: ): """Process an uploaded file from form data""" sha256 = hashlib.sha256() - tmp = tempfile.NamedTemporaryFile(delete=False) + # Preserve file extension so the converter can detect format + try: + _, ext = os.path.splitext(getattr(upload_file, "filename", "") or "") + except Exception: + ext = "" + tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext) file_size = 0 try: while True: From 1e5661757bcd8681e1e5450e74354e77b00d5ab5 Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 11 Sep 2025 16:05:57 -0400 Subject: [PATCH 007/113] integration tests v0 --- pyproject.toml | 4 + tests/__init__.py | 1 + tests/conftest.py | 80 ++++++++++ tests/integration/__init__.py | 1 + tests/integration/test_api_endpoints.py | 193 +++++++++++++++++++++++ tests/integration/test_startup_ingest.py | 114 +++++++++++++ uv.lock | 124 +++++++++++++++ 7 files changed, 517 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/test_api_endpoints.py create mode 100644 tests/integration/test_startup_ingest.py diff --git a/pyproject.toml b/pyproject.toml index 6065f077..04200e93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,10 @@ dependencies = [ "python-dotenv>=1.0.0", "textual-fspicker>=0.6.0", "structlog>=25.4.0", + "pytest>=8.0.0", + "pytest-asyncio>=0.21.0", + "pytest-mock>=3.12.0", + "pytest-cov>=4.0.0", ] [project.scripts] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 
100644 index 00000000..5f19b37d --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Test package \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..2edf3d65 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,80 @@ +import asyncio +import os +import tempfile +from pathlib import Path + +import pytest +import pytest_asyncio +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Force no-auth mode for testing by removing OAuth credentials +# This ensures anonymous JWT tokens are created automatically +os.environ.pop('GOOGLE_OAUTH_CLIENT_ID', None) +os.environ.pop('GOOGLE_OAUTH_CLIENT_SECRET', None) + +from src.config.settings import clients +from src.session_manager import SessionManager + + +@pytest.fixture(scope="session") +def event_loop(): + """Create an instance of the default event loop for the test session.""" + loop = asyncio.get_event_loop_policy().new_event_loop() + yield loop + loop.close() + + +@pytest_asyncio.fixture +async def opensearch_client(): + """OpenSearch client for testing - requires running OpenSearch.""" + await clients.initialize() + yield clients.opensearch + # Cleanup test indices after tests + try: + await clients.opensearch.indices.delete(index="test_documents") + except Exception: + pass + + +@pytest.fixture +def session_manager(): + """Session manager for testing.""" + return SessionManager("test-secret-key") + + +@pytest.fixture +def test_documents_dir(): + """Create a temporary directory with test documents.""" + with tempfile.TemporaryDirectory() as temp_dir: + test_dir = Path(temp_dir) + + # Create some test files in supported formats + (test_dir / "test1.md").write_text("# Machine Learning Document\n\nThis is a test document about machine learning.") + (test_dir / "test2.md").write_text("# AI Document\n\nAnother document discussing artificial intelligence.") + (test_dir / "test3.md").write_text("# Data Science Document\n\nThis is a markdown file about data science.") + + # Create subdirectory with files + sub_dir = test_dir / "subdir" + sub_dir.mkdir() + (sub_dir / "nested.md").write_text("# Neural Networks\n\nNested document about neural networks.") + + yield test_dir + + +@pytest.fixture +def test_single_file(): + """Create a single test file.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='_test_document.md', delete=False) as f: + f.write("# Single Test Document\n\nThis is a test document about OpenRAG testing framework. This document contains multiple sentences to ensure proper chunking. The content should be indexed and searchable in OpenSearch after processing.") + temp_path = f.name + + yield temp_path + + # Cleanup + try: + os.unlink(temp_path) + except FileNotFoundError: + pass \ No newline at end of file diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..e27cd7ab --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1 @@ +# Integration tests package \ No newline at end of file diff --git a/tests/integration/test_api_endpoints.py b/tests/integration/test_api_endpoints.py new file mode 100644 index 00000000..e2ae3c18 --- /dev/null +++ b/tests/integration/test_api_endpoints.py @@ -0,0 +1,193 @@ +import asyncio +import os +from pathlib import Path + +import httpx +import pytest + + +async def wait_for_service_ready(client: httpx.AsyncClient, timeout_s: float = 30.0): + """Poll existing endpoints until the app and OpenSearch are ready. 
+ + Strategy: + - GET /auth/me should return 200 immediately (confirms app is up). + - POST /search with query "*" avoids embeddings and checks OpenSearch/index readiness. + """ + deadline = asyncio.get_event_loop().time() + timeout_s + last_err = None + while asyncio.get_event_loop().time() < deadline: + try: + r1 = await client.get("/auth/me") + if r1.status_code != 200: + await asyncio.sleep(0.5) + continue + # match_all readiness probe; no embeddings + r2 = await client.post("/search", json={"query": "*", "limit": 0}) + if r2.status_code == 200: + return + last_err = r2.text + except Exception as e: + last_err = str(e) + await asyncio.sleep(0.5) + raise AssertionError(f"Service not ready in time: {last_err}") + + +@pytest.mark.parametrize("disable_langflow_ingest", [True, False]) +@pytest.mark.asyncio +async def test_upload_and_search_endpoint(tmp_path: Path, disable_langflow_ingest: bool): + """Boot the ASGI app and exercise /upload and /search endpoints.""" + # Ensure we route uploads to traditional processor and disable startup ingest + os.environ["DISABLE_INGEST_WITH_LANGFLOW"] = "true" if disable_langflow_ingest else "false" + os.environ["DISABLE_STARTUP_INGEST"] = "true" + # Force no-auth mode so endpoints bypass authentication + os.environ["GOOGLE_OAUTH_CLIENT_ID"] = "" + os.environ["GOOGLE_OAUTH_CLIENT_SECRET"] = "" + + # Import after env vars to ensure settings pick them up. Clear cached modules + import sys + # Clear cached modules so settings pick up env and router sees new flag + for mod in [ + "src.api.router", + "src.api.connector_router", + "src.config.settings", + "src.main", + ]: + sys.modules.pop(mod, None) + from src.main import create_app, startup_tasks + from src.config.settings import clients, INDEX_NAME + + # Ensure a clean index before startup + await clients.initialize() + try: + await clients.opensearch.indices.delete(index=INDEX_NAME) + except Exception: + pass + + app = await create_app() + # Manually run startup tasks since httpx ASGI transport here doesn't manage lifespan + await startup_tasks(app.state.services) + + transport = httpx.ASGITransport(app=app) + async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: + # Wait for app + OpenSearch readiness using existing endpoints + await wait_for_service_ready(client) + + # Create a temporary markdown file to upload + file_path = tmp_path / "endpoint_test_doc.md" + file_text = ( + "# Single Test Document\n\n" + "This is a test document about OpenRAG testing framework. " + "The content should be indexed and searchable in OpenSearch after processing." 
+ ) + file_path.write_text(file_text) + + # POST via router (multipart) + files = { + "file": ( + file_path.name, + file_path.read_bytes(), + "text/markdown", + ) + } + upload_resp = await client.post("/upload", files=files) + body = upload_resp.json() + # Router now returns 201 + task_id (async) regardless of mode + assert upload_resp.status_code == 201, upload_resp.text + assert isinstance(body.get("task_id"), str) + + # Poll search for the specific content until it's indexed + async def _wait_for_indexed(timeout_s: float = 30.0): + deadline = asyncio.get_event_loop().time() + timeout_s + while asyncio.get_event_loop().time() < deadline: + resp = await client.post( + "/search", + json={"query": "OpenRAG testing framework", "limit": 5}, + ) + if resp.status_code == 200 and resp.json().get("results"): + return resp + await asyncio.sleep(0.5) + return resp + + search_resp = await _wait_for_indexed() + + # POST /search + assert search_resp.status_code == 200, search_resp.text + search_body = search_resp.json() + + # Basic shape and at least one hit + assert isinstance(search_body.get("results"), list) + assert len(search_body["results"]) >= 0 + # When hits exist, confirm our phrase is present in top result content + if search_body["results"]: + top = search_body["results"][0] + assert "text" in top or "content" in top + text = top.get("text") or top.get("content") + assert isinstance(text, str) + assert "testing" in text.lower() + # Explicitly close global clients to avoid aiohttp warnings + from src.config.settings import clients + try: + if getattr(clients, "opensearch", None): + await clients.opensearch.close() + if getattr(clients, "langflow_http_client", None): + await clients.langflow_http_client.aclose() + except Exception: + pass + + +@pytest.mark.parametrize("disable_langflow_ingest", [True, False]) +@pytest.mark.asyncio +async def test_router_upload_ingest_traditional(tmp_path: Path, disable_langflow_ingest: bool): + """Exercise the router endpoint to ensure it routes to traditional upload when Langflow ingest is disabled.""" + os.environ["DISABLE_INGEST_WITH_LANGFLOW"] = "true" if disable_langflow_ingest else "false" + os.environ["DISABLE_STARTUP_INGEST"] = "true" + os.environ["GOOGLE_OAUTH_CLIENT_ID"] = "" + os.environ["GOOGLE_OAUTH_CLIENT_SECRET"] = "" + + import sys + for mod in [ + "src.api.router", + "src.api.connector_router", + "src.config.settings", + "src.main", + ]: + sys.modules.pop(mod, None) + from src.main import create_app, startup_tasks + from src.config.settings import clients, INDEX_NAME + + # Ensure a clean index before startup + await clients.initialize() + try: + await clients.opensearch.indices.delete(index=INDEX_NAME) + except Exception: + pass + + app = await create_app() + await startup_tasks(app.state.services) + transport = httpx.ASGITransport(app=app) + async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: + await wait_for_service_ready(client) + + file_path = tmp_path / "router_test_doc.md" + file_path.write_text("# Router Test\n\nThis file validates the upload router.") + + files = { + "file": ( + file_path.name, + file_path.read_bytes(), + "text/markdown", + ) + } + + resp = await client.post("/upload", files=files) + data = resp.json() + assert resp.status_code == 201, resp.text + assert isinstance(data.get("task_id"), str) + from src.config.settings import clients + try: + if getattr(clients, "opensearch", None): + await clients.opensearch.close() + if getattr(clients, "langflow_http_client", None): + await 
clients.langflow_http_client.aclose() + except Exception: + pass diff --git a/tests/integration/test_startup_ingest.py b/tests/integration/test_startup_ingest.py new file mode 100644 index 00000000..5ce62a94 --- /dev/null +++ b/tests/integration/test_startup_ingest.py @@ -0,0 +1,114 @@ +import asyncio +import os +from pathlib import Path + +import httpx +import pytest + + +async def wait_for_ready(client: httpx.AsyncClient, timeout_s: float = 30.0): + deadline = asyncio.get_event_loop().time() + timeout_s + last_err = None + while asyncio.get_event_loop().time() < deadline: + try: + r1 = await client.get("/auth/me") + if r1.status_code != 200: + await asyncio.sleep(0.5) + continue + r2 = await client.post("/search", json={"query": "*", "limit": 0}) + if r2.status_code == 200: + return + last_err = r2.text + except Exception as e: + last_err = str(e) + await asyncio.sleep(0.5) + raise AssertionError(f"Service not ready in time: {last_err}") + + +def count_files_in_documents() -> int: + base_dir = Path(os.getcwd()) / "documents" + if not base_dir.is_dir(): + return 0 + return sum(1 for _ in base_dir.rglob("*") if _.is_file()) + + +@pytest.mark.parametrize("disable_langflow_ingest", [True, False]) +@pytest.mark.asyncio +async def test_startup_ingest_creates_task(disable_langflow_ingest: bool): + # Ensure startup ingest runs and choose pipeline per param + os.environ["DISABLE_STARTUP_INGEST"] = "false" + os.environ["DISABLE_INGEST_WITH_LANGFLOW"] = ( + "true" if disable_langflow_ingest else "false" + ) + # Force no-auth mode for simpler endpoint access + os.environ["GOOGLE_OAUTH_CLIENT_ID"] = "" + os.environ["GOOGLE_OAUTH_CLIENT_SECRET"] = "" + + # Reload settings to pick up env for this test run + import sys + + for mod in [ + "src.api.router", + "src.api.connector_router", + "src.config.settings", + "src.main", + ]: + sys.modules.pop(mod, None) + + from src.main import create_app, startup_tasks + from src.config.settings import clients, INDEX_NAME + + # Ensure a clean index before startup + await clients.initialize() + try: + await clients.opensearch.indices.delete(index=INDEX_NAME) + except Exception: + pass + + app = await create_app() + # Trigger startup tasks explicitly + await startup_tasks(app.state.services) + + transport = httpx.ASGITransport(app=app) + async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: + await wait_for_ready(client) + + expected_files = count_files_in_documents() + + # Poll /tasks until we see at least one startup ingest task + async def _wait_for_task(timeout_s: float = 60.0): + deadline = asyncio.get_event_loop().time() + timeout_s + last = None + while asyncio.get_event_loop().time() < deadline: + resp = await client.get("/tasks") + if resp.status_code == 200: + data = resp.json() + last = data + tasks = data.get("tasks") if isinstance(data, dict) else None + if isinstance(tasks, list) and len(tasks) > 0: + return tasks + await asyncio.sleep(0.5) + return last.get("tasks") if isinstance(last, dict) else last + + tasks = await _wait_for_task() + if expected_files == 0: + return # Nothing to do + if not (isinstance(tasks, list) and len(tasks) > 0): + # Fallback: verify that documents were indexed as a sign of startup ingest + sr = await client.post("/search", json={"query": "*", "limit": 1}) + assert sr.status_code == 200, sr.text + total = sr.json().get("total") + assert isinstance(total, int) and total >= 0, "Startup ingest did not index documents" + return + newest = tasks[0] + assert "task_id" in newest + assert 
newest.get("total_files") == expected_files + # Explicitly close global clients to avoid aiohttp warnings + from src.config.settings import clients + try: + if getattr(clients, "opensearch", None): + await clients.opensearch.close() + if getattr(clients, "langflow_http_client", None): + await clients.langflow_http_client.aclose() + except Exception: + pass diff --git a/uv.lock b/uv.lock index 08a14492..40e7f39a 100644 --- a/uv.lock +++ b/uv.lock @@ -243,6 +243,59 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "coverage" +version = "7.10.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/14/70/025b179c993f019105b79575ac6edb5e084fb0f0e63f15cdebef4e454fb5/coverage-7.10.6.tar.gz", hash = "sha256:f644a3ae5933a552a29dbb9aa2f90c677a875f80ebea028e5a52a4f429044b90", size = 823736, upload-time = "2025-08-29T15:35:16.668Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/e7/917e5953ea29a28c1057729c1d5af9084ab6d9c66217523fd0e10f14d8f6/coverage-7.10.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ffea0575345e9ee0144dfe5701aa17f3ba546f8c3bb48db62ae101afb740e7d6", size = 217351, upload-time = "2025-08-29T15:33:45.438Z" }, + { url = "https://files.pythonhosted.org/packages/eb/86/2e161b93a4f11d0ea93f9bebb6a53f113d5d6e416d7561ca41bb0a29996b/coverage-7.10.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:95d91d7317cde40a1c249d6b7382750b7e6d86fad9d8eaf4fa3f8f44cf171e80", size = 217600, upload-time = "2025-08-29T15:33:47.269Z" }, + { url = "https://files.pythonhosted.org/packages/0e/66/d03348fdd8df262b3a7fb4ee5727e6e4936e39e2f3a842e803196946f200/coverage-7.10.6-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3e23dd5408fe71a356b41baa82892772a4cefcf758f2ca3383d2aa39e1b7a003", size = 248600, upload-time = "2025-08-29T15:33:48.953Z" }, + { url = "https://files.pythonhosted.org/packages/73/dd/508420fb47d09d904d962f123221bc249f64b5e56aa93d5f5f7603be475f/coverage-7.10.6-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0f3f56e4cb573755e96a16501a98bf211f100463d70275759e73f3cbc00d4f27", size = 251206, upload-time = "2025-08-29T15:33:50.697Z" }, + { url = "https://files.pythonhosted.org/packages/e9/1f/9020135734184f439da85c70ea78194c2730e56c2d18aee6e8ff1719d50d/coverage-7.10.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:db4a1d897bbbe7339946ffa2fe60c10cc81c43fab8b062d3fcb84188688174a4", size = 252478, upload-time = "2025-08-29T15:33:52.303Z" }, + { url = "https://files.pythonhosted.org/packages/a4/a4/3d228f3942bb5a2051fde28c136eea23a761177dc4ff4ef54533164ce255/coverage-7.10.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d8fd7879082953c156d5b13c74aa6cca37f6a6f4747b39538504c3f9c63d043d", size = 250637, upload-time = "2025-08-29T15:33:53.67Z" }, + { url = "https://files.pythonhosted.org/packages/36/e3/293dce8cdb9a83de971637afc59b7190faad60603b40e32635cbd15fbf61/coverage-7.10.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:28395ca3f71cd103b8c116333fa9db867f3a3e1ad6a084aa3725ae002b6583bc", size = 248529, upload-time = "2025-08-29T15:33:55.022Z" }, + { url = 
"https://files.pythonhosted.org/packages/90/26/64eecfa214e80dd1d101e420cab2901827de0e49631d666543d0e53cf597/coverage-7.10.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:61c950fc33d29c91b9e18540e1aed7d9f6787cc870a3e4032493bbbe641d12fc", size = 250143, upload-time = "2025-08-29T15:33:56.386Z" }, + { url = "https://files.pythonhosted.org/packages/3e/70/bd80588338f65ea5b0d97e424b820fb4068b9cfb9597fbd91963086e004b/coverage-7.10.6-cp313-cp313-win32.whl", hash = "sha256:160c00a5e6b6bdf4e5984b0ef21fc860bc94416c41b7df4d63f536d17c38902e", size = 219770, upload-time = "2025-08-29T15:33:58.063Z" }, + { url = "https://files.pythonhosted.org/packages/a7/14/0b831122305abcc1060c008f6c97bbdc0a913ab47d65070a01dc50293c2b/coverage-7.10.6-cp313-cp313-win_amd64.whl", hash = "sha256:628055297f3e2aa181464c3808402887643405573eb3d9de060d81531fa79d32", size = 220566, upload-time = "2025-08-29T15:33:59.766Z" }, + { url = "https://files.pythonhosted.org/packages/83/c6/81a83778c1f83f1a4a168ed6673eeedc205afb562d8500175292ca64b94e/coverage-7.10.6-cp313-cp313-win_arm64.whl", hash = "sha256:df4ec1f8540b0bcbe26ca7dd0f541847cc8a108b35596f9f91f59f0c060bfdd2", size = 219195, upload-time = "2025-08-29T15:34:01.191Z" }, + { url = "https://files.pythonhosted.org/packages/d7/1c/ccccf4bf116f9517275fa85047495515add43e41dfe8e0bef6e333c6b344/coverage-7.10.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:c9a8b7a34a4de3ed987f636f71881cd3b8339f61118b1aa311fbda12741bff0b", size = 218059, upload-time = "2025-08-29T15:34:02.91Z" }, + { url = "https://files.pythonhosted.org/packages/92/97/8a3ceff833d27c7492af4f39d5da6761e9ff624831db9e9f25b3886ddbca/coverage-7.10.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8dd5af36092430c2b075cee966719898f2ae87b636cefb85a653f1d0ba5d5393", size = 218287, upload-time = "2025-08-29T15:34:05.106Z" }, + { url = "https://files.pythonhosted.org/packages/92/d8/50b4a32580cf41ff0423777a2791aaf3269ab60c840b62009aec12d3970d/coverage-7.10.6-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b0353b0f0850d49ada66fdd7d0c7cdb0f86b900bb9e367024fd14a60cecc1e27", size = 259625, upload-time = "2025-08-29T15:34:06.575Z" }, + { url = "https://files.pythonhosted.org/packages/7e/7e/6a7df5a6fb440a0179d94a348eb6616ed4745e7df26bf2a02bc4db72c421/coverage-7.10.6-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d6b9ae13d5d3e8aeca9ca94198aa7b3ebbc5acfada557d724f2a1f03d2c0b0df", size = 261801, upload-time = "2025-08-29T15:34:08.006Z" }, + { url = "https://files.pythonhosted.org/packages/3a/4c/a270a414f4ed5d196b9d3d67922968e768cd971d1b251e1b4f75e9362f75/coverage-7.10.6-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:675824a363cc05781b1527b39dc2587b8984965834a748177ee3c37b64ffeafb", size = 264027, upload-time = "2025-08-29T15:34:09.806Z" }, + { url = "https://files.pythonhosted.org/packages/9c/8b/3210d663d594926c12f373c5370bf1e7c5c3a427519a8afa65b561b9a55c/coverage-7.10.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:692d70ea725f471a547c305f0d0fc6a73480c62fb0da726370c088ab21aed282", size = 261576, upload-time = "2025-08-29T15:34:11.585Z" }, + { url = "https://files.pythonhosted.org/packages/72/d0/e1961eff67e9e1dba3fc5eb7a4caf726b35a5b03776892da8d79ec895775/coverage-7.10.6-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:851430a9a361c7a8484a36126d1d0ff8d529d97385eacc8dfdc9bfc8c2d2cbe4", size = 259341, upload-time = "2025-08-29T15:34:13.159Z" }, + { url = 
"https://files.pythonhosted.org/packages/3a/06/d6478d152cd189b33eac691cba27a40704990ba95de49771285f34a5861e/coverage-7.10.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d9369a23186d189b2fc95cc08b8160ba242057e887d766864f7adf3c46b2df21", size = 260468, upload-time = "2025-08-29T15:34:14.571Z" }, + { url = "https://files.pythonhosted.org/packages/ed/73/737440247c914a332f0b47f7598535b29965bf305e19bbc22d4c39615d2b/coverage-7.10.6-cp313-cp313t-win32.whl", hash = "sha256:92be86fcb125e9bda0da7806afd29a3fd33fdf58fba5d60318399adf40bf37d0", size = 220429, upload-time = "2025-08-29T15:34:16.394Z" }, + { url = "https://files.pythonhosted.org/packages/bd/76/b92d3214740f2357ef4a27c75a526eb6c28f79c402e9f20a922c295c05e2/coverage-7.10.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6b3039e2ca459a70c79523d39347d83b73f2f06af5624905eba7ec34d64d80b5", size = 221493, upload-time = "2025-08-29T15:34:17.835Z" }, + { url = "https://files.pythonhosted.org/packages/fc/8e/6dcb29c599c8a1f654ec6cb68d76644fe635513af16e932d2d4ad1e5ac6e/coverage-7.10.6-cp313-cp313t-win_arm64.whl", hash = "sha256:3fb99d0786fe17b228eab663d16bee2288e8724d26a199c29325aac4b0319b9b", size = 219757, upload-time = "2025-08-29T15:34:19.248Z" }, + { url = "https://files.pythonhosted.org/packages/d3/aa/76cf0b5ec00619ef208da4689281d48b57f2c7fde883d14bf9441b74d59f/coverage-7.10.6-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:6008a021907be8c4c02f37cdc3ffb258493bdebfeaf9a839f9e71dfdc47b018e", size = 217331, upload-time = "2025-08-29T15:34:20.846Z" }, + { url = "https://files.pythonhosted.org/packages/65/91/8e41b8c7c505d398d7730206f3cbb4a875a35ca1041efc518051bfce0f6b/coverage-7.10.6-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5e75e37f23eb144e78940b40395b42f2321951206a4f50e23cfd6e8a198d3ceb", size = 217607, upload-time = "2025-08-29T15:34:22.433Z" }, + { url = "https://files.pythonhosted.org/packages/87/7f/f718e732a423d442e6616580a951b8d1ec3575ea48bcd0e2228386805e79/coverage-7.10.6-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0f7cb359a448e043c576f0da00aa8bfd796a01b06aa610ca453d4dde09cc1034", size = 248663, upload-time = "2025-08-29T15:34:24.425Z" }, + { url = "https://files.pythonhosted.org/packages/e6/52/c1106120e6d801ac03e12b5285e971e758e925b6f82ee9b86db3aa10045d/coverage-7.10.6-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c68018e4fc4e14b5668f1353b41ccf4bc83ba355f0e1b3836861c6f042d89ac1", size = 251197, upload-time = "2025-08-29T15:34:25.906Z" }, + { url = "https://files.pythonhosted.org/packages/3d/ec/3a8645b1bb40e36acde9c0609f08942852a4af91a937fe2c129a38f2d3f5/coverage-7.10.6-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cd4b2b0707fc55afa160cd5fc33b27ccbf75ca11d81f4ec9863d5793fc6df56a", size = 252551, upload-time = "2025-08-29T15:34:27.337Z" }, + { url = "https://files.pythonhosted.org/packages/a1/70/09ecb68eeb1155b28a1d16525fd3a9b65fbe75337311a99830df935d62b6/coverage-7.10.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4cec13817a651f8804a86e4f79d815b3b28472c910e099e4d5a0e8a3b6a1d4cb", size = 250553, upload-time = "2025-08-29T15:34:29.065Z" }, + { url = "https://files.pythonhosted.org/packages/c6/80/47df374b893fa812e953b5bc93dcb1427a7b3d7a1a7d2db33043d17f74b9/coverage-7.10.6-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:f2a6a8e06bbda06f78739f40bfb56c45d14eb8249d0f0ea6d4b3d48e1f7c695d", size = 248486, upload-time = "2025-08-29T15:34:30.897Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/65/9f98640979ecee1b0d1a7164b589de720ddf8100d1747d9bbdb84be0c0fb/coverage-7.10.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:081b98395ced0d9bcf60ada7661a0b75f36b78b9d7e39ea0790bb4ed8da14747", size = 249981, upload-time = "2025-08-29T15:34:32.365Z" }, + { url = "https://files.pythonhosted.org/packages/1f/55/eeb6603371e6629037f47bd25bef300387257ed53a3c5fdb159b7ac8c651/coverage-7.10.6-cp314-cp314-win32.whl", hash = "sha256:6937347c5d7d069ee776b2bf4e1212f912a9f1f141a429c475e6089462fcecc5", size = 220054, upload-time = "2025-08-29T15:34:34.124Z" }, + { url = "https://files.pythonhosted.org/packages/15/d1/a0912b7611bc35412e919a2cd59ae98e7ea3b475e562668040a43fb27897/coverage-7.10.6-cp314-cp314-win_amd64.whl", hash = "sha256:adec1d980fa07e60b6ef865f9e5410ba760e4e1d26f60f7e5772c73b9a5b0713", size = 220851, upload-time = "2025-08-29T15:34:35.651Z" }, + { url = "https://files.pythonhosted.org/packages/ef/2d/11880bb8ef80a45338e0b3e0725e4c2d73ffbb4822c29d987078224fd6a5/coverage-7.10.6-cp314-cp314-win_arm64.whl", hash = "sha256:a80f7aef9535442bdcf562e5a0d5a5538ce8abe6bb209cfbf170c462ac2c2a32", size = 219429, upload-time = "2025-08-29T15:34:37.16Z" }, + { url = "https://files.pythonhosted.org/packages/83/c0/1f00caad775c03a700146f55536ecd097a881ff08d310a58b353a1421be0/coverage-7.10.6-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:0de434f4fbbe5af4fa7989521c655c8c779afb61c53ab561b64dcee6149e4c65", size = 218080, upload-time = "2025-08-29T15:34:38.919Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c4/b1c5d2bd7cc412cbeb035e257fd06ed4e3e139ac871d16a07434e145d18d/coverage-7.10.6-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6e31b8155150c57e5ac43ccd289d079eb3f825187d7c66e755a055d2c85794c6", size = 218293, upload-time = "2025-08-29T15:34:40.425Z" }, + { url = "https://files.pythonhosted.org/packages/3f/07/4468d37c94724bf6ec354e4ec2f205fda194343e3e85fd2e59cec57e6a54/coverage-7.10.6-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:98cede73eb83c31e2118ae8d379c12e3e42736903a8afcca92a7218e1f2903b0", size = 259800, upload-time = "2025-08-29T15:34:41.996Z" }, + { url = "https://files.pythonhosted.org/packages/82/d8/f8fb351be5fee31690cd8da768fd62f1cfab33c31d9f7baba6cd8960f6b8/coverage-7.10.6-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f863c08f4ff6b64fa8045b1e3da480f5374779ef187f07b82e0538c68cb4ff8e", size = 261965, upload-time = "2025-08-29T15:34:43.61Z" }, + { url = "https://files.pythonhosted.org/packages/e8/70/65d4d7cfc75c5c6eb2fed3ee5cdf420fd8ae09c4808723a89a81d5b1b9c3/coverage-7.10.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b38261034fda87be356f2c3f42221fdb4171c3ce7658066ae449241485390d5", size = 264220, upload-time = "2025-08-29T15:34:45.387Z" }, + { url = "https://files.pythonhosted.org/packages/98/3c/069df106d19024324cde10e4ec379fe2fb978017d25e97ebee23002fbadf/coverage-7.10.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0e93b1476b79eae849dc3872faeb0bf7948fd9ea34869590bc16a2a00b9c82a7", size = 261660, upload-time = "2025-08-29T15:34:47.288Z" }, + { url = "https://files.pythonhosted.org/packages/fc/8a/2974d53904080c5dc91af798b3a54a4ccb99a45595cc0dcec6eb9616a57d/coverage-7.10.6-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:ff8a991f70f4c0cf53088abf1e3886edcc87d53004c7bb94e78650b4d3dac3b5", size = 259417, upload-time = "2025-08-29T15:34:48.779Z" }, + { url = 
"https://files.pythonhosted.org/packages/30/38/9616a6b49c686394b318974d7f6e08f38b8af2270ce7488e879888d1e5db/coverage-7.10.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ac765b026c9f33044419cbba1da913cfb82cca1b60598ac1c7a5ed6aac4621a0", size = 260567, upload-time = "2025-08-29T15:34:50.718Z" }, + { url = "https://files.pythonhosted.org/packages/76/16/3ed2d6312b371a8cf804abf4e14895b70e4c3491c6e53536d63fd0958a8d/coverage-7.10.6-cp314-cp314t-win32.whl", hash = "sha256:441c357d55f4936875636ef2cfb3bee36e466dcf50df9afbd398ce79dba1ebb7", size = 220831, upload-time = "2025-08-29T15:34:52.653Z" }, + { url = "https://files.pythonhosted.org/packages/d5/e5/d38d0cb830abede2adb8b147770d2a3d0e7fecc7228245b9b1ae6c24930a/coverage-7.10.6-cp314-cp314t-win_amd64.whl", hash = "sha256:073711de3181b2e204e4870ac83a7c4853115b42e9cd4d145f2231e12d670930", size = 221950, upload-time = "2025-08-29T15:34:54.212Z" }, + { url = "https://files.pythonhosted.org/packages/f4/51/e48e550f6279349895b0ffcd6d2a690e3131ba3a7f4eafccc141966d4dea/coverage-7.10.6-cp314-cp314t-win_arm64.whl", hash = "sha256:137921f2bac5559334ba66122b753db6dc5d1cf01eb7b64eb412bb0d064ef35b", size = 219969, upload-time = "2025-08-29T15:34:55.83Z" }, + { url = "https://files.pythonhosted.org/packages/44/0c/50db5379b615854b5cf89146f8f5bd1d5a9693d7f3a987e269693521c404/coverage-7.10.6-py3-none-any.whl", hash = "sha256:92c4ecf6bf11b2e85fd4d8204814dc26e6a19f0c9d938c207c5cb0eadfcabbe3", size = 208986, upload-time = "2025-08-29T15:35:14.506Z" }, +] + [[package]] name = "cryptography" version = "45.0.6" @@ -738,6 +791,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, ] +[[package]] +name = "iniconfig" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -1421,6 +1483,10 @@ dependencies = [ { name = "opensearch-py", extra = ["async"] }, { name = "psutil" }, { name = "pyjwt" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "pytest-cov" }, + { name = "pytest-mock" }, { name = "python-dotenv" }, { name = "python-multipart" }, { name = "rich" }, @@ -1448,6 +1514,10 @@ requires-dist = [ { name = "opensearch-py", extras = ["async"], specifier = ">=3.0.0" }, { name = "psutil", specifier = ">=7.0.0" }, { name = "pyjwt", specifier = ">=2.8.0" }, + { name = "pytest", specifier = ">=8.0.0" }, + { name = "pytest-asyncio", specifier = ">=0.21.0" }, + { name = "pytest-cov", specifier = ">=4.0.0" }, + { name = "pytest-mock", specifier = ">=3.12.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "python-multipart", specifier = ">=0.0.20" }, { name = "rich", specifier = ">=13.0.0" }, @@ -1831,6 +1901,60 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/e1/6b/2706497c86e8d69fb76afe5ea857fe1794621aa0f3b1d863feb953fe0f22/pypdfium2-4.30.1-py3-none-win_arm64.whl", hash = "sha256:c2b6d63f6d425d9416c08d2511822b54b8e3ac38e639fc41164b1d75584b3a8c", size = 2814810, upload-time = "2024-12-19T19:28:09.857Z" }, ] +[[package]] +name = "pytest" +version = "8.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, +] + +[[package]] +name = "pytest-asyncio" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4e/51/f8794af39eeb870e87a8c8068642fc07bce0c854d6865d7dd0f2a9d338c2/pytest_asyncio-1.1.0.tar.gz", hash = "sha256:796aa822981e01b68c12e4827b8697108f7205020f24b5793b3c41555dab68ea", size = 46652, upload-time = "2025-07-16T04:29:26.393Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/9d/bf86eddabf8c6c9cb1ea9a869d6873b46f105a5d292d3a6f7071f5b07935/pytest_asyncio-1.1.0-py3-none-any.whl", hash = "sha256:5fe2d69607b0bd75c656d1211f969cadba035030156745ee09e7d71740e58ecf", size = 15157, upload-time = "2025-07-16T04:29:24.929Z" }, +] + +[[package]] +name = "pytest-cov" +version = "7.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coverage" }, + { name = "pluggy" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5e/f7/c933acc76f5208b3b00089573cf6a2bc26dc80a8aece8f52bb7d6b1855ca/pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1", size = 54328, upload-time = "2025-09-09T10:57:02.113Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, +] + +[[package]] +name = "pytest-mock" +version = "3.15.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/99/3323ee5c16b3637b4d941c362182d3e749c11e400bea31018c42219f3a98/pytest_mock-3.15.0.tar.gz", hash = "sha256:ab896bd190316b9d5d87b277569dfcdf718b2d049a2ccff5f7aca279c002a1cf", size = 33838, upload-time = "2025-09-04T20:57:48.679Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/b3/7fefc43fb706380144bcd293cc6e446e6f637ddfa8b83f48d1734156b529/pytest_mock-3.15.0-py3-none-any.whl", hash = "sha256:ef2219485fb1bd256b00e7ad7466ce26729b30eadfc7cbcdb4fa9a92ca68db6f", size = 10050, upload-time = "2025-09-04T20:57:47.274Z" }, +] + [[package]] name = "python-bidi" version = "0.6.6" From 
4ca3f179745546d21b2029b45f2a276731f81211 Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 11 Sep 2025 16:25:26 -0400 Subject: [PATCH 008/113] unnecessary comment --- src/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.py b/src/main.py index bb745451..873dd458 100644 --- a/src/main.py +++ b/src/main.py @@ -450,7 +450,6 @@ async def create_app(): # Create route handlers with service dependencies injected routes = [ - # Langflow direct upload/ingest endpoints removed in favor of router (/router/upload_ingest) Route( "/langflow/files",
From 2ef560ca7f13c54aaf8f8e85712bbf8f227b3f34 Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 11:54:08 -0400 Subject: [PATCH 009/113] simplify makefile --- Makefile | 63 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 6ac03b93..e9c0367d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,17 @@ # OpenRAG Development Makefile # Provides easy commands for development workflow -.PHONY: help dev dev-cpu dev-local infra stop clean build logs shell-backend shell-frontend install test test-integration test-unit test-ingest test-search test-coverage backend frontend install-be install-fe build-be build-fe logs-be logs-fe logs-lf logs-os shell-be shell-lf shell-os restart status health db-reset flow-upload quick setup +# Load variables from .env if present so `make` commands pick them up +ifneq (,$(wildcard .env)) + include .env + # Export all simple KEY=VALUE pairs to the environment for child processes + export $(shell sed -n 's/^\([A-Za-z_][A-Za-z0-9_]*\)=.*/\1/p' .env) +endif + +.PHONY: help dev dev-cpu dev-local infra stop clean build logs shell-backend shell-frontend install \ + test test-integration test-ci \ + backend frontend install-be install-fe build-be build-fe logs-be logs-fe logs-lf logs-os \ + shell-be shell-lf shell-os restart status health db-reset flow-upload quick setup # Default target help: @@ -32,12 +42,9 @@ help: @echo " shell-lf - Shell into langflow container" @echo "" @echo "Testing:" - @echo " test - Run all backend tests" + @echo " test - Run all backend tests" @echo " test-integration - Run integration tests (requires infra)" - @echo " test-unit - Run unit tests only" - @echo " test-ingest - Test file ingestion flows" - @echo " test-search - Test search functionality" - @echo " test-coverage - Run tests with coverage report" + @echo " test-ci - Start infra, run integration tests, tear down" @echo " lint - Run linting checks" @echo "" @@ -174,21 +181,29 @@ test-integration: @echo "💡 Make sure to run 'make infra' first!" uv run pytest tests/integration/ -v -test-unit: - @echo "🧪 Running unit tests..." - uv run pytest tests/unit/ -v -test-ingest: - @echo "🧪 Testing file ingestion flows..." - uv run pytest tests/integration/test_file_ingest.py -v -test-search: - @echo "🧪 Testing search functionality..." - uv run pytest tests/integration/test_search_flow.py -v -test-coverage: - @echo "🧪 Running tests with coverage report..."
- uv run pytest tests/ --cov=src --cov-report=term-missing --cov-report=html:htmlcov +# CI-friendly integration test target: brings up infra, waits, runs tests, tears down +test-ci: + @set -e; \ + echo "🚀 Starting infra (OpenSearch + Dashboards + Langflow)"; \ + make infra; \ + echo "⏳ Waiting for OpenSearch..."; \ + for i in $$(seq 1 60); do \ + curl -k -s https://localhost:9200 -u admin:$${OPENSEARCH_PASSWORD} >/dev/null 2>&1 && break || sleep 2; \ + done; \ + echo "⏳ Waiting for Langflow..."; \ + for i in $$(seq 1 60); do \ + curl -s http://localhost:7860/ >/dev/null 2>&1 && break || sleep 2; \ + done; \ + echo "🧪 Running integration tests"; \ + LOG_LEVEL=$${LOG_LEVEL:-DEBUG} \ + GOOGLE_OAUTH_CLIENT_ID="" \ + GOOGLE_OAUTH_CLIENT_SECRET="" \ + OPENSEARCH_HOST=localhost OPENSEARCH_PORT=9200 \ + OPENSEARCH_USERNAME=admin OPENSEARCH_PASSWORD=$${OPENSEARCH_PASSWORD} \ + DISABLE_STARTUP_INGEST=$${DISABLE_STARTUP_INGEST:-true} \ + uv run pytest tests/integration -vv -s -o log_cli=true --log-cli-level=DEBUG; \ + echo "🧹 Tearing down infra"; \ + docker compose down -v || true lint: @echo "🔍 Running linting checks..." @@ -204,13 +219,13 @@ health: @echo "🏥 Health check:" @echo "Backend: $$(curl -s http://localhost:8000/health 2>/dev/null || echo 'Not responding')" @echo "Langflow: $$(curl -s http://localhost:7860/health 2>/dev/null || echo 'Not responding')" - @echo "OpenSearch: $$(curl -s -k -u admin:$(shell grep OPENSEARCH_PASSWORD .env | cut -d= -f2) https://localhost:9200 2>/dev/null | jq -r .tagline 2>/dev/null || echo 'Not responding')" + @echo "OpenSearch: $$(curl -s -k -u admin:$${OPENSEARCH_PASSWORD} https://localhost:9200 2>/dev/null | jq -r .tagline 2>/dev/null || echo 'Not responding')" # Database operations db-reset: @echo "🗄️ Resetting OpenSearch indices..." - curl -X DELETE "http://localhost:9200/documents" -u admin:$$(grep OPENSEARCH_PASSWORD .env | cut -d= -f2) || true - curl -X DELETE "http://localhost:9200/knowledge_filters" -u admin:$$(grep OPENSEARCH_PASSWORD .env | cut -d= -f2) || true + curl -X DELETE "http://localhost:9200/documents" -u admin:$${OPENSEARCH_PASSWORD} || true + curl -X DELETE "http://localhost:9200/knowledge_filters" -u admin:$${OPENSEARCH_PASSWORD} || true @echo "Indices reset. Restart backend to recreate." # Flow management
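The next patch ("improve tests") adds an AppClients.close() helper to src/config/settings.py and wraps the test bodies in try/finally so the shared clients are always released. A minimal sketch of the teardown pattern the tests adopt, assuming the module-level clients object from src.config.settings as used in the patch below:

    from src.config.settings import clients

    async def run_with_cleanup(test_body):
        try:
            await test_body()
        finally:
            # clients.close() logs (rather than raises) per-client errors while
            # shutting down the OpenSearch, Langflow HTTP, and OpenAI clients;
            # the outer guard keeps teardown noise from masking a test failure.
            try:
                await clients.close()
            except Exception:
                pass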
From e23ed258c932c72ca67518fcafaf0665c6156a7c Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 11:54:28 -0400 Subject: [PATCH 010/113] improve tests --- src/config/settings.py | 23 +++ tests/integration/test_api_endpoints.py | 198 +++++++++++++---------- tests/integration/test_startup_ingest.py | 81 +++++----- 3 files changed, 172 insertions(+), 130 deletions(-) diff --git a/src/config/settings.py b/src/config/settings.py index ace9d5cb..dc9a6e23 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -297,6 +297,29 @@ class AppClients: return self + async def close(self): + """Close all client connections""" + try: + if hasattr(self, 'opensearch') and self.opensearch: + await self.opensearch.close() + self.opensearch = None + except Exception as e: + logger.warning("Error closing OpenSearch client", error=str(e)) + + try: + if hasattr(self, 'langflow_http_client') and self.langflow_http_client: + await self.langflow_http_client.aclose() + self.langflow_http_client = None + except Exception as e: + logger.warning("Error closing Langflow HTTP client", error=str(e)) + + try: + if hasattr(self, 'patched_async_client') and self.patched_async_client: + await self.patched_async_client.close() + self.patched_async_client = None + except Exception as e: + logger.warning("Error closing OpenAI client", error=str(e)) + async def ensure_langflow_client(self): """Ensure Langflow client exists; try to generate key and create client lazily.""" if self.langflow_client is not None: diff --git a/tests/integration/test_api_endpoints.py b/tests/integration/test_api_endpoints.py index e2ae3c18..60810563 100644 --- a/tests/integration/test_api_endpoints.py +++ b/tests/integration/test_api_endpoints.py @@ -60,79 +60,89 @@ async def test_upload_and_search_endpoint(tmp_path: Path, disable_langflow_inges await clients.initialize() try: await clients.opensearch.indices.delete(index=INDEX_NAME) + # Wait for deletion to complete + await asyncio.sleep(1) except Exception: pass app = await create_app() # Manually run startup tasks since httpx ASGI transport here doesn't manage lifespan await startup_tasks(app.state.services) + + # Verify index is truly empty after startup + try: + count_response = await clients.opensearch.count(index=INDEX_NAME) + doc_count = count_response.get('count', 0) + assert doc_count == 0, f"Index should be empty after startup but contains {doc_count} documents" + except Exception as e: + # If count fails, the index might not exist yet, which is fine + pass transport = httpx.ASGITransport(app=app) - async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: - # Wait for app + OpenSearch readiness using existing endpoints - await wait_for_service_ready(client) - - # Create a temporary markdown file to upload - file_path = tmp_path / "endpoint_test_doc.md" - file_text = ( - "# Single Test Document\n\n" - "This is a test document about OpenRAG testing framework. " - "The content should be indexed and searchable in OpenSearch after processing."
- ) - file_path.write_text(file_text) - - # POST via router (multipart) - files = { - "file": ( - file_path.name, - file_path.read_bytes(), - "text/markdown", - ) - } - upload_resp = await client.post("/upload", files=files) - body = upload_resp.json() - # Router now returns 201 + task_id (async) regardless of mode - assert upload_resp.status_code == 201, upload_resp.text - assert isinstance(body.get("task_id"), str) - - # Poll search for the specific content until it's indexed - async def _wait_for_indexed(timeout_s: float = 30.0): - deadline = asyncio.get_event_loop().time() + timeout_s - while asyncio.get_event_loop().time() < deadline: - resp = await client.post( - "/search", - json={"query": "OpenRAG testing framework", "limit": 5}, - ) - if resp.status_code == 200 and resp.json().get("results"): - return resp - await asyncio.sleep(0.5) - return resp - - search_resp = await _wait_for_indexed() - - # POST /search - assert search_resp.status_code == 200, search_resp.text - search_body = search_resp.json() - - # Basic shape and at least one hit - assert isinstance(search_body.get("results"), list) - assert len(search_body["results"]) >= 0 - # When hits exist, confirm our phrase is present in top result content - if search_body["results"]: - top = search_body["results"][0] - assert "text" in top or "content" in top - text = top.get("text") or top.get("content") - assert isinstance(text, str) - assert "testing" in text.lower() - # Explicitly close global clients to avoid aiohttp warnings - from src.config.settings import clients try: - if getattr(clients, "opensearch", None): - await clients.opensearch.close() - if getattr(clients, "langflow_http_client", None): - await clients.langflow_http_client.aclose() - except Exception: - pass + async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: + # Wait for app + OpenSearch readiness using existing endpoints + await wait_for_service_ready(client) + + # Create a temporary markdown file to upload + file_path = tmp_path / "endpoint_test_doc.md" + file_text = ( + "# Single Test Document\n\n" + "This is a test document about OpenRAG testing framework. " + "The content should be indexed and searchable in OpenSearch after processing." 
+ ) + file_path.write_text(file_text) + + # POST via router (multipart) + files = { + "file": ( + file_path.name, + file_path.read_bytes(), + "text/markdown", + ) + } + upload_resp = await client.post("/upload", files=files) + body = upload_resp.json() + # Router now returns 201 + task_id (async) regardless of mode + assert upload_resp.status_code == 201, upload_resp.text + assert isinstance(body.get("task_id"), str) + + # Poll search for the specific content until it's indexed + async def _wait_for_indexed(timeout_s: float = 30.0): + deadline = asyncio.get_event_loop().time() + timeout_s + while asyncio.get_event_loop().time() < deadline: + resp = await client.post( + "/search", + json={"query": "OpenRAG testing framework", "limit": 5}, + ) + if resp.status_code == 200 and resp.json().get("results"): + return resp + await asyncio.sleep(0.5) + return resp + + search_resp = await _wait_for_indexed() + + # POST /search + assert search_resp.status_code == 200, search_resp.text + search_body = search_resp.json() + + # Basic shape and at least one hit + assert isinstance(search_body.get("results"), list) + assert len(search_body["results"]) >= 0 + # When hits exist, confirm our phrase is present in top result content + if search_body["results"]: + top = search_body["results"][0] + assert "text" in top or "content" in top + text = top.get("text") or top.get("content") + assert isinstance(text, str) + assert "testing" in text.lower() + finally: + # Explicitly close global clients to avoid aiohttp warnings + from src.config.settings import clients + try: + await clients.close() + except Exception: + pass @pytest.mark.parametrize("disable_langflow_ingest", [True, False]) @@ -159,35 +169,45 @@ async def test_router_upload_ingest_traditional(tmp_path: Path, disable_langflow await clients.initialize() try: await clients.opensearch.indices.delete(index=INDEX_NAME) + # Wait for deletion to complete + await asyncio.sleep(1) except Exception: pass app = await create_app() await startup_tasks(app.state.services) - transport = httpx.ASGITransport(app=app) - async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: - await wait_for_service_ready(client) - - file_path = tmp_path / "router_test_doc.md" - file_path.write_text("# Router Test\n\nThis file validates the upload router.") - - files = { - "file": ( - file_path.name, - file_path.read_bytes(), - "text/markdown", - ) - } - - resp = await client.post("/upload", files=files) - data = resp.json() - assert resp.status_code == 201, resp.text - assert isinstance(data.get("task_id"), str) - from src.config.settings import clients + + # Verify index is truly empty after startup try: - if getattr(clients, "opensearch", None): - await clients.opensearch.close() - if getattr(clients, "langflow_http_client", None): - await clients.langflow_http_client.aclose() - except Exception: + count_response = await clients.opensearch.count(index=INDEX_NAME) + doc_count = count_response.get('count', 0) + assert doc_count == 0, f"Index should be empty after startup but contains {doc_count} documents" + except Exception as e: + # If count fails, the index might not exist yet, which is fine pass + transport = httpx.ASGITransport(app=app) + try: + async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: + await wait_for_service_ready(client) + + file_path = tmp_path / "router_test_doc.md" + file_path.write_text("# Router Test\n\nThis file validates the upload router.") + + files = { + "file": ( + file_path.name, + 
file_path.read_bytes(), + "text/markdown", + ) + } + + resp = await client.post("/upload", files=files) + data = resp.json() + assert resp.status_code == 201, resp.text + assert isinstance(data.get("task_id"), str) + finally: + from src.config.settings import clients + try: + await clients.close() + except Exception: + pass diff --git a/tests/integration/test_startup_ingest.py b/tests/integration/test_startup_ingest.py index 5ce62a94..436c4d28 100644 --- a/tests/integration/test_startup_ingest.py +++ b/tests/integration/test_startup_ingest.py @@ -70,45 +70,44 @@ async def test_startup_ingest_creates_task(disable_langflow_ingest: bool): await startup_tasks(app.state.services) transport = httpx.ASGITransport(app=app) - async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: - await wait_for_ready(client) - - expected_files = count_files_in_documents() - - # Poll /tasks until we see at least one startup ingest task - async def _wait_for_task(timeout_s: float = 60.0): - deadline = asyncio.get_event_loop().time() + timeout_s - last = None - while asyncio.get_event_loop().time() < deadline: - resp = await client.get("/tasks") - if resp.status_code == 200: - data = resp.json() - last = data - tasks = data.get("tasks") if isinstance(data, dict) else None - if isinstance(tasks, list) and len(tasks) > 0: - return tasks - await asyncio.sleep(0.5) - return last.get("tasks") if isinstance(last, dict) else last - - tasks = await _wait_for_task() - if expected_files == 0: - return # Nothing to do - if not (isinstance(tasks, list) and len(tasks) > 0): - # Fallback: verify that documents were indexed as a sign of startup ingest - sr = await client.post("/search", json={"query": "*", "limit": 1}) - assert sr.status_code == 200, sr.text - total = sr.json().get("total") - assert isinstance(total, int) and total >= 0, "Startup ingest did not index documents" - return - newest = tasks[0] - assert "task_id" in newest - assert newest.get("total_files") == expected_files - # Explicitly close global clients to avoid aiohttp warnings - from src.config.settings import clients try: - if getattr(clients, "opensearch", None): - await clients.opensearch.close() - if getattr(clients, "langflow_http_client", None): - await clients.langflow_http_client.aclose() - except Exception: - pass + async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: + await wait_for_ready(client) + + expected_files = count_files_in_documents() + + # Poll /tasks until we see at least one startup ingest task + async def _wait_for_task(timeout_s: float = 60.0): + deadline = asyncio.get_event_loop().time() + timeout_s + last = None + while asyncio.get_event_loop().time() < deadline: + resp = await client.get("/tasks") + if resp.status_code == 200: + data = resp.json() + last = data + tasks = data.get("tasks") if isinstance(data, dict) else None + if isinstance(tasks, list) and len(tasks) > 0: + return tasks + await asyncio.sleep(0.5) + return last.get("tasks") if isinstance(last, dict) else last + + tasks = await _wait_for_task() + if expected_files == 0: + return # Nothing to do + if not (isinstance(tasks, list) and len(tasks) > 0): + # Fallback: verify that documents were indexed as a sign of startup ingest + sr = await client.post("/search", json={"query": "*", "limit": 1}) + assert sr.status_code == 200, sr.text + total = sr.json().get("total") + assert isinstance(total, int) and total >= 0, "Startup ingest did not index documents" + return + newest = tasks[0] + assert 
"task_id" in newest + assert newest.get("total_files") == expected_files + finally: + # Explicitly close global clients to avoid aiohttp warnings + from src.config.settings import clients + try: + await clients.close() + except Exception: + pass From 33911052a6bf83ce43736894966578663c50c87a Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 11:55:15 -0400 Subject: [PATCH 011/113] add integration test action --- .github/workflows/test-integration.yml | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/workflows/test-integration.yml diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml new file mode 100644 index 00000000..8c4f971c --- /dev/null +++ b/.github/workflows/test-integration.yml @@ -0,0 +1,45 @@ +name: Integration Tests + +on: + pull_request: + push: + branches: + - main + - develop + +jobs: + tests: + runs-on: ubuntu-latest + env: + # Prefer repository/environment variable first, then secret, then a sane fallback + OPENSEARCH_PASSWORD: ${{ vars.OPENSEARCH_PASSWORD || secrets.OPENSEARCH_PASSWORD || 'OpenRag#2025!' }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up UV + uses: astral-sh/setup-uv@v3 + with: + version: latest + + - name: Python version + run: uv python install 3.13 + + - name: Install dependencies + run: uv sync + + - name: Run integration tests + env: + OPENSEARCH_HOST: localhost + OPENSEARCH_PORT: 9200 + OPENSEARCH_USERNAME: admin + OPENSEARCH_PASSWORD: ${{ env.OPENSEARCH_PASSWORD }} + LOG_LEVEL: DEBUG + # Force no-auth mode so tests bypass OAuth + GOOGLE_OAUTH_CLIENT_ID: "" + GOOGLE_OAUTH_CLIENT_SECRET: "" + # Disable startup ingest noise unless a test enables it + DISABLE_STARTUP_INGEST: "true" + run: | + make test-ci From 952dc6dc92c10329accc2be95340f00d16b72c85 Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 11:56:27 -0400 Subject: [PATCH 012/113] ci branches trigger --- .github/workflows/test-integration.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 8c4f971c..75b75ed3 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -5,7 +5,6 @@ on: push: branches: - main - - develop jobs: tests: From 57f893b622af55552a8b52af78ad8e223fabcd6a Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 12:33:02 -0400 Subject: [PATCH 013/113] ci node cleanup --- .github/workflows/test-integration.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 75b75ed3..19bacefd 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -14,6 +14,13 @@ jobs: OPENSEARCH_PASSWORD: ${{ vars.OPENSEARCH_PASSWORD || secrets.OPENSEARCH_PASSWORD || 'OpenRag#2025!' 
}} steps: + - run: df -h + - name: "node-cleanup" + run: | + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL + sudo docker image prune --all --force + sudo docker builder prune -a + - run: df -h - name: Checkout uses: actions/checkout@v4 From 463bb48222baab5be3ef79e4d3e5d3b5cb03fbfe Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 13:02:56 -0400 Subject: [PATCH 014/113] devel and torch dependencies optional --- .github/workflows/test-integration.yml | 2 +- Dockerfile.backend | 2 +- Makefile | 4 +++- pyproject.toml | 27 +++++++++++++++++--------- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 19bacefd..8b1a0b74 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -33,7 +33,7 @@ jobs: run: uv python install 3.13 - name: Install dependencies - run: uv sync + run: uv sync --group dev - name: Run integration tests env: diff --git a/Dockerfile.backend b/Dockerfile.backend index 5d9d84f4..d314eefe 100644 --- a/Dockerfile.backend +++ b/Dockerfile.backend @@ -18,7 +18,7 @@ WORKDIR /app # Copy Python dependencies COPY pyproject.toml uv.lock ./ -RUN uv sync +RUN uv sync --extra torch-cu128 # Copy sample document and warmup script for docling COPY documents/warmup_ocr.pdf ./ diff --git a/Makefile b/Makefile index e9c0367d..eeab5a12 100644 --- a/Makefile +++ b/Makefile @@ -118,7 +118,7 @@ install: install-be install-fe install-be: @echo "๐Ÿ“ฆ Installing backend dependencies..." - uv sync + uv sync --extra torch-cu128 install-fe: @echo "๐Ÿ“ฆ Installing frontend dependencies..." @@ -184,6 +184,8 @@ test-integration: # CI-friendly integration test target: brings up infra, waits, runs tests, tears down test-ci: @set -e; \ + echo "๐Ÿ“ฆ Installing test dependencies..."; \ + uv sync --group dev; \ echo "๐Ÿš€ Starting infra (OpenSearch + Dashboards + Langflow)"; \ make infra; \ echo "โณ Waiting for OpenSearch..."; \ diff --git a/pyproject.toml b/pyproject.toml index 04200e93..8e816391 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,6 @@ dependencies = [ "pyjwt>=2.8.0", "python-multipart>=0.0.20", "starlette>=0.47.1", - "torch>=2.7.1", "uvicorn>=0.35.0", "boto3>=1.35.0", "psutil>=7.0.0", @@ -27,12 +26,15 @@ dependencies = [ "python-dotenv>=1.0.0", "textual-fspicker>=0.6.0", "structlog>=25.4.0", - "pytest>=8.0.0", - "pytest-asyncio>=0.21.0", - "pytest-mock>=3.12.0", - "pytest-cov>=4.0.0", ] +[project.optional-dependencies] +torch = ["torch", "torchvision"] +torch-cu128 = ["torch", "torchvision"] + +[dependency-groups] +dev = ["pytest>=8", "pytest-asyncio>=0.21.0", "pytest-mock>=3.12.0", "pytest-cov>=4.0.0"] + [project.scripts] openrag = "tui.main:run_tui" @@ -41,13 +43,20 @@ package = true [tool.uv.sources] torch = [ - { index = "pytorch-cu128", marker = "sys_platform == 'linux' and platform_machine == 'x86_64'" }, + { index = "pytorch-cu128", extra = "torch-cu128" }, + { index = "pytorch-cpu", extra = "torch" } ] torchvision = [ - { index = "pytorch-cu128", marker = "sys_platform == 'linux' and platform_machine == 'x86_64'" }, + { index = "pytorch-cu128", extra = "torch-cu128" }, + { index = "pytorch-cpu", extra = "torch" } ] [[tool.uv.index]] -name = "pytorch-cu128" -url = "https://download.pytorch.org/whl/cu128" +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" explicit 
= true From 364f24a2ca1f690ca221f47b96353a95f964102f Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 13:11:32 -0400 Subject: [PATCH 015/113] torch dep fix --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8e816391..de2e562c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,12 +43,12 @@ package = true [tool.uv.sources] torch = [ - { index = "pytorch-cu128", extra = "torch-cu128" }, - { index = "pytorch-cpu", extra = "torch" } + { index = "pytorch-cu128", extra = "torch-cu128", marker = "sys_platform == 'linux'" }, + { index = "pytorch-cpu", extra = "torch", marker = "sys_platform != 'linux'" } ] torchvision = [ - { index = "pytorch-cu128", extra = "torch-cu128" }, - { index = "pytorch-cpu", extra = "torch" } + { index = "pytorch-cu128", extra = "torch-cu128", marker = "sys_platform == 'linux'" }, + { index = "pytorch-cpu", extra = "torch", marker = "sys_platform != 'linux'" } ] [[tool.uv.index]]
From b5d0d23fbe5334fcacea5f931cc4d9a9308eca27 Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 13:40:14 -0400 Subject: [PATCH 016/113] ci cpu only --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index eeab5a12..2defe2bb 100644 --- a/Makefile +++ b/Makefile @@ -186,8 +186,8 @@ test-ci: @set -e; \ echo "📦 Installing test dependencies..."; \ uv sync --group dev; \ - echo "🚀 Starting infra (OpenSearch + Dashboards + Langflow)"; \ - make infra; \ + echo "🚀 Starting infra (OpenSearch + Dashboards + Langflow) with CPU containers"; \ + docker compose -f docker-compose-cpu.yml up -d opensearch dashboards langflow; \ echo "⏳ Waiting for OpenSearch..."; \ for i in $$(seq 1 60); do \ curl -k -s https://localhost:9200 -u admin:$${OPENSEARCH_PASSWORD} >/dev/null 2>&1 && break || sleep 2; \ done; \
From f0b608e776ef4c75b1d8980e8262f7b6c32bd7a6 Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 14:11:10 -0400 Subject: [PATCH 017/113] add openai key to workflow --- .github/workflows/test-integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 8b1a0b74..0ff6b8ff 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -12,6 +12,7 @@ jobs: env: # Prefer repository/environment variable first, then secret, then a sane fallback OPENSEARCH_PASSWORD: ${{ vars.OPENSEARCH_PASSWORD || secrets.OPENSEARCH_PASSWORD || 'OpenRag#2025!' }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} steps: - run: df -h
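The next patch has the test fixtures generate the JWT signing keys before a SessionManager is constructed, so a fresh checkout or CI runner is not missing the key files SessionManager expects. A minimal sketch of that fixture ordering, mirroring the conftest.py change below (generate_jwt_keys() lives in src.main and skips generation when the keys already exist on disk):

    import pytest

    from src.main import generate_jwt_keys
    from src.session_manager import SessionManager

    @pytest.fixture
    def session_manager():
        # The RSA key pair must exist on disk before SessionManager loads it.
        generate_jwt_keys()
        return SessionManager("test-secret-key")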
From 1549161a336a1dd645eb926cd43410cc60066b37 Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 14:35:42 -0400 Subject: [PATCH 018/113] generate keys --- tests/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 2edf3d65..87722481 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ os.environ.pop('GOOGLE_OAUTH_CLIENT_SECRET', None) from src.config.settings import clients from src.session_manager import SessionManager +from src.main import generate_jwt_keys @pytest.fixture(scope="session") @@ -42,6 +43,8 @@ async def opensearch_client(): @pytest.fixture def session_manager(): """Session manager for testing.""" + # Generate RSA keys before creating SessionManager + generate_jwt_keys() return SessionManager("test-secret-key")
From 2210f6ac7365f97d2dc3a97d84311fb31d086c17 Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 15:32:09 -0400 Subject: [PATCH 019/113] debug keys dir --- .github/workflows/test-integration.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 0ff6b8ff..b883b747 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -36,6 +36,13 @@ jobs: - name: Install dependencies run: uv sync --group dev + - name: Debug keys directory + run: | + ls -la keys/ || echo "keys dir doesn't exist" + whoami + pwd + id + - name: Run integration tests env: OPENSEARCH_HOST: localhost
From dd6886aec6bb30ad2c8b553d21873ea37abf2844 Mon Sep 17 00:00:00 2001 From: phact Date: Fri, 12 Sep 2025 15:40:31 -0400 Subject: [PATCH 020/113] debug keys --- .github/workflows/test-integration.yml | 7 ------- src/main.py | 20 +++++++++++++++++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index b883b747..0ff6b8ff 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -36,13 +36,6 @@ jobs: - name: Install dependencies run: uv sync --group dev - - name: Debug keys directory - run: | - ls -la keys/ || echo "keys dir doesn't exist" - whoami - pwd - id - - name: Run integration tests env: OPENSEARCH_HOST: localhost diff --git a/src/main.py b/src/main.py index 873dd458..46b5fa7e 100644 --- a/src/main.py +++ b/src/main.py @@ -183,15 +183,19 @@ def generate_jwt_keys(): # Generate keys if they don't exist if not os.path.exists(private_key_path): try: + logger.info("Generating RSA keys", private_key_path=private_key_path, public_key_path=public_key_path) + # Generate private key - subprocess.run( + result = subprocess.run( ["openssl", "genrsa", "-out", private_key_path, "2048"], check=True, capture_output=True, + text=True, ) + logger.info("Private key generation completed", stdout=result.stdout, stderr=result.stderr) # Generate public key - subprocess.run( + result = subprocess.run( [ "openssl", "rsa", @@ -203,11 +207,21 @@ def generate_jwt_keys(): ], check=True, capture_output=True, + text=True, ) + logger.info("Public key generation completed", stdout=result.stdout, stderr=result.stderr) + + # Verify files were created and are readable + logger.info("Verifying generated keys") + logger.info("Private key exists", exists=os.path.exists(private_key_path)) + logger.info("Public key exists", exists=os.path.exists(public_key_path)) + if os.path.exists(private_key_path): + stat_info
= os.stat(private_key_path) + logger.info("Private key permissions", mode=oct(stat_info.st_mode), uid=stat_info.st_uid, gid=stat_info.st_gid) logger.info("Generated RSA keys for JWT signing") except subprocess.CalledProcessError as e: - logger.error("Failed to generate RSA keys", error=str(e)) + logger.error("Failed to generate RSA keys", error=str(e), stdout=e.stdout, stderr=e.stderr) raise else: logger.info("RSA keys already exist, skipping generation") From ccd5be6bdca4066152fe3d66ccd71576455db0a0 Mon Sep 17 00:00:00 2001 From: phact Date: Mon, 15 Sep 2025 15:49:28 -0400 Subject: [PATCH 021/113] ls keys --- .github/workflows/test-integration.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 0ff6b8ff..46bbe977 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -50,3 +50,5 @@ jobs: DISABLE_STARTUP_INGEST: "true" run: | make test-ci + echo "Keys directory after tests:" + ls -la keys/ || echo "No keys directory" From 07a2cabbcf1fe7a51244094286293d5b93628aa0 Mon Sep 17 00:00:00 2001 From: Eric Hare Date: Tue, 30 Sep 2025 11:39:47 -0700 Subject: [PATCH 022/113] feat: Add Google Drive Folder Selection --- .../src/components/cloud-picker/provider-handlers.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/frontend/src/components/cloud-picker/provider-handlers.ts b/frontend/src/components/cloud-picker/provider-handlers.ts index 4a39312f..9fe27656 100644 --- a/frontend/src/components/cloud-picker/provider-handlers.ts +++ b/frontend/src/components/cloud-picker/provider-handlers.ts @@ -52,12 +52,17 @@ export class GoogleDriveHandler { try { this.onPickerStateChange?.(true); + // Create a view for regular documents + const docsView = new window.google.picker.DocsView() + .setIncludeFolders(true) + .setSelectFolderEnabled(true); + const picker = new window.google.picker.PickerBuilder() - .addView(window.google.picker.ViewId.DOCS) + .addView(docsView) .addView(window.google.picker.ViewId.FOLDERS) .setOAuthToken(this.accessToken) .enableFeature(window.google.picker.Feature.MULTISELECT_ENABLED) - .setTitle("Select files from Google Drive") + .setTitle("Select files or folders from Google Drive") .setCallback(data => this.pickerCallback(data, onFileSelected)) .build(); From 776394465aed1d4891dfe2ba0300e3e279435fc5 Mon Sep 17 00:00:00 2001 From: Eric Hare Date: Sun, 5 Oct 2025 08:50:44 -0700 Subject: [PATCH 023/113] Add folder processing google drive connector --- .../cloud-picker/provider-handlers.ts | 1 - src/connectors/google_drive/connector.py | 231 ++++++++++++++---- src/connectors/langflow_connector_service.py | 55 ++++- src/connectors/service.py | 68 ++++-- 4 files changed, 282 insertions(+), 73 deletions(-) diff --git a/frontend/src/components/cloud-picker/provider-handlers.ts b/frontend/src/components/cloud-picker/provider-handlers.ts index 9fe27656..5b0a8258 100644 --- a/frontend/src/components/cloud-picker/provider-handlers.ts +++ b/frontend/src/components/cloud-picker/provider-handlers.ts @@ -59,7 +59,6 @@ export class GoogleDriveHandler { const picker = new window.google.picker.PickerBuilder() .addView(docsView) - .addView(window.google.picker.ViewId.FOLDERS) .setOAuthToken(this.accessToken) .enableFeature(window.google.picker.Feature.MULTISELECT_ENABLED) .setTitle("Select files or folders from Google Drive") diff --git a/src/connectors/google_drive/connector.py b/src/connectors/google_drive/connector.py index 
48a445bf..66b67519 100644 --- a/src/connectors/google_drive/connector.py +++ b/src/connectors/google_drive/connector.py @@ -1,21 +1,20 @@ import io import os -from pathlib import Path import time from collections import deque from dataclasses import dataclass -from typing import Dict, List, Any, Optional, Iterable, Set +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Set from googleapiclient.errors import HttpError from googleapiclient.http import MediaIoBaseDownload + from utils.logging_config import get_logger -logger = get_logger(__name__) - -# Project-specific base types (adjust imports to your project) from ..base import BaseConnector, ConnectorDocument, DocumentACL from .oauth import GoogleDriveOAuth +logger = get_logger(__name__) # ------------------------- # Config model @@ -32,8 +31,8 @@ class GoogleDriveConfig: recursive: bool = True # Shared Drives control - drive_id: Optional[str] = None # when set, we use corpora='drive' - corpora: Optional[str] = None # 'user' | 'drive' | 'domain'; auto-picked if None + drive_id: Optional[str] = None # when set, we use corpora='drive' + corpora: Optional[str] = None # 'user' | 'drive' | 'domain'; auto-picked if None # Optional filtering include_mime_types: Optional[List[str]] = None @@ -80,7 +79,6 @@ class GoogleDriveConnector(BaseConnector): _FILE_ID_ALIASES = ("file_ids", "selected_file_ids", "selected_files") _FOLDER_ID_ALIASES = ("folder_ids", "selected_folder_ids", "selected_folders") - def emit(self, doc: ConnectorDocument) -> None: """ Emit a ConnectorDocument instance. @@ -100,7 +98,9 @@ class GoogleDriveConnector(BaseConnector): # Token file default (so callback & workers don't need to pass it) project_root = Path(__file__).resolve().parent.parent.parent.parent - token_file = config.get("token_file") or str(project_root / "google_drive_token.json") + token_file = config.get("token_file") or str( + project_root / "google_drive_token.json" + ) Path(token_file).parent.mkdir(parents=True, exist_ok=True) if not isinstance(client_id, str) or not client_id.strip(): @@ -115,7 +115,9 @@ class GoogleDriveConnector(BaseConnector): ) # Normalize incoming IDs from any of the supported alias keys - def _first_present_list(cfg: Dict[str, Any], keys: Iterable[str]) -> Optional[List[str]]: + def _first_present_list( + cfg: Dict[str, Any], keys: Iterable[str] + ) -> Optional[List[str]]: for k in keys: v = cfg.get(k) if v: # accept non-empty list @@ -151,6 +153,7 @@ class GoogleDriveConnector(BaseConnector): # Drive client is built in authenticate() from google.oauth2.credentials import Credentials + self.creds: Optional[Credentials] = None self.service: Any = None @@ -214,7 +217,7 @@ class GoogleDriveConnector(BaseConnector): "id, name, mimeType, modifiedTime, createdTime, size, " "webViewLink, parents, owners, driveId" ), - **self._drives_flags, + **self._drives_get_flags, ) .execute() ) @@ -285,7 +288,9 @@ class GoogleDriveConnector(BaseConnector): Fetch metadata for a file by ID (resolving shortcuts). """ if self.service is None: - raise RuntimeError("Google Drive service is not initialized. Please authenticate first.") + raise RuntimeError( + "Google Drive service is not initialized. Please authenticate first."
+ ) try: meta = ( self.service.files() @@ -323,24 +328,40 @@ class GoogleDriveConnector(BaseConnector): def _iter_selected_items(self) -> List[Dict[str, Any]]: """ Return a de-duplicated list of file metadata for the selected scope: - - explicit file_ids + - explicit file_ids (automatically expands folders to their contents) - items inside folder_ids (with optional recursion) Shortcuts are resolved to their targets automatically. """ seen: Set[str] = set() items: List[Dict[str, Any]] = [] + folders_to_expand: List[str] = [] - # Explicit files + # Process file_ids: separate actual files from folders if self.cfg.file_ids: for fid in self.cfg.file_ids: meta = self._get_file_meta_by_id(fid) - if meta and meta["id"] not in seen: + if not meta: + continue + + # If it's a folder, add to folders_to_expand instead + if meta.get("mimeType") == "application/vnd.google-apps.folder": + logger.debug( + f"Item {fid} ({meta.get('name')}) is a folder, " + f"will expand to contents" + ) + folders_to_expand.append(fid) + elif meta["id"] not in seen: + # It's a regular file, add it directly seen.add(meta["id"]) items.append(meta) - # Folders + # Collect all folders to expand (from both file_ids and folder_ids) if self.cfg.folder_ids: - folder_children = self._bfs_expand_folders(self.cfg.folder_ids) + folders_to_expand.extend(self.cfg.folder_ids) + + # Expand all folders to their contents + if folders_to_expand: + folder_children = self._bfs_expand_folders(folders_to_expand) for meta in folder_children: meta = self._resolve_shortcut(meta) if meta.get("id") in seen: @@ -357,7 +378,11 @@ class GoogleDriveConnector(BaseConnector): items = self._filter_by_mime(items) # Exclude folders from final emits: - items = [m for m in items if m.get("mimeType") != "application/vnd.google-apps.folder"] + items = [ + m + for m in items + if m.get("mimeType") != "application/vnd.google-apps.folder" + ] return items # ------------------------- @@ -389,29 +414,85 @@ class GoogleDriveConnector(BaseConnector): def _download_file_bytes(self, file_meta: Dict[str, Any]) -> bytes: """ Download bytes for a given file (exporting if Google-native). + Raises ValueError if the item is a folder (folders cannot be downloaded). """ file_id = file_meta["id"] + file_name = file_meta.get("name", "unknown") mime_type = file_meta.get("mimeType") or "" - # Google-native: export - export_mime = self._pick_export_mime(mime_type) - if mime_type.startswith("application/vnd.google-apps."): - # default fallback if not overridden - #if not export_mime: - # export_mime = "application/pdf" - export_mime = "application/pdf" + logger.debug( + f"Downloading file {file_id} ({file_name}) with mimetype: {mime_type}" + ) + + # Folders cannot be downloaded or exported - this should never be reached + # as folders are automatically expanded in _iter_selected_items() + if mime_type == "application/vnd.google-apps.folder": + raise ValueError( + f"Cannot download folder {file_id} ({file_name}). " + f"This is a bug - folders should be automatically expanded before download." 
+ ) + + # According to https://stackoverflow.com/questions/65053558/google-drive-api-v3-files-export-method-throws-a-403-error-export-only-support + # export_media ONLY works for Google Docs Editors files (Docs, Sheets, Slides, Drawings) + # All other files (including other Google Apps types like Forms, Sites, Maps) must use get_media + + # Define which Google Workspace files are exportable + exportable_types = { + "application/vnd.google-apps.document", # Google Docs + "application/vnd.google-apps.spreadsheet", # Google Sheets + "application/vnd.google-apps.presentation", # Google Slides + "application/vnd.google-apps.drawing", # Google Drawings + } + + if mime_type in exportable_types: + # This is an exportable Google Workspace file - must use export_media + export_mime = self._pick_export_mime(mime_type) + if not export_mime: + # Default fallback for unsupported Google native types + export_mime = "application/pdf" + + logger.debug( + f"Using export_media for {file_id} ({mime_type} -> {export_mime})" + ) # NOTE: export_media does not accept supportsAllDrives/includeItemsFromAllDrives - request = self.service.files().export_media(fileId=file_id, mimeType=export_mime) + request = self.service.files().export_media( + fileId=file_id, mimeType=export_mime + ) else: + # This is a regular uploaded file (PDF, image, video, etc.) - use get_media + # Also handles non-exportable Google Apps files (Forms, Sites, Maps, etc.) + logger.debug(f"Using get_media for {file_id} ({mime_type})") # Binary download (get_media also doesn't accept the Drive flags) request = self.service.files().get_media(fileId=file_id) + # Download the file with error handling for misclassified Google Docs fh = io.BytesIO() downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) done = False - while not done: - status, done = downloader.next_chunk() - # Optional: you can log progress via status.progress() + + try: + while not done: + status, done = downloader.next_chunk() + # Optional: you can log progress via status.progress() + except HttpError as e: + # If download fails with "fileNotDownloadable", it's a Docs Editor file + # that wasn't properly detected. Retry with export_media. + if "fileNotDownloadable" in str(e) and mime_type not in exportable_types: + logger.warning( + f"Download failed for {file_id} ({mime_type}) with fileNotDownloadable error. " + f"Retrying with export_media (file might be a Google Doc)" + ) + export_mime = "application/pdf" + request = self.service.files().export_media( + fileId=file_id, mimeType=export_mime + ) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) + done = False + while not done: + status, done = downloader.next_chunk() + else: + raise return fh.getvalue() @@ -430,7 +511,9 @@ class GoogleDriveConnector(BaseConnector): # If still not authenticated, bail (caller should kick off OAuth init) if not await self.oauth.is_authenticated(): - logger.debug("authenticate: no valid credentials; run OAuth init/callback first.") + logger.debug( + "authenticate: no valid credentials; run OAuth init/callback first." + ) return False # Build Drive service from OAuth helper @@ -450,7 +533,7 @@ class GoogleDriveConnector(BaseConnector): self, page_token: Optional[str] = None, max_files: Optional[int] = None, - **kwargs + **kwargs, ) -> Dict[str, Any]: """ List files in the currently selected scope (file_ids/folder_ids/recursive). 
@@ -483,15 +566,24 @@ class GoogleDriveConnector(BaseConnector): except Exception: pass return {"files": [], "next_page_token": None} - + async def get_file_content(self, file_id: str) -> ConnectorDocument: """ Fetch a file's metadata and content from Google Drive and wrap it in a ConnectorDocument. + Raises FileNotFoundError if the ID is a folder (folders cannot be downloaded). """ meta = self._get_file_meta_by_id(file_id) if not meta: raise FileNotFoundError(f"Google Drive file not found: {file_id}") + # Check if this is a folder - folders cannot be downloaded + if meta.get("mimeType") == "application/vnd.google-apps.folder": + raise FileNotFoundError( + f"Cannot download folder {file_id} ({meta.get('name')}). " + f"Folders must be expanded to list their contents. " + f"This ID should not have been passed to get_file_content()." + ) + try: blob = self._download_file_bytes(meta) except Exception as e: @@ -527,11 +619,13 @@ class GoogleDriveConnector(BaseConnector): metadata={ "parents": meta.get("parents"), "driveId": meta.get("driveId"), - "size": int(meta.get("size", 0)) if str(meta.get("size", "")).isdigit() else None, + "size": int(meta.get("size", 0)) + if str(meta.get("size", "")).isdigit() + else None, }, ) return doc - + async def setup_subscription(self) -> str: """ Start a Google Drive Changes API watch (webhook). @@ -546,10 +640,14 @@ class GoogleDriveConnector(BaseConnector): # 1) Ensure we are authenticated and have a live Drive service ok = await self.authenticate() if not ok: - raise RuntimeError("GoogleDriveConnector.setup_subscription: not authenticated") + raise RuntimeError( + "GoogleDriveConnector.setup_subscription: not authenticated" + ) # 2) Resolve webhook address (no param in ABC, so pull from config/env) - webhook_address = getattr(self.cfg, "webhook_address", None) or os.getenv("GOOGLE_DRIVE_WEBHOOK_URL") + webhook_address = getattr(self.cfg, "webhook_address", None) or os.getenv( + "GOOGLE_DRIVE_WEBHOOK_URL" + ) if not webhook_address: raise RuntimeError( "GoogleDriveConnector.setup_subscription: webhook URL not configured. " @@ -600,7 +698,9 @@ class GoogleDriveConnector(BaseConnector): } if not isinstance(channel_id, str) or not channel_id: - raise RuntimeError(f"Drive watch returned invalid channel id: {channel_id!r}") + raise RuntimeError( + f"Drive watch returned invalid channel id: {channel_id!r}" + ) return channel_id @@ -665,13 +765,20 @@ class GoogleDriveConnector(BaseConnector): return False try: - self.service.channels().stop(body={"id": subscription_id, "resourceId": resource_id}).execute() + self.service.channels().stop( + body={"id": subscription_id, "resourceId": resource_id} + ).execute() # 4) Clear local bookkeeping - if getattr(self, "_active_channel", None) and self._active_channel.get("channel_id") == subscription_id: + if ( + getattr(self, "_active_channel", None) + and self._active_channel.get("channel_id") == subscription_id + ): self._active_channel = {} - if hasattr(self, "_subscriptions") and isinstance(self._subscriptions, dict): + if hasattr(self, "_subscriptions") and isinstance( + self._subscriptions, dict + ): self._subscriptions.pop(subscription_id, None) return True @@ -682,7 +789,7 @@ class GoogleDriveConnector(BaseConnector): except Exception: pass return False - + async def handle_webhook(self, payload: Dict[str, Any]) -> List[str]: """ Process a Google Drive Changes webhook. 
@@ -722,7 +829,9 @@ class GoogleDriveConnector(BaseConnector): except Exception as e: selected_ids = set() try: - logger.error(f"handle_webhook: scope build failed, proceeding unfiltered: {e}") + logger.error( + f"handle_webhook: scope build failed, proceeding unfiltered: {e}" + ) except Exception: pass @@ -759,7 +868,11 @@ class GoogleDriveConnector(BaseConnector): # Filter to our selected scope if we have one; otherwise accept all if selected_ids and (rid not in selected_ids): # Shortcut target might be in scope even if the shortcut isn't - tgt = fobj.get("shortcutDetails", {}).get("targetId") if fobj else None + tgt = ( + fobj.get("shortcutDetails", {}).get("targetId") + if fobj + else None + ) if not (tgt and tgt in selected_ids): continue @@ -808,7 +921,9 @@ class GoogleDriveConnector(BaseConnector): blob = self._download_file_bytes(meta) except HttpError as e: # Skip/record failures - logger.error(f"Failed to download {meta.get('name')} ({meta.get('id')}): {e}") + logger.error( + f"Failed to download {meta.get('name')} ({meta.get('id')}): {e}" + ) continue from datetime import datetime @@ -838,7 +953,9 @@ class GoogleDriveConnector(BaseConnector): "webViewLink": meta.get("webViewLink"), "parents": meta.get("parents"), "driveId": meta.get("driveId"), - "size": int(meta.get("size", 0)) if str(meta.get("size", "")).isdigit() else None, + "size": int(meta.get("size", 0)) + if str(meta.get("size", "")).isdigit() + else None, }, content=blob, ) @@ -849,7 +966,9 @@ class GoogleDriveConnector(BaseConnector): # ------------------------- def get_start_page_token(self) -> str: # getStartPageToken accepts supportsAllDrives (not includeItemsFromAllDrives) - resp = self.service.changes().getStartPageToken(**self._drives_get_flags).execute() + resp = ( + self.service.changes().getStartPageToken(**self._drives_get_flags).execute() + ) return resp["startPageToken"] def poll_changes_and_sync(self) -> Optional[str]: @@ -888,7 +1007,10 @@ class GoogleDriveConnector(BaseConnector): # Match scope if fid not in selected_ids: # also consider shortcut target - if file_obj.get("mimeType") == "application/vnd.google-apps.shortcut": + if ( + file_obj.get("mimeType") + == "application/vnd.google-apps.shortcut" + ): tgt = file_obj.get("shortcutDetails", {}).get("targetId") if tgt and tgt in selected_ids: pass @@ -923,7 +1045,10 @@ class GoogleDriveConnector(BaseConnector): modified_time=parse_datetime(resolved.get("modifiedTime")), mimetype=str(resolved.get("mimeType", "")), acl=DocumentACL(), # Set appropriate ACL if needed - metadata={"parents": resolved.get("parents"), "driveId": resolved.get("driveId")}, + metadata={ + "parents": resolved.get("parents"), + "driveId": resolved.get("driveId"), + }, content=blob, ) self.emit(doc) @@ -945,7 +1070,9 @@ class GoogleDriveConnector(BaseConnector): # ------------------------- # Optional: webhook stubs # ------------------------- - def build_watch_body(self, webhook_address: str, channel_id: Optional[str] = None) -> Dict[str, Any]: + def build_watch_body( + self, webhook_address: str, channel_id: Optional[str] = None + ) -> Dict[str, Any]: """ Prepare the request body for changes.watch if you use webhooks. 
""" @@ -964,7 +1091,7 @@ class GoogleDriveConnector(BaseConnector): body = self.build_watch_body(webhook_address) result = ( self.service.changes() - .watch(pageToken=page_token, body=body, **self._drives_flags) + .watch(pageToken=page_token, body=body, **self._drives_get_flags) .execute() ) return result @@ -974,7 +1101,9 @@ class GoogleDriveConnector(BaseConnector): Stop a previously started webhook watch. """ try: - self.service.channels().stop(body={"id": channel_id, "resourceId": resource_id}).execute() + self.service.channels().stop( + body={"id": channel_id, "resourceId": resource_id} + ).execute() return True except HttpError as e: diff --git a/src/connectors/langflow_connector_service.py b/src/connectors/langflow_connector_service.py index 545c6190..f79a43d9 100644 --- a/src/connectors/langflow_connector_service.py +++ b/src/connectors/langflow_connector_service.py @@ -1,5 +1,3 @@ -import os -import tempfile from typing import Any, Dict, List, Optional # Create custom processor for connector files using Langflow @@ -60,14 +58,14 @@ class LangflowConnectorService: # Create temporary file from document content with auto_cleanup_tempfile(suffix=suffix) as tmp_path: # Write document content to temp file - with open(tmp_path, 'wb') as f: + with open(tmp_path, "wb") as f: f.write(document.content) # Step 1: Upload file to Langflow logger.debug("Uploading file to Langflow", filename=document.filename) content = document.content file_tuple = ( - document.filename.replace(" ", "_").replace("/", "_")+suffix, + document.filename.replace(" ", "_").replace("/", "_") + suffix, content, document.mimetype or "application/octet-stream", ) @@ -255,7 +253,10 @@ class LangflowConnectorService: file_ids: List[str], jwt_token: str = None, ) -> str: - """Sync specific files by their IDs using Langflow processing""" + """ + Sync specific files by their IDs using Langflow processing. + Automatically expands folders to their contents. + """ if not self.task_service: raise ValueError( "TaskService not available - connector sync requires task service dependency" @@ -278,10 +279,50 @@ class LangflowConnectorService: owner_name = user.name if user else None owner_email = user.email if user else None + # Temporarily set file_ids in the connector's config so list_files() can use them + # Store the original values to restore later + cfg = getattr(connector, "cfg", None) + original_file_ids = None + original_folder_ids = None + + if cfg is not None: + original_file_ids = getattr(cfg, "file_ids", None) + original_folder_ids = getattr(cfg, "folder_ids", None) + + try: + # Set the file_ids we want to sync in the connector's config + if cfg is not None: + cfg.file_ids = file_ids # type: ignore + cfg.folder_ids = None # type: ignore + + # Get the expanded list of file IDs (folders will be expanded to their contents) + # This uses the connector's list_files() which calls _iter_selected_items() + result = await connector.list_files() + expanded_file_ids = [f["id"] for f in result.get("files", [])] + + if not expanded_file_ids: + logger.warning( + f"No files found after expanding file_ids. " + f"Original IDs: {file_ids}. This may indicate all IDs were folders " + f"with no contents, or files that were filtered out." 
+ ) + # Return empty task rather than failing + raise ValueError("No files to sync after expanding folders") + + except Exception as e: + logger.error(f"Failed to expand file_ids via list_files(): {e}") + # Fallback to original file_ids if expansion fails + expanded_file_ids = file_ids + finally: + # Restore original config values + if cfg is not None: + cfg.file_ids = original_file_ids # type: ignore + cfg.folder_ids = original_folder_ids # type: ignore + processor = LangflowConnectorFileProcessor( self, connection_id, - file_ids, + expanded_file_ids, user_id, jwt_token=jwt_token, owner_name=owner_name, @@ -290,7 +331,7 @@ class LangflowConnectorService: # Create custom task using TaskService task_id = await self.task_service.create_custom_task( - user_id, file_ids, processor + user_id, expanded_file_ids, processor ) return task_id diff --git a/src/connectors/service.py b/src/connectors/service.py index 792d8d1f..278743d3 100644 --- a/src/connectors/service.py +++ b/src/connectors/service.py @@ -1,16 +1,11 @@ -import tempfile -import os -from typing import Dict, Any, List, Optional +from typing import Any, Dict, List, Optional -from .base import BaseConnector, ConnectorDocument from utils.logging_config import get_logger -logger = get_logger(__name__) -from .google_drive import GoogleDriveConnector -from .sharepoint import SharePointConnector -from .onedrive import OneDriveConnector +from .base import BaseConnector, ConnectorDocument from .connection_manager import ConnectionManager + logger = get_logger(__name__) @@ -56,9 +51,11 @@ class ConnectorService: # Create temporary file from document content from utils.file_utils import auto_cleanup_tempfile - with auto_cleanup_tempfile(suffix=self._get_file_extension(document.mimetype)) as tmp_path: + with auto_cleanup_tempfile( + suffix=self._get_file_extension(document.mimetype) + ) as tmp_path: # Write document content to temp file - with open(tmp_path, 'wb') as f: + with open(tmp_path, "wb") as f: f.write(document.content) # Use existing process_file_common function with connector document metadata @@ -71,6 +68,7 @@ class ConnectorService: # Process using consolidated processing pipeline from models.processors import TaskProcessor + processor = TaskProcessor(document_service=doc_service) result = await processor.process_document_standard( file_path=tmp_path, @@ -301,7 +299,10 @@ class ConnectorService: file_ids: List[str], jwt_token: str = None, ) -> str: - """Sync specific files by their IDs (used for webhook-triggered syncs)""" + """ + Sync specific files by their IDs (used for webhook-triggered syncs or manual selection). + Automatically expands folders to their contents. 
+ """ if not self.task_service: raise ValueError( "TaskService not available - connector sync requires task service dependency" @@ -324,14 +325,53 @@ class ConnectorService: owner_name = user.name if user else None owner_email = user.email if user else None + # Temporarily set file_ids in the connector's config so list_files() can use them + # Store the original values to restore later + original_file_ids = None + original_folder_ids = None + + if hasattr(connector, "cfg"): + original_file_ids = getattr(connector.cfg, "file_ids", None) + original_folder_ids = getattr(connector.cfg, "folder_ids", None) + + try: + # Set the file_ids we want to sync in the connector's config + if hasattr(connector, "cfg"): + connector.cfg.file_ids = file_ids # type: ignore + connector.cfg.folder_ids = None # type: ignore + + # Get the expanded list of file IDs (folders will be expanded to their contents) + # This uses the connector's list_files() which calls _iter_selected_items() + result = await connector.list_files() + expanded_file_ids = [f["id"] for f in result.get("files", [])] + + if not expanded_file_ids: + logger.warning( + f"No files found after expanding file_ids. " + f"Original IDs: {file_ids}. This may indicate all IDs were folders " + f"with no contents, or files that were filtered out." + ) + # Return empty task rather than failing + raise ValueError("No files to sync after expanding folders") + + except Exception as e: + logger.error(f"Failed to expand file_ids via list_files(): {e}") + # Fallback to original file_ids if expansion fails + expanded_file_ids = file_ids + finally: + # Restore original config values + if hasattr(connector, "cfg"): + connector.cfg.file_ids = original_file_ids # type: ignore + connector.cfg.folder_ids = original_folder_ids # type: ignore + # Create custom processor for specific connector files from models.processors import ConnectorFileProcessor - # We'll pass file_ids as the files_info, the processor will handle ID-only files + # Use expanded_file_ids which has folders already expanded processor = ConnectorFileProcessor( self, connection_id, - file_ids, + expanded_file_ids, user_id, jwt_token=jwt_token, owner_name=owner_name, @@ -340,7 +380,7 @@ class ConnectorService: # Create custom task using TaskService task_id = await self.task_service.create_custom_task( - user_id, file_ids, processor + user_id, expanded_file_ids, processor ) return task_id From 55203d9c69530f5e8f6673c6a4887a2ff58f8382 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 6 Oct 2025 12:07:17 -0400 Subject: [PATCH 024/113] Update openrag_agent.json --- flows/openrag_agent.json | 285 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 279 insertions(+), 6 deletions(-) diff --git a/flows/openrag_agent.json b/flows/openrag_agent.json index 597011a4..74307fee 100644 --- a/flows/openrag_agent.json +++ b/flows/openrag_agent.json @@ -170,6 +170,31 @@ "sourceHandle": "{ล“dataTypeล“:ล“TextInputล“,ล“idล“:ล“TextInput-aHsQbล“,ล“nameล“:ล“textล“,ล“output_typesล“:[ล“Messageล“]}", "target": "OpenSearch-iYfjf", "targetHandle": "{ล“fieldNameล“:ล“filter_expressionล“,ล“idล“:ล“OpenSearch-iYfjfล“,ล“inputTypesล“:[ล“Messageล“],ล“typeล“:ล“strล“}" + }, + { + "data": { + "sourceHandle": { + "dataType": "MCP", + "id": "MCP-7EY21", + "name": "component_as_tool", + "output_types": [ + "Tool" + ] + }, + "targetHandle": { + "fieldName": "tools", + "id": "Agent-crjWf", + "inputTypes": [ + "Tool" + ], + "type": "other" + } + }, + "id": 
"xy-edge__MCP-7EY21{ล“dataTypeล“:ล“MCPล“,ล“idล“:ล“MCP-7EY21ล“,ล“nameล“:ล“component_as_toolล“,ล“output_typesล“:[ล“Toolล“]}-Agent-crjWf{ล“fieldNameล“:ล“toolsล“,ล“idล“:ล“Agent-crjWfล“,ล“inputTypesล“:[ล“Toolล“],ล“typeล“:ล“otherล“}", + "source": "MCP-7EY21", + "sourceHandle": "{ล“dataTypeล“:ล“MCPล“,ล“idล“:ล“MCP-7EY21ล“,ล“nameล“:ล“component_as_toolล“,ล“output_typesล“:[ล“Toolล“]}", + "target": "Agent-crjWf", + "targetHandle": "{ล“fieldNameล“:ล“toolsล“,ล“idล“:ล“Agent-crjWfล“,ล“inputTypesล“:[ล“Toolล“],ล“typeล“:ล“otherล“}" } ], "nodes": [ @@ -730,7 +755,7 @@ ], "frozen": false, "icon": "OpenSearch", - "last_updated": "2025-10-04T05:41:33.344Z", + "last_updated": "2025-10-06T15:23:50.339Z", "legacy": false, "lf_version": "1.6.0", "metadata": { @@ -1384,7 +1409,7 @@ ], "frozen": false, "icon": "binary", - "last_updated": "2025-10-04T05:41:33.345Z", + "last_updated": "2025-10-06T15:23:50.341Z", "legacy": false, "lf_version": "1.6.0", "metadata": { @@ -1709,7 +1734,7 @@ ], "frozen": false, "icon": "bot", - "last_updated": "2025-10-04T05:41:33.399Z", + "last_updated": "2025-10-06T15:23:50.396Z", "legacy": false, "lf_version": "1.6.0", "metadata": { @@ -2248,7 +2273,7 @@ ], "frozen": false, "icon": "brain-circuit", - "last_updated": "2025-10-04T05:41:33.347Z", + "last_updated": "2025-10-06T15:23:50.343Z", "legacy": false, "lf_version": "1.6.0", "metadata": { @@ -2551,10 +2576,258 @@ }, "selected": false, "type": "genericNode" + }, + { + "data": { + "id": "MCP-7EY21", + "node": { + "base_classes": [ + "DataFrame" + ], + "beta": false, + "category": "MCP", + "conditional_paths": [], + "custom_fields": {}, + "description": "Connect to an MCP server to use its tools.", + "display_name": "MCP Tools", + "documentation": "https://docs.langflow.org/mcp-client", + "edited": false, + "field_order": [ + "mcp_server", + "use_cache", + "tool", + "tool_placeholder" + ], + "frozen": false, + "icon": "Mcp", + "key": "mcp_lf-starter_project", + "last_updated": "2025-10-06T15:23:56.578Z", + "legacy": false, + "mcpServerName": "lf-starter_project", + "metadata": { + "code_hash": "756d1e10d0ca", + "dependencies": { + "dependencies": [ + { + "name": "langchain_core", + "version": "0.3.77" + }, + { + "name": "lfx", + "version": null + }, + { + "name": "langflow", + "version": null + } + ], + "total_dependencies": 3 + }, + "module": "lfx.components.agents.mcp_component.MCPToolsComponent" + }, + "minimized": false, + "output_types": [], + "outputs": [ + { + "allows_loop": false, + "cache": true, + "display_name": "Toolset", + "group_outputs": false, + "hidden": null, + "method": "to_toolkit", + "name": "component_as_tool", + "options": null, + "required_inputs": null, + "selected": "Tool", + "tool_mode": true, + "types": [ + "Tool" + ], + "value": "__UNDEFINED__" + } + ], + "pinned": false, + "template": { + "_type": "Component", + "code": { + "advanced": true, + "dynamic": true, + "fileTypes": [], + "file_path": "", + "info": "", + "list": false, + "load_from_db": false, + "multiline": true, + "name": "code", + "password": false, + "placeholder": "", + "required": true, + "show": true, + "title_case": false, + "type": "code", + "value": "from __future__ import annotations\n\nimport asyncio\nimport uuid\nfrom typing import Any\n\nfrom langchain_core.tools import StructuredTool # noqa: TC002\n\nfrom lfx.base.agents.utils import maybe_unflatten_dict, safe_cache_get, safe_cache_set\nfrom lfx.base.mcp.util import MCPSseClient, MCPStdioClient, create_input_schema_from_json_schema, update_tools\nfrom 
lfx.custom.custom_component.component_with_cache import ComponentWithCache\nfrom lfx.inputs.inputs import InputTypes # noqa: TC001\nfrom lfx.io import BoolInput, DropdownInput, McpInput, MessageTextInput, Output\nfrom lfx.io.schema import flatten_schema, schema_to_langflow_inputs\nfrom lfx.log.logger import logger\nfrom lfx.schema.dataframe import DataFrame\nfrom lfx.schema.message import Message\nfrom lfx.services.deps import get_settings_service, get_storage_service, session_scope\n\n\nclass MCPToolsComponent(ComponentWithCache):\n schema_inputs: list = []\n tools: list[StructuredTool] = []\n _not_load_actions: bool = False\n _tool_cache: dict = {}\n _last_selected_server: str | None = None # Cache for the last selected server\n\n def __init__(self, **data) -> None:\n super().__init__(**data)\n # Initialize cache keys to avoid CacheMiss when accessing them\n self._ensure_cache_structure()\n\n # Initialize clients with access to the component cache\n self.stdio_client: MCPStdioClient = MCPStdioClient(component_cache=self._shared_component_cache)\n self.sse_client: MCPSseClient = MCPSseClient(component_cache=self._shared_component_cache)\n\n def _ensure_cache_structure(self):\n \"\"\"Ensure the cache has the required structure.\"\"\"\n # Check if servers key exists and is not CacheMiss\n servers_value = safe_cache_get(self._shared_component_cache, \"servers\")\n if servers_value is None:\n safe_cache_set(self._shared_component_cache, \"servers\", {})\n\n # Check if last_selected_server key exists and is not CacheMiss\n last_server_value = safe_cache_get(self._shared_component_cache, \"last_selected_server\")\n if last_server_value is None:\n safe_cache_set(self._shared_component_cache, \"last_selected_server\", \"\")\n\n default_keys: list[str] = [\n \"code\",\n \"_type\",\n \"tool_mode\",\n \"tool_placeholder\",\n \"mcp_server\",\n \"tool\",\n \"use_cache\",\n ]\n\n display_name = \"MCP Tools\"\n description = \"Connect to an MCP server to use its tools.\"\n documentation: str = \"https://docs.langflow.org/mcp-client\"\n icon = \"Mcp\"\n name = \"MCPTools\"\n\n inputs = [\n McpInput(\n name=\"mcp_server\",\n display_name=\"MCP Server\",\n info=\"Select the MCP Server that will be used by this component\",\n real_time_refresh=True,\n ),\n BoolInput(\n name=\"use_cache\",\n display_name=\"Use Cached Server\",\n info=(\n \"Enable caching of MCP Server and tools to improve performance. 
\"\n \"Disable to always fetch fresh tools and server updates.\"\n ),\n value=False,\n advanced=True,\n ),\n DropdownInput(\n name=\"tool\",\n display_name=\"Tool\",\n options=[],\n value=\"\",\n info=\"Select the tool to execute\",\n show=False,\n required=True,\n real_time_refresh=True,\n ),\n MessageTextInput(\n name=\"tool_placeholder\",\n display_name=\"Tool Placeholder\",\n info=\"Placeholder for the tool\",\n value=\"\",\n show=False,\n tool_mode=False,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Response\", name=\"response\", method=\"build_output\"),\n ]\n\n async def _validate_schema_inputs(self, tool_obj) -> list[InputTypes]:\n \"\"\"Validate and process schema inputs for a tool.\"\"\"\n try:\n if not tool_obj or not hasattr(tool_obj, \"args_schema\"):\n msg = \"Invalid tool object or missing input schema\"\n raise ValueError(msg)\n\n flat_schema = flatten_schema(tool_obj.args_schema.schema())\n input_schema = create_input_schema_from_json_schema(flat_schema)\n if not input_schema:\n msg = f\"Empty input schema for tool '{tool_obj.name}'\"\n raise ValueError(msg)\n\n schema_inputs = schema_to_langflow_inputs(input_schema)\n if not schema_inputs:\n msg = f\"No input parameters defined for tool '{tool_obj.name}'\"\n await logger.awarning(msg)\n return []\n\n except Exception as e:\n msg = f\"Error validating schema inputs: {e!s}\"\n await logger.aexception(msg)\n raise ValueError(msg) from e\n else:\n return schema_inputs\n\n async def update_tool_list(self, mcp_server_value=None):\n # Accepts mcp_server_value as dict {name, config} or uses self.mcp_server\n mcp_server = mcp_server_value if mcp_server_value is not None else getattr(self, \"mcp_server\", None)\n server_name = None\n server_config_from_value = None\n if isinstance(mcp_server, dict):\n server_name = mcp_server.get(\"name\")\n server_config_from_value = mcp_server.get(\"config\")\n else:\n server_name = mcp_server\n if not server_name:\n self.tools = []\n return [], {\"name\": server_name, \"config\": server_config_from_value}\n\n # Check if caching is enabled, default to False\n use_cache = getattr(self, \"use_cache\", False)\n\n # Use shared cache if available and caching is enabled\n cached = None\n if use_cache:\n servers_cache = safe_cache_get(self._shared_component_cache, \"servers\", {})\n cached = servers_cache.get(server_name) if isinstance(servers_cache, dict) else None\n\n if cached is not None:\n try:\n self.tools = cached[\"tools\"]\n self.tool_names = cached[\"tool_names\"]\n self._tool_cache = cached[\"tool_cache\"]\n server_config_from_value = cached[\"config\"]\n except (TypeError, KeyError, AttributeError) as e:\n # Handle corrupted cache data by clearing it and continuing to fetch fresh tools\n msg = f\"Unable to use cached data for MCP Server{server_name}: {e}\"\n await logger.awarning(msg)\n # Clear the corrupted cache entry\n current_servers_cache = safe_cache_get(self._shared_component_cache, \"servers\", {})\n if isinstance(current_servers_cache, dict) and server_name in current_servers_cache:\n current_servers_cache.pop(server_name)\n safe_cache_set(self._shared_component_cache, \"servers\", current_servers_cache)\n else:\n return self.tools, {\"name\": server_name, \"config\": server_config_from_value}\n\n try:\n try:\n from langflow.api.v2.mcp import get_server\n from langflow.services.database.models.user.crud import get_user_by_id\n except ImportError as e:\n msg = (\n \"Langflow MCP server functionality is not available. 
\"\n \"This feature requires the full Langflow installation.\"\n )\n raise ImportError(msg) from e\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching MCP tools.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n\n # Try to get server config from DB/API\n server_config = await get_server(\n server_name,\n current_user,\n db,\n storage_service=get_storage_service(),\n settings_service=get_settings_service(),\n )\n\n # If get_server returns empty but we have a config, use it\n if not server_config and server_config_from_value:\n server_config = server_config_from_value\n\n if not server_config:\n self.tools = []\n return [], {\"name\": server_name, \"config\": server_config}\n\n _, tool_list, tool_cache = await update_tools(\n server_name=server_name,\n server_config=server_config,\n mcp_stdio_client=self.stdio_client,\n mcp_sse_client=self.sse_client,\n )\n\n self.tool_names = [tool.name for tool in tool_list if hasattr(tool, \"name\")]\n self._tool_cache = tool_cache\n self.tools = tool_list\n\n # Cache the result only if caching is enabled\n if use_cache:\n cache_data = {\n \"tools\": tool_list,\n \"tool_names\": self.tool_names,\n \"tool_cache\": tool_cache,\n \"config\": server_config,\n }\n\n # Safely update the servers cache\n current_servers_cache = safe_cache_get(self._shared_component_cache, \"servers\", {})\n if isinstance(current_servers_cache, dict):\n current_servers_cache[server_name] = cache_data\n safe_cache_set(self._shared_component_cache, \"servers\", current_servers_cache)\n\n except (TimeoutError, asyncio.TimeoutError) as e:\n msg = f\"Timeout updating tool list: {e!s}\"\n await logger.aexception(msg)\n raise TimeoutError(msg) from e\n except Exception as e:\n msg = f\"Error updating tool list: {e!s}\"\n await logger.aexception(msg)\n raise ValueError(msg) from e\n else:\n return tool_list, {\"name\": server_name, \"config\": server_config}\n\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Toggle the visibility of connection-specific fields based on the selected mode.\"\"\"\n try:\n if field_name == \"tool\":\n try:\n if len(self.tools) == 0:\n try:\n self.tools, build_config[\"mcp_server\"][\"value\"] = await self.update_tool_list()\n build_config[\"tool\"][\"options\"] = [tool.name for tool in self.tools]\n build_config[\"tool\"][\"placeholder\"] = \"Select a tool\"\n except (TimeoutError, asyncio.TimeoutError) as e:\n msg = f\"Timeout updating tool list: {e!s}\"\n await logger.aexception(msg)\n if not build_config[\"tools_metadata\"][\"show\"]:\n build_config[\"tool\"][\"show\"] = True\n build_config[\"tool\"][\"options\"] = []\n build_config[\"tool\"][\"value\"] = \"\"\n build_config[\"tool\"][\"placeholder\"] = \"Timeout on MCP server\"\n else:\n build_config[\"tool\"][\"show\"] = False\n except ValueError:\n if not build_config[\"tools_metadata\"][\"show\"]:\n build_config[\"tool\"][\"show\"] = True\n build_config[\"tool\"][\"options\"] = []\n build_config[\"tool\"][\"value\"] = \"\"\n build_config[\"tool\"][\"placeholder\"] = \"Error on MCP Server\"\n else:\n build_config[\"tool\"][\"show\"] = False\n\n if field_value == \"\":\n return build_config\n tool_obj = None\n for tool in self.tools:\n if tool.name == field_value:\n tool_obj = tool\n break\n if tool_obj is None:\n msg = f\"Tool {field_value} not found in available tools: {self.tools}\"\n await logger.awarning(msg)\n return build_config\n await 
self._update_tool_config(build_config, field_value)\n except Exception as e:\n build_config[\"tool\"][\"options\"] = []\n msg = f\"Failed to update tools: {e!s}\"\n raise ValueError(msg) from e\n else:\n return build_config\n elif field_name == \"mcp_server\":\n if not field_value:\n build_config[\"tool\"][\"show\"] = False\n build_config[\"tool\"][\"options\"] = []\n build_config[\"tool\"][\"value\"] = \"\"\n build_config[\"tool\"][\"placeholder\"] = \"\"\n build_config[\"tool_placeholder\"][\"tool_mode\"] = False\n self.remove_non_default_keys(build_config)\n return build_config\n\n build_config[\"tool_placeholder\"][\"tool_mode\"] = True\n\n current_server_name = field_value.get(\"name\") if isinstance(field_value, dict) else field_value\n _last_selected_server = safe_cache_get(self._shared_component_cache, \"last_selected_server\", \"\")\n\n # To avoid unnecessary updates, only proceed if the server has actually changed\n if (_last_selected_server in (current_server_name, \"\")) and build_config[\"tool\"][\"show\"]:\n if current_server_name:\n servers_cache = safe_cache_get(self._shared_component_cache, \"servers\", {})\n if isinstance(servers_cache, dict):\n cached = servers_cache.get(current_server_name)\n if cached is not None and cached.get(\"tool_names\"):\n cached_tools = cached[\"tool_names\"]\n current_tools = build_config[\"tool\"][\"options\"]\n if current_tools == cached_tools:\n return build_config\n else:\n return build_config\n\n # Determine if \"Tool Mode\" is active by checking if the tool dropdown is hidden.\n is_in_tool_mode = build_config[\"tools_metadata\"][\"show\"]\n safe_cache_set(self._shared_component_cache, \"last_selected_server\", current_server_name)\n\n # Check if tools are already cached for this server before clearing\n cached_tools = None\n if current_server_name:\n use_cache = getattr(self, \"use_cache\", True)\n if use_cache:\n servers_cache = safe_cache_get(self._shared_component_cache, \"servers\", {})\n if isinstance(servers_cache, dict):\n cached = servers_cache.get(current_server_name)\n if cached is not None:\n try:\n cached_tools = cached[\"tools\"]\n self.tools = cached_tools\n self.tool_names = cached[\"tool_names\"]\n self._tool_cache = cached[\"tool_cache\"]\n except (TypeError, KeyError, AttributeError) as e:\n # Handle corrupted cache data by ignoring it\n msg = f\"Unable to use cached data for MCP Server,{current_server_name}: {e}\"\n await logger.awarning(msg)\n cached_tools = None\n\n # Only clear tools if we don't have cached tools for the current server\n if not cached_tools:\n self.tools = [] # Clear previous tools only if no cache\n\n self.remove_non_default_keys(build_config) # Clear previous tool inputs\n\n # Only show the tool dropdown if not in tool_mode\n if not is_in_tool_mode:\n build_config[\"tool\"][\"show\"] = True\n if cached_tools:\n # Use cached tools to populate options immediately\n build_config[\"tool\"][\"options\"] = [tool.name for tool in cached_tools]\n build_config[\"tool\"][\"placeholder\"] = \"Select a tool\"\n else:\n # Show loading state only when we need to fetch tools\n build_config[\"tool\"][\"placeholder\"] = \"Loading tools...\"\n build_config[\"tool\"][\"options\"] = []\n build_config[\"tool\"][\"value\"] = uuid.uuid4()\n else:\n # Keep the tool dropdown hidden if in tool_mode\n self._not_load_actions = True\n build_config[\"tool\"][\"show\"] = False\n\n elif field_name == \"tool_mode\":\n build_config[\"tool\"][\"placeholder\"] = \"\"\n build_config[\"tool\"][\"show\"] = not bool(field_value) and 
bool(build_config[\"mcp_server\"])\n self.remove_non_default_keys(build_config)\n self.tool = build_config[\"tool\"][\"value\"]\n if field_value:\n self._not_load_actions = True\n else:\n build_config[\"tool\"][\"value\"] = uuid.uuid4()\n build_config[\"tool\"][\"options\"] = []\n build_config[\"tool\"][\"show\"] = True\n build_config[\"tool\"][\"placeholder\"] = \"Loading tools...\"\n elif field_name == \"tools_metadata\":\n self._not_load_actions = False\n\n except Exception as e:\n msg = f\"Error in update_build_config: {e!s}\"\n await logger.aexception(msg)\n raise ValueError(msg) from e\n else:\n return build_config\n\n def get_inputs_for_all_tools(self, tools: list) -> dict:\n \"\"\"Get input schemas for all tools.\"\"\"\n inputs = {}\n for tool in tools:\n if not tool or not hasattr(tool, \"name\"):\n continue\n try:\n flat_schema = flatten_schema(tool.args_schema.schema())\n input_schema = create_input_schema_from_json_schema(flat_schema)\n langflow_inputs = schema_to_langflow_inputs(input_schema)\n inputs[tool.name] = langflow_inputs\n except (AttributeError, ValueError, TypeError, KeyError) as e:\n msg = f\"Error getting inputs for tool {getattr(tool, 'name', 'unknown')}: {e!s}\"\n logger.exception(msg)\n continue\n return inputs\n\n def remove_input_schema_from_build_config(\n self, build_config: dict, tool_name: str, input_schema: dict[list[InputTypes], Any]\n ):\n \"\"\"Remove the input schema for the tool from the build config.\"\"\"\n # Keep only schemas that don't belong to the current tool\n input_schema = {k: v for k, v in input_schema.items() if k != tool_name}\n # Remove all inputs from other tools\n for value in input_schema.values():\n for _input in value:\n if _input.name in build_config:\n build_config.pop(_input.name)\n\n def remove_non_default_keys(self, build_config: dict) -> None:\n \"\"\"Remove non-default keys from the build config.\"\"\"\n for key in list(build_config.keys()):\n if key not in self.default_keys:\n build_config.pop(key)\n\n async def _update_tool_config(self, build_config: dict, tool_name: str) -> None:\n \"\"\"Update tool configuration with proper error handling.\"\"\"\n if not self.tools:\n self.tools, build_config[\"mcp_server\"][\"value\"] = await self.update_tool_list()\n\n if not tool_name:\n return\n\n tool_obj = next((tool for tool in self.tools if tool.name == tool_name), None)\n if not tool_obj:\n msg = f\"Tool {tool_name} not found in available tools: {self.tools}\"\n self.remove_non_default_keys(build_config)\n build_config[\"tool\"][\"value\"] = \"\"\n await logger.awarning(msg)\n return\n\n try:\n # Store current values before removing inputs\n current_values = {}\n for key, value in build_config.items():\n if key not in self.default_keys and isinstance(value, dict) and \"value\" in value:\n current_values[key] = value[\"value\"]\n\n # Get all tool inputs and remove old ones\n input_schema_for_all_tools = self.get_inputs_for_all_tools(self.tools)\n self.remove_input_schema_from_build_config(build_config, tool_name, input_schema_for_all_tools)\n\n # Get and validate new inputs\n self.schema_inputs = await self._validate_schema_inputs(tool_obj)\n if not self.schema_inputs:\n msg = f\"No input parameters to configure for tool '{tool_name}'\"\n await logger.ainfo(msg)\n return\n\n # Add new inputs to build config\n for schema_input in self.schema_inputs:\n if not schema_input or not hasattr(schema_input, \"name\"):\n msg = \"Invalid schema input detected, skipping\"\n await logger.awarning(msg)\n continue\n\n try:\n name = 
schema_input.name\n input_dict = schema_input.to_dict()\n input_dict.setdefault(\"value\", None)\n input_dict.setdefault(\"required\", True)\n\n build_config[name] = input_dict\n\n # Preserve existing value if the parameter name exists in current_values\n if name in current_values:\n build_config[name][\"value\"] = current_values[name]\n\n except (AttributeError, KeyError, TypeError) as e:\n msg = f\"Error processing schema input {schema_input}: {e!s}\"\n await logger.aexception(msg)\n continue\n except ValueError as e:\n msg = f\"Schema validation error for tool {tool_name}: {e!s}\"\n await logger.aexception(msg)\n self.schema_inputs = []\n return\n except (AttributeError, KeyError, TypeError) as e:\n msg = f\"Error updating tool config: {e!s}\"\n await logger.aexception(msg)\n raise ValueError(msg) from e\n\n async def build_output(self) -> DataFrame:\n \"\"\"Build output with improved error handling and validation.\"\"\"\n try:\n self.tools, _ = await self.update_tool_list()\n if self.tool != \"\":\n # Set session context for persistent MCP sessions using Langflow session ID\n session_context = self._get_session_context()\n if session_context:\n self.stdio_client.set_session_context(session_context)\n self.sse_client.set_session_context(session_context)\n\n exec_tool = self._tool_cache[self.tool]\n tool_args = self.get_inputs_for_all_tools(self.tools)[self.tool]\n kwargs = {}\n for arg in tool_args:\n value = getattr(self, arg.name, None)\n if value is not None:\n if isinstance(value, Message):\n kwargs[arg.name] = value.text\n else:\n kwargs[arg.name] = value\n\n unflattened_kwargs = maybe_unflatten_dict(kwargs)\n\n output = await exec_tool.coroutine(**unflattened_kwargs)\n\n tool_content = []\n for item in output.content:\n item_dict = item.model_dump()\n tool_content.append(item_dict)\n return DataFrame(data=tool_content)\n return DataFrame(data=[{\"error\": \"You must select a tool\"}])\n except Exception as e:\n msg = f\"Error in build_output: {e!s}\"\n await logger.aexception(msg)\n raise ValueError(msg) from e\n\n def _get_session_context(self) -> str | None:\n \"\"\"Get the Langflow session ID for MCP session caching.\"\"\"\n # Try to get session ID from the component's execution context\n if hasattr(self, \"graph\") and hasattr(self.graph, \"session_id\"):\n session_id = self.graph.session_id\n # Include server name to ensure different servers get different sessions\n server_name = \"\"\n mcp_server = getattr(self, \"mcp_server\", None)\n if isinstance(mcp_server, dict):\n server_name = mcp_server.get(\"name\", \"\")\n elif mcp_server:\n server_name = str(mcp_server)\n return f\"{session_id}_{server_name}\" if session_id else None\n return None\n\n async def _get_tools(self):\n \"\"\"Get cached tools or update if necessary.\"\"\"\n mcp_server = getattr(self, \"mcp_server\", None)\n if not self._not_load_actions:\n tools, _ = await self.update_tool_list(mcp_server)\n return tools\n return []\n" + }, + "mcp_server": { + "_input_type": "McpInput", + "advanced": false, + "display_name": "MCP Server", + "dynamic": false, + "info": "Select the MCP Server that will be used by this component", + "name": "mcp_server", + "placeholder": "", + "real_time_refresh": true, + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "mcp", + "value": { + "config": { + "args": [ + "mcp-proxy", + "--headers", + "x-api-key", + "sk-lq7nQIiX4jbYTIOGH7YG9z46E0IW1i-FSvn_hkcg2xE", + 
"http://localhost:7860/api/v1/mcp/project/304fb921-38e4-4763-b223-832a3e3546e0/sse" + ], + "command": "uvx" + }, + "name": "lf-starter_project" + } + }, + "tool": { + "_input_type": "DropdownInput", + "advanced": false, + "combobox": false, + "dialog_inputs": {}, + "display_name": "Tool", + "dynamic": false, + "external_options": {}, + "info": "Select the tool to execute", + "name": "tool", + "options": [ + "opensearch_url_ingestion_flow" + ], + "options_metadata": [], + "placeholder": "", + "real_time_refresh": true, + "required": true, + "show": false, + "title_case": false, + "toggle": false, + "tool_mode": false, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "tool_placeholder": { + "_input_type": "MessageTextInput", + "advanced": false, + "display_name": "Tool Placeholder", + "dynamic": false, + "info": "Placeholder for the tool", + "input_types": [ + "Message" + ], + "list": false, + "list_add_label": "Add More", + "load_from_db": false, + "name": "tool_placeholder", + "placeholder": "", + "required": false, + "show": false, + "title_case": false, + "tool_mode": true, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "tools_metadata": { + "_input_type": "ToolsInput", + "advanced": false, + "display_name": "Actions", + "dynamic": false, + "info": "Modify tool names and descriptions to help agents understand when to use each tool.", + "is_list": true, + "list_add_label": "Add More", + "name": "tools_metadata", + "placeholder": "", + "real_time_refresh": true, + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "type": "tools", + "value": [ + { + "args": { + "input_value": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Message to be passed as input.", + "title": "Input Value" + } + }, + "description": "This flow is to ingest the URL to open search.", + "display_description": "This flow is to ingest the URL to open search.", + "display_name": "opensearch_url_ingestion_flow", + "name": "opensearch_url_ingestion_flow", + "readonly": false, + "status": true, + "tags": [ + "opensearch_url_ingestion_flow" + ] + } + ] + }, + "use_cache": { + "_input_type": "BoolInput", + "advanced": true, + "display_name": "Use Cached Server", + "dynamic": false, + "info": "Enable caching of MCP Server and tools to improve performance. 
Disable to always fetch fresh tools and server updates.", + "list": false, + "list_add_label": "Add More", + "name": "use_cache", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "type": "bool", + "value": false + } + }, + "tool_mode": true + }, + "showNode": true, + "type": "MCP" + }, + "id": "MCP-7EY21", + "measured": { + "height": 284, + "width": 320 + }, + "position": { + "x": 675.7137923419156, + "y": 878.6218422334763 + }, + "selected": false, + "type": "genericNode" } ], "viewport": { - "x": -149.48015964664273, + "x": -237.0727605845459, "y": 154.6885920024542, "zoom": 0.602433700773958 } @@ -2563,7 +2836,7 @@ "endpoint_name": null, "id": "1098eea1-6649-4e1d-aed1-b77249fb8dd0", "is_component": false, - "last_tested_version": "1.6.3.dev0", + "last_tested_version": "1.6.0", "name": "OpenRAG Open Search Agent", "tags": [ "assistants", From 42a198f96a261d06fb298106954024641bafac08 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 6 Oct 2025 16:24:15 -0400 Subject: [PATCH 025/113] Add CONNECTOR_TYPE_URL env and flow support Introduces the CONNECTOR_TYPE_URL environment variable to docker-compose files and assets, updates the OpenRAG URL ingestion flow to use it, and ensures it is set in the auth service global variables. This enables explicit configuration and handling of URL-based connectors in the OpenRAG system. --- docker-compose-cpu.yml | 1 + docker-compose.yml | 10 +++++----- flows/openrag_url_mcp.json | 23 ++++++++++++++--------- src/services/auth_service.py | 1 + src/tui/_assets/docker-compose-cpu.yml | 1 + src/tui/_assets/docker-compose.yml | 1 + 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/docker-compose-cpu.yml b/docker-compose-cpu.yml index 570bc3b8..937826a6 100644 --- a/docker-compose-cpu.yml +++ b/docker-compose-cpu.yml @@ -108,6 +108,7 @@ services: - OWNER_NAME=None - OWNER_EMAIL=None - CONNECTOR_TYPE=system + - CONNECTOR_TYPE_URL=url - OPENRAG-QUERY-FILTER="{}" - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} - FILENAME=None diff --git a/docker-compose.yml b/docker-compose.yml index b97f7cca..6a4a41b8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -43,7 +43,7 @@ services: # build: # context: . # dockerfile: Dockerfile.backend - # container_name: openrag-backend + container_name: openrag-backend depends_on: - langflow environment: @@ -78,10 +78,9 @@ services: openrag-frontend: image: phact/openrag-frontend:${OPENRAG_VERSION:-latest} - # build: - # context: . - # dockerfile: Dockerfile.frontend - #dockerfile: Dockerfile.frontend + build: + context: . 
+ dockerfile: Dockerfile.frontend container_name: openrag-frontend depends_on: - openrag-backend @@ -109,6 +108,7 @@ services: - OWNER_NAME=None - OWNER_EMAIL=None - CONNECTOR_TYPE=system + - CONNECTOR_TYPE_URL=url - OPENRAG-QUERY-FILTER="{}" - FILENAME=None - MIMETYPE=None diff --git a/flows/openrag_url_mcp.json b/flows/openrag_url_mcp.json index 69dbc85d..9cab0fed 100644 --- a/flows/openrag_url_mcp.json +++ b/flows/openrag_url_mcp.json @@ -232,6 +232,7 @@ }, { "animated": false, + "className": "", "data": { "sourceHandle": { "dataType": "EmbeddingModel", @@ -733,6 +734,10 @@ { "key": "owner_email", "value": "OWNER_EMAIL" + }, + { + "key": "connector_type", + "value": "CONNECTOR_TYPE_URL" } ] }, @@ -1808,7 +1813,7 @@ ], "frozen": false, "icon": "table", - "last_updated": "2025-10-03T20:31:36.023Z", + "last_updated": "2025-10-06T17:46:55.068Z", "legacy": false, "lf_version": "1.6.0", "metadata": { @@ -2224,7 +2229,7 @@ ], "frozen": false, "icon": "table", - "last_updated": "2025-10-03T20:31:36.025Z", + "last_updated": "2025-10-06T17:46:55.069Z", "legacy": false, "lf_version": "1.6.0", "metadata": { @@ -2897,7 +2902,7 @@ ], "frozen": false, "icon": "table", - "last_updated": "2025-10-03T20:31:36.026Z", + "last_updated": "2025-10-06T17:46:55.069Z", "legacy": false, "metadata": { "code_hash": "b4d6b19b6eef", @@ -3310,7 +3315,7 @@ ], "frozen": false, "icon": "binary", - "last_updated": "2025-10-03T20:31:47.177Z", + "last_updated": "2025-10-06T17:46:54.996Z", "legacy": false, "metadata": { "code_hash": "8607e963fdef", @@ -3595,17 +3600,17 @@ } ], "viewport": { - "x": -407.1633937626607, - "y": -577.5291936220412, - "zoom": 0.5347553210574026 + "x": -538.2311610019549, + "y": -337.3313239657308, + "zoom": 0.45546556043892106 } }, "description": "This flow is to ingest the URL to open search.", "endpoint_name": null, - "mcp_enabled": true, "id": "72c3d17c-2dac-4a73-b48a-6518473d7830", + "mcp_enabled": true, "is_component": false, - "last_tested_version": "1.6.0", + "last_tested_version": "1.6.3.dev1", "name": "OpenSearch URL Ingestion Flow", "tags": [ "openai", diff --git a/src/services/auth_service.py b/src/services/auth_service.py index 6b19f77a..ab33f035 100644 --- a/src/services/auth_service.py +++ b/src/services/auth_service.py @@ -296,6 +296,7 @@ class AuthService: try: if self.langflow_mcp_service and isinstance(jwt_token, str) and jwt_token.strip(): global_vars = {"JWT": jwt_token} + global_vars["CONNECTOR_TYPE_URL"] = "url" if user_info: if user_info.get("id"): global_vars["OWNER"] = user_info.get("id") diff --git a/src/tui/_assets/docker-compose-cpu.yml b/src/tui/_assets/docker-compose-cpu.yml index 1086737b..4a1125f8 100644 --- a/src/tui/_assets/docker-compose-cpu.yml +++ b/src/tui/_assets/docker-compose-cpu.yml @@ -105,6 +105,7 @@ services: - OWNER_NAME=None - OWNER_EMAIL=None - CONNECTOR_TYPE=system + - CONNECTOR_TYPE_URL=url - OPENRAG-QUERY-FILTER="{}" - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} - FILENAME=None diff --git a/src/tui/_assets/docker-compose.yml b/src/tui/_assets/docker-compose.yml index 32b72c65..6cac6506 100644 --- a/src/tui/_assets/docker-compose.yml +++ b/src/tui/_assets/docker-compose.yml @@ -105,6 +105,7 @@ services: - OWNER_NAME=None - OWNER_EMAIL=None - CONNECTOR_TYPE=system + - CONNECTOR_TYPE_URL=url - OPENRAG-QUERY-FILTER="{}" - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} - FILENAME=None From 311f3c0fede49b2ce40c8fa356034d05509fdf68 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 6 Oct 2025 21:16:56 -0400 Subject: [PATCH 026/113] Update base image to 
langflow-nightly:1.6.3.dev1

Dockerfile now uses the newer langflow-nightly:1.6.3.dev1 image instead of
1.6.3.dev0 to ensure the latest updates and fixes are included.
---
 Dockerfile.langflow | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.langflow b/Dockerfile.langflow
index 71baf447..bdae1f70 100644
--- a/Dockerfile.langflow
+++ b/Dockerfile.langflow
@@ -1,4 +1,4 @@
-FROM langflowai/langflow-nightly:1.6.3.dev0
+FROM langflowai/langflow-nightly:1.6.3.dev1

 EXPOSE 7860

From 08f441c0e5f5f34343de65d3f843ae9031161fe8 Mon Sep 17 00:00:00 2001
From: phact
Date: Mon, 6 Oct 2025 21:22:44 -0400
Subject: [PATCH 027/113] backend fix

---
 src/api/docling.py | 104 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 src/api/docling.py

diff --git a/src/api/docling.py b/src/api/docling.py
new file mode 100644
index 00000000..66b7777e
--- /dev/null
+++ b/src/api/docling.py
@@ -0,0 +1,104 @@
+"""Docling service proxy endpoints."""
+
+import socket
+import struct
+from pathlib import Path
+
+import httpx
+from starlette.requests import Request
+from starlette.responses import JSONResponse
+
+from utils.container_utils import (
+    detect_container_environment,
+    get_container_host,
+    guess_host_ip_for_containers,
+)
+from utils.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+def _get_gateway_ip_from_route() -> str | None:
+    """Return the default gateway IP visible from the current network namespace."""
+    try:
+        with Path("/proc/net/route").open() as route_table:
+            next(route_table)  # Skip header
+            for line in route_table:
+                fields = line.strip().split()
+                min_fields = 3  # interface, destination, gateway
+                if len(fields) >= min_fields and fields[1] == "00000000":
+                    gateway_hex = fields[2]
+                    gw_int = int(gateway_hex, 16)
+                    gateway_ip = socket.inet_ntoa(struct.pack("<I", gw_int))
+                    return gateway_ip
+    except (OSError, ValueError):
+        pass
+    return None
+
+
+def determine_docling_host() -> str:
+    """Determine the host address used for docling health checks."""
+    container_type = detect_container_environment()
+    if container_type:
+        container_host = get_container_host()
+        if container_host:
+            logger.info("Using container-aware host '%s'", container_host)
+            return container_host
+
+    gateway_ip = _get_gateway_ip_from_route()
+    if gateway_ip:
+        logger.info("Detected host gateway IP: %s", gateway_ip)
+        return gateway_ip
+
+    # Either we're not inside a container or gateway detection failed.
+    fallback_ip = guess_host_ip_for_containers(logger=logger)
+    if container_type:
+        logger.info("Falling back to container bridge host %s", fallback_ip)
+    else:
+        logger.info("Running outside a container; using host %s", fallback_ip)
+    return fallback_ip
+
+
+# Detect the host IP once at startup
+HOST_IP = determine_docling_host()
+DOCLING_SERVICE_URL = f"http://{HOST_IP}:5001"
+
+
+async def health(request: Request) -> JSONResponse:
+    """
+    Proxy health check to docling-serve.
+    This allows the frontend to check docling status via same-origin request.
+ """ + try: + async with httpx.AsyncClient() as client: + response = await client.get( + f"{DOCLING_SERVICE_URL}/health", + timeout=2.0 + ) + + if response.status_code == 200: + return JSONResponse({ + "status": "healthy", + "host": HOST_IP + }) + else: + return JSONResponse({ + "status": "unhealthy", + "message": f"Health check failed with status: {response.status_code}", + "host": HOST_IP + }, status_code=503) + + except httpx.TimeoutException: + return JSONResponse({ + "status": "unhealthy", + "message": "Connection timeout", + "host": HOST_IP + }, status_code=503) + except Exception as e: + logger.error("Docling health check failed", error=str(e)) + return JSONResponse({ + "status": "unhealthy", + "message": str(e), + "host": HOST_IP + }, status_code=503) From 071a861a437ab3b10997cb273004b8c3a9aaa040 Mon Sep 17 00:00:00 2001 From: phact Date: Mon, 6 Oct 2025 21:26:59 -0400 Subject: [PATCH 028/113] linter --- frontend/src/app/api/queries/useGetNudgesQuery.ts | 3 +-- frontend/src/app/connectors/page.tsx | 1 + frontend/src/app/knowledge/chunks/page.tsx | 14 +++++++------- frontend/src/app/settings/page.tsx | 1 + .../src/components/cloud-connectors-dialog.tsx | 1 + frontend/src/contexts/task-context.tsx | 1 + 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/frontend/src/app/api/queries/useGetNudgesQuery.ts b/frontend/src/app/api/queries/useGetNudgesQuery.ts index 2e313e0c..cf6450db 100644 --- a/frontend/src/app/api/queries/useGetNudgesQuery.ts +++ b/frontend/src/app/api/queries/useGetNudgesQuery.ts @@ -6,8 +6,7 @@ import { type Nudge = string; -const DEFAULT_NUDGES = [ -]; +const DEFAULT_NUDGES: Nudge[] = []; export const useGetNudgesQuery = ( chatId?: string | null, diff --git a/frontend/src/app/connectors/page.tsx b/frontend/src/app/connectors/page.tsx index ad70ec90..06aa0265 100644 --- a/frontend/src/app/connectors/page.tsx +++ b/frontend/src/app/connectors/page.tsx @@ -92,6 +92,7 @@ export default function ConnectorsPage() { selectedFiles={selectedFiles} isAuthenticated={false} // This would come from auth context in real usage accessToken={undefined} // This would come from connected account + isIngesting={isSyncing} /> diff --git a/frontend/src/app/knowledge/chunks/page.tsx b/frontend/src/app/knowledge/chunks/page.tsx index 080120cc..6da3dc5e 100644 --- a/frontend/src/app/knowledge/chunks/page.tsx +++ b/frontend/src/app/knowledge/chunks/page.tsx @@ -83,13 +83,13 @@ function ChunksPageContent() { }, [data, filename]); // Set selected state for all checkboxes when selectAll changes - useEffect(() => { - if (selectAll) { - setSelectedChunks(new Set(chunks.map((_, index) => index))); - } else { - setSelectedChunks(new Set()); - } - }, [selectAll, setSelectedChunks, chunks]); + // useEffect(() => { + // if (selectAll) { + // setSelectedChunks(new Set(chunks.map((_, index) => index))); + // } else { + // setSelectedChunks(new Set()); + // } + // }, [selectAll, setSelectedChunks, chunks]); const handleBack = useCallback(() => { router.push("/knowledge"); diff --git a/frontend/src/app/settings/page.tsx b/frontend/src/app/settings/page.tsx index 6fe74c4c..148da3bd 100644 --- a/frontend/src/app/settings/page.tsx +++ b/frontend/src/app/settings/page.tsx @@ -85,6 +85,7 @@ interface Connector { connectionId?: string; access_token?: string; selectedFiles?: GoogleDriveFile[] | OneDriveFile[]; + available?: boolean; } interface SyncResult { diff --git a/frontend/src/components/cloud-connectors-dialog.tsx b/frontend/src/components/cloud-connectors-dialog.tsx index d38cf44f..077582bf 
From 071a861a437ab3b10997cb273004b8c3a9aaa040 Mon Sep 17 00:00:00 2001 From: phact Date: Mon, 6 Oct 2025 21:26:59 -0400 Subject: [PATCH 028/113] linter --- frontend/src/app/api/queries/useGetNudgesQuery.ts | 3 +-- frontend/src/app/connectors/page.tsx | 1 + frontend/src/app/knowledge/chunks/page.tsx | 14 +++++++------- frontend/src/app/settings/page.tsx | 1 + .../src/components/cloud-connectors-dialog.tsx | 1 + frontend/src/contexts/task-context.tsx | 1 + 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/frontend/src/app/api/queries/useGetNudgesQuery.ts b/frontend/src/app/api/queries/useGetNudgesQuery.ts index 2e313e0c..cf6450db 100644 --- a/frontend/src/app/api/queries/useGetNudgesQuery.ts +++ b/frontend/src/app/api/queries/useGetNudgesQuery.ts @@ -6,8 +6,7 @@ import { type Nudge = string; -const DEFAULT_NUDGES = [ -]; +const DEFAULT_NUDGES: Nudge[] = []; export const useGetNudgesQuery = ( chatId?: string | null, diff --git a/frontend/src/app/connectors/page.tsx b/frontend/src/app/connectors/page.tsx index ad70ec90..06aa0265 100644 --- a/frontend/src/app/connectors/page.tsx +++ b/frontend/src/app/connectors/page.tsx @@ -92,6 +92,7 @@ export default function ConnectorsPage() { selectedFiles={selectedFiles} isAuthenticated={false} // This would come from auth context in real usage accessToken={undefined} // This would come from connected account + isIngesting={isSyncing} /> diff --git a/frontend/src/app/knowledge/chunks/page.tsx b/frontend/src/app/knowledge/chunks/page.tsx index 080120cc..6da3dc5e 100644 --- a/frontend/src/app/knowledge/chunks/page.tsx +++ b/frontend/src/app/knowledge/chunks/page.tsx @@ -83,13 +83,13 @@ function ChunksPageContent() { }, [data, filename]); // Set selected state for all checkboxes when selectAll changes - useEffect(() => { - if (selectAll) { - setSelectedChunks(new Set(chunks.map((_, index) => index))); - } else { - setSelectedChunks(new Set()); - } - }, [selectAll, setSelectedChunks, chunks]); + // useEffect(() => { + // if (selectAll) { + // setSelectedChunks(new Set(chunks.map((_, index) => index))); + // } else { + // setSelectedChunks(new Set()); + // } + // }, [selectAll, setSelectedChunks, chunks]); const handleBack = useCallback(() => { router.push("/knowledge"); diff --git a/frontend/src/app/settings/page.tsx b/frontend/src/app/settings/page.tsx index 6fe74c4c..148da3bd 100644 --- a/frontend/src/app/settings/page.tsx +++ b/frontend/src/app/settings/page.tsx @@ -85,6 +85,7 @@ interface Connector { connectionId?: string; access_token?: string; selectedFiles?: GoogleDriveFile[] | OneDriveFile[]; + available?: boolean; } interface SyncResult { diff --git a/frontend/src/components/cloud-connectors-dialog.tsx b/frontend/src/components/cloud-connectors-dialog.tsx index d38cf44f..077582bf 100644 --- a/frontend/src/components/cloud-connectors-dialog.tsx +++ b/frontend/src/components/cloud-connectors-dialog.tsx @@ -283,6 +283,7 @@ export function CloudConnectorsDialog({ accessToken={connectorAccessTokens[connector.type]} onPickerStateChange={() => {}} clientId={connector.clientId} + isIngesting={false} /> ); diff --git a/frontend/src/contexts/task-context.tsx b/frontend/src/contexts/task-context.tsx index 12ad3c24..9b3d9908 100644 --- a/frontend/src/contexts/task-context.tsx +++ b/frontend/src/contexts/task-context.tsx @@ -19,6 +19,7 @@ import { import { useAuth } from "@/contexts/auth-context"; // Task interface is now imported from useGetTasksQuery +export type { Task }; export interface TaskFile { filename: string; From 681c9437cef24764d275d4fae02c5c59e7db3964 Mon Sep 17 00:00:00 2001 From: phact Date: Mon, 6 Oct 2025 22:06:50 -0400 Subject: [PATCH 029/113] copy [folders vs files] uploading one file -> one item --- frontend/src/app/upload/[provider]/page.tsx | 12 ++++++------ frontend/src/components/cloud-connectors-dialog.tsx | 4 ++-- frontend/src/components/cloud-picker/file-list.tsx | 2 +- .../src/components/cloud-picker/picker-header.tsx | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/frontend/src/app/upload/[provider]/page.tsx b/frontend/src/app/upload/[provider]/page.tsx index 10b9b0e5..b144106d 100644 --- a/frontend/src/app/upload/[provider]/page.tsx +++ b/frontend/src/app/upload/[provider]/page.tsx @@ -165,7 +165,7 @@ export default function UploadProviderPage() { const handleFileSelected = (files: CloudFile[]) => { setSelectedFiles(files); - console.log(`Selected ${files.length} files from ${provider}:`, files); + console.log(`Selected ${files.length} item(s) from ${provider}:`, files); // You can add additional handling here like triggering sync, etc. }; @@ -376,19 +376,19 @@ export default function UploadProviderPage() { loading={isIngesting} disabled={!hasSelectedFiles || isIngesting} > - {!hasSelectedFiles ? ( <>Ingest files - ) : ( + {hasSelectedFiles ? ( <> - Ingest {selectedFiles.length} file + Ingest {selectedFiles.length} item {selectedFiles.length > 1 ? "s" : ""} + ) : ( + <>Ingest selected items )} {!hasSelectedFiles ? ( - Select at least one file before ingesting + Select at least one item before ingesting ) : null} diff --git a/frontend/src/components/cloud-connectors-dialog.tsx b/frontend/src/components/cloud-connectors-dialog.tsx index 077582bf..ee7dfbbe 100644 --- a/frontend/src/components/cloud-connectors-dialog.tsx +++ b/frontend/src/components/cloud-connectors-dialog.tsx @@ -201,7 +201,7 @@ export function CloudConnectorsDialog({ Cloud File Connectors - Select files from your connected cloud storage providers + Select files or folders from your connected cloud storage providers @@ -232,7 +232,7 @@ export function CloudConnectorsDialog({ !connector.hasAccessToken ? connector.accessTokenError || "Access token required - try reconnecting your account" - : `Select files from ${connector.name}` + : `Select files or folders from ${connector.name}` } onClick={e => { e.preventDefault(); diff --git a/frontend/src/components/cloud-picker/file-list.tsx b/frontend/src/components/cloud-picker/file-list.tsx index 7033fcf8..8cf2b728 100644 --- a/frontend/src/components/cloud-picker/file-list.tsx +++ b/frontend/src/components/cloud-picker/file-list.tsx @@ -26,7 +26,7 @@ export const FileList = ({ return (
-

Added files ({files.length})

+

Selected items ({files.length})

); } @@ -48,7 +48,7 @@ export const PickerHeader = ({

- Select files from {getProviderName(provider)} to ingest. + Select files or folders from {getProviderName(provider)} to ingest.

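The next patch quotes OWNER_NAME before it is stored as a Langflow global variable and later forwarded in request headers. As a rough illustration of the failure mode it guards against, here is a hypothetical downstream consumer that tokenizes on whitespace (shlex stands in for any such parser; the name and the extra token are invented):

import shlex

unquoted = 'Ada Lovelace owner_email=ada@example.com'
quoted = '"Ada Lovelace" owner_email=ada@example.com'
print(shlex.split(unquoted)[0])  # 'Ada' - a multi-word name gets truncated
print(shlex.split(quoted)[0])    # 'Ada Lovelace' - quoting keeps it one token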
From e7799e1a2c4e027c84f270987fe13e3246edb767 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 6 Oct 2025 23:15:13 -0400 Subject: [PATCH 030/113] Quote OWNER_NAME to handle spaces in headers OWNER_NAME is now wrapped in double quotes to prevent issues with spaces and special characters when used in headers. This change improves reliability when passing user names containing spaces. --- src/services/auth_service.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/services/auth_service.py b/src/services/auth_service.py index ab33f035..f58997ac 100644 --- a/src/services/auth_service.py +++ b/src/services/auth_service.py @@ -301,7 +301,11 @@ class AuthService: if user_info.get("id"): global_vars["OWNER"] = user_info.get("id") if user_info.get("name"): - global_vars["OWNER_NAME"] = user_info.get("name") + # OWNER_NAME may contain spaces, which can cause issues in headers. + # Alternative: URL-encode the owner name to preserve spaces and special characters. + owner_name = user_info.get("name") + if owner_name: + global_vars["OWNER_NAME"] = str(f"\"{owner_name}\"") if user_info.get("email"): global_vars["OWNER_EMAIL"] = user_info.get("email") From eb1acde7df9673f6d4fc835e294eecc20aa9e39f Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 6 Oct 2025 23:16:32 -0400 Subject: [PATCH 031/113] update the docker compose --- docker-compose.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 6a4a41b8..4a68d210 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -78,9 +78,9 @@ services: openrag-frontend: image: phact/openrag-frontend:${OPENRAG_VERSION:-latest} - build: - context: . - dockerfile: Dockerfile.frontend + # build: + # context: . + # dockerfile: Dockerfile.frontend container_name: openrag-frontend depends_on: - openrag-backend From 03297a1f877aeefc420caa10db76b29c0f01e690 Mon Sep 17 00:00:00 2001 From: phact Date: Mon, 6 Oct 2025 23:21:09 -0400 Subject: [PATCH 032/113] 0.1.15 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index be8d359c..ee0143f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openrag" -version = "0.1.14.dev3" +version = "0.1.15" description = "Add your description here" readme = "README.md" requires-python = ">=3.13" From a0a93e96ce1df35a40f5283f851e24e58c13b3b2 Mon Sep 17 00:00:00 2001 From: phact Date: Mon, 6 Oct 2025 23:36:37 -0400 Subject: [PATCH 033/113] lint types fix --- frontend/src/components/cloud-picker/types.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/frontend/src/components/cloud-picker/types.ts b/frontend/src/components/cloud-picker/types.ts index 20b1eae0..85ce83a9 100644 --- a/frontend/src/components/cloud-picker/types.ts +++ b/frontend/src/components/cloud-picker/types.ts @@ -53,6 +53,7 @@ declare global { load: (callback: () => void) => void; }; PickerBuilder: new () => GooglePickerBuilder; + DocsView: new () => GoogleDocsView; ViewId: { DOCS: string; FOLDERS: string; @@ -83,8 +84,13 @@ declare global { } } +export interface GoogleDocsView { + setIncludeFolders: (include: boolean) => GoogleDocsView; + setSelectFolderEnabled: (enabled: boolean) => GoogleDocsView; +} + export interface GooglePickerBuilder { - addView: (view: string) => GooglePickerBuilder; + addView: (view: GoogleDocsView | string) => GooglePickerBuilder; setOAuthToken: (token: string) => GooglePickerBuilder; setCallback: ( callback: (data: 
GooglePickerData) => void From 8ee1011562721c4d5e6269174515285df6dc0799 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 01:32:56 -0400 Subject: [PATCH 034/113] unnecessary arg --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index a09d2488..732eee1f 100644 --- a/src/main.py +++ b/src/main.py @@ -131,7 +131,7 @@ async def configure_alerting_security(): # Don't fail startup if alerting config fails -async def _ensure_opensearch_index(self): +async def _ensure_opensearch_index(): """Ensure OpenSearch index exists when using traditional connector service.""" try: # Check if index already exists From 31e49106fa9aeaa53c63134fa3879739c7bf5151 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 01:34:01 -0400 Subject: [PATCH 035/113] dotenv override=False --- src/config/settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/config/settings.py b/src/config/settings.py index 6f55520d..d5a0bcac 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -13,8 +13,8 @@ from utils.container_utils import get_container_host from utils.document_processing import create_document_converter from utils.logging_config import get_logger -load_dotenv() -load_dotenv("../") +load_dotenv(override=False) +load_dotenv("../", override=False) logger = get_logger(__name__) From 65590f2a60a432878f5222fbb9b6bc7aaac01d50 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 01:34:17 -0400 Subject: [PATCH 036/113] test-ci makefile with docling-serve --- Makefile | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index c24cce0b..47d61705 100644 --- a/Makefile +++ b/Makefile @@ -192,19 +192,26 @@ test-integration: # CI-friendly integration test target: brings up infra, waits, runs tests, tears down test-ci: @set -e; \ - echo "📦 Installing test dependencies..."; \ + echo "Installing test dependencies..."; \ uv sync --group dev; \ - echo "🚀 Starting infra (OpenSearch + Dashboards + Langflow) with CPU containers"; \ + echo "Starting infra (OpenSearch + Dashboards + Langflow) with CPU containers"; \ docker compose -f docker-compose-cpu.yml up -d opensearch dashboards langflow; \ + echo "Starting docling-serve..."; \ + DOCLING_ENDPOINT=$$(uv run python scripts/docling_ctl.py start --port 5001 | grep "Endpoint:" | awk '{print $$2}'); \ + echo "Docling-serve started at $$DOCLING_ENDPOINT"; \ - echo "⏳ Waiting for OpenSearch..."; \ + echo "Waiting for OpenSearch..."; \ for i in $$(seq 1 60); do \ curl -k -s https://localhost:9200 -u admin:$${OPENSEARCH_PASSWORD} >/dev/null 2>&1 && break || sleep 2; \ done; \ - echo "⏳ Waiting for Langflow..."; \ + echo "Waiting for Langflow..."; \ for i in $$(seq 1 60); do \ curl -s http://localhost:7860/ >/dev/null 2>&1 && break || sleep 2; \ done; \ + echo "Waiting for docling-serve at $$DOCLING_ENDPOINT..."; \ + for i in $$(seq 1 60); do \ + curl -s $${DOCLING_ENDPOINT}/health >/dev/null 2>&1 && break || sleep 2; \ + done; \ - echo "🧪 Running integration tests"; \ + echo "Running integration tests"; \ LOG_LEVEL=$${LOG_LEVEL:-DEBUG} \ GOOGLE_OAUTH_CLIENT_ID="" \ GOOGLE_OAUTH_CLIENT_SECRET="" \ OPENSEARCH_USERNAME=admin OPENSEARCH_PASSWORD=$${OPENSEARCH_PASSWORD} \ DISABLE_STARTUP_INGEST=$${DISABLE_STARTUP_INGEST:-true} \ uv run pytest tests/integration -vv -s -o log_cli=true --log-cli-level=DEBUG; \ - echo "🧹 Tearing down infra"; \ + echo "Tearing down infra"; \ uv run python 
scripts/docling_ctl.py stop || true; \ docker compose down -v || true lint: From adadb6ef0a7f99330e19c792f124b2f312f9de50 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 01:34:38 -0400 Subject: [PATCH 037/113] docling-ctl for test-ci makefile --- scripts/docling_ctl.py | 91 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 scripts/docling_ctl.py diff --git a/scripts/docling_ctl.py b/scripts/docling_ctl.py new file mode 100644 index 00000000..8dc5c879 --- /dev/null +++ b/scripts/docling_ctl.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Helper script to control docling-serve using DoclingManager for CI/testing.""" + +import sys +import asyncio +import argparse +from pathlib import Path + +# Add src to path so we can import DoclingManager +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from tui.managers.docling_manager import DoclingManager + + +async def start_docling(port: int = 5001, host: str = None, enable_ui: bool = False): + """Start docling-serve.""" + manager = DoclingManager() + + if manager.is_running(): + print(f"Docling-serve is already running") + status = manager.get_status() + print(f"Endpoint: {status['endpoint']}") + return 0 + + host_msg = f"{host}:{port}" if host else f"auto-detected host:{port}" + print(f"Starting docling-serve on {host_msg}...") + success, message = await manager.start(port=port, host=host, enable_ui=enable_ui) + + if success: + print(f"{message}") + status = manager.get_status() + print(f"Endpoint: {status['endpoint']}") + print(f"PID: {status['pid']}") + return 0 + else: + print(f"{message}", file=sys.stderr) + return 1 + + +async def stop_docling(): + """Stop docling-serve.""" + manager = DoclingManager() + + if not manager.is_running(): + print("Docling-serve is not running") + return 0 + + print("Stopping docling-serve...") + success, message = await manager.stop() + + if success: + print(f"{message}") + return 0 + else: + print(f"{message}", file=sys.stderr) + return 1 + + +async def status_docling(): + """Get docling-serve status.""" + manager = DoclingManager() + status = manager.get_status() + + print(f"Status: {status['status']}") + if status['status'] == 'running': + print(f"Endpoint: {status['endpoint']}") + print(f"Docs: {status['docs_url']}") + print(f"PID: {status['pid']}") + + return 0 if status['status'] == 'running' else 1 + + +async def main(): + parser = argparse.ArgumentParser(description="Control docling-serve for CI/testing") + parser.add_argument("command", choices=["start", "stop", "status"], help="Command to run") + parser.add_argument("--port", type=int, default=5001, help="Port to run on (default: 5001)") + parser.add_argument("--host", default=None, help="Host to bind to (default: auto-detect for containers)") + parser.add_argument("--enable-ui", action="store_true", help="Enable UI") + + args = parser.parse_args() + + if args.command == "start": + return await start_docling(port=args.port, host=args.host if args.host else None, enable_ui=args.enable_ui) + elif args.command == "stop": + return await stop_docling() + elif args.command == "status": + return await status_docling() + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) From ad890ef2bcfd3bbec4e34fd655e72bf5f82993db Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 01:35:20 -0400 Subject: [PATCH 038/113] index creation text fix --- tests/conftest.py | 6 +++--- tests/integration/test_api_endpoints.py | 14 ++++++++++++-- tests/integration/test_startup_ingest.py | 5 +++++ 3 
files changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 87722481..27a6f750 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,10 +10,10 @@ from dotenv import load_dotenv # Load environment variables load_dotenv() -# Force no-auth mode for testing by removing OAuth credentials +# Force no-auth mode for testing by setting OAuth credentials to empty strings # This ensures anonymous JWT tokens are created automatically -os.environ.pop('GOOGLE_OAUTH_CLIENT_ID', None) -os.environ.pop('GOOGLE_OAUTH_CLIENT_SECRET', None) +os.environ['GOOGLE_OAUTH_CLIENT_ID'] = '' +os.environ['GOOGLE_OAUTH_CLIENT_SECRET'] = '' from src.config.settings import clients from src.session_manager import SessionManager diff --git a/tests/integration/test_api_endpoints.py b/tests/integration/test_api_endpoints.py index 60810563..20f57d55 100644 --- a/tests/integration/test_api_endpoints.py +++ b/tests/integration/test_api_endpoints.py @@ -50,6 +50,7 @@ async def test_upload_and_search_endpoint(tmp_path: Path, disable_langflow_inges "src.api.router", "src.api.connector_router", "src.config.settings", + "src.auth_middleware", "src.main", ]: sys.modules.pop(mod, None) @@ -68,7 +69,11 @@ app = await create_app() # Manually run startup tasks since httpx ASGI transport here doesn't manage lifespan await startup_tasks(app.state.services) - + + # Ensure index exists for tests (startup_tasks only creates it if DISABLE_INGEST_WITH_LANGFLOW=True) + from src.main import _ensure_opensearch_index + await _ensure_opensearch_index() + # Verify index is truly empty after startup try: count_response = await clients.opensearch.count(index=INDEX_NAME) @@ -159,6 +164,7 @@ async def test_router_upload_ingest_traditional(tmp_path: Path, disable_langflow "src.api.router", "src.api.connector_router", "src.config.settings", + "src.auth_middleware", "src.main", ]: sys.modules.pop(mod, None) @@ -176,7 +182,11 @@ app = await create_app() await startup_tasks(app.state.services) - + + # Ensure index exists for tests (startup_tasks only creates it if DISABLE_INGEST_WITH_LANGFLOW=True) + from src.main import _ensure_opensearch_index + await _ensure_opensearch_index() + # Verify index is truly empty after startup try: count_response = await clients.opensearch.count(index=INDEX_NAME) diff --git a/tests/integration/test_startup_ingest.py b/tests/integration/test_startup_ingest.py index 436c4d28..b2243b33 100644 --- a/tests/integration/test_startup_ingest.py +++ b/tests/integration/test_startup_ingest.py @@ -51,6 +51,7 @@ async def test_startup_ingest_creates_task(disable_langflow_ingest: bool): "src.api.router", "src.api.connector_router", "src.config.settings", + "src.auth_middleware", "src.main", ]: sys.modules.pop(mod, None) @@ -69,6 +70,10 @@ # Trigger startup tasks explicitly await startup_tasks(app.state.services) + # Ensure index exists for tests (startup_tasks only creates it if DISABLE_INGEST_WITH_LANGFLOW=True) + from src.main import _ensure_opensearch_index + await _ensure_opensearch_index() + transport = httpx.ASGITransport(app=app) try: async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client:
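The next patch keeps the upload's original extension when the file is spooled to a temporary path, because docling infers the input format from the suffix. A minimal sketch of the same suffix handling using only the standard library (auto_cleanup_tempfile is the project's helper; tempfile stands in for it here):

import os
import tempfile

filename = "report.pdf"                       # hypothetical upload name
suffix = os.path.splitext(filename)[1] or ""  # ".pdf"; empty when there is no extension
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
    print(tmp.name)  # e.g. /tmp/tmpab12cd34.pdf - the extension survives
os.unlink(tmp.name)  # the real helper cleans up automatically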
From 330b16ae06e9e6e9ed35f5c89c8980e9a9b0bd92 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 02:00:57 -0400 Subject: [PATCH 039/113] preserve file name for upload --- src/services/document_service.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/services/document_service.py b/src/services/document_service.py index 5204ea0e..d596fb25 100644 --- a/src/services/document_service.py +++ b/src/services/document_service.py @@ -126,7 +126,11 @@ class DocumentService: from utils.file_utils import auto_cleanup_tempfile import os - with auto_cleanup_tempfile() as tmp_path: + # Preserve file extension for docling format detection + filename = upload_file.filename or "uploaded" + suffix = os.path.splitext(filename)[1] or "" + + with auto_cleanup_tempfile(suffix=suffix) as tmp_path: # Stream upload file to temporary file file_size = 0 with open(tmp_path, 'wb') as tmp_file: From 5e48d7b791b88dc5bec3312b16d1f1e598d0ccc4 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 02:07:06 -0400 Subject: [PATCH 040/113] trace logging --- src/auth_middleware.py | 4 ++-- src/config/settings.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/auth_middleware.py b/src/auth_middleware.py index 44d1b2f0..45333c2f 100644 --- a/src/auth_middleware.py +++ b/src/auth_middleware.py @@ -28,7 +28,7 @@ def require_auth(session_manager): async def wrapper(request: Request): # In no-auth mode, bypass authentication entirely if is_no_auth_mode(): - logger.debug("No-auth mode: Creating anonymous user") + logger.trace("No-auth mode: Creating anonymous user") # Create an anonymous user object so endpoints don't break from session_manager import User from datetime import datetime from session_manager import AnonymousUser request.state.user = AnonymousUser() request.state.jwt_token = None # No JWT in no-auth mode - logger.debug("Set user_id=anonymous, jwt_token=None") + logger.trace("Set user_id=anonymous, jwt_token=None") return await handler(request) user = get_current_user(request, 
session_manager) diff --git a/src/config/settings.py b/src/config/settings.py index 6e4581dd..598ccfb2 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -61,12 +61,6 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv( def is_no_auth_mode(): """Check if we're running in no-auth mode (OAuth credentials missing)""" result = not (GOOGLE_OAUTH_CLIENT_ID and GOOGLE_OAUTH_CLIENT_SECRET) - logger.trace( - "Checking auth mode", - no_auth_mode=result, - has_client_id=GOOGLE_OAUTH_CLIENT_ID is not None, - has_client_secret=GOOGLE_OAUTH_CLIENT_SECRET is not None, - ) return result From 3efcbfd36476094400fadf8fcf2f12d901ed8418 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 03:56:02 -0400 Subject: [PATCH 042/113] fix tests --- tests/integration/test_api_endpoints.py | 48 +++- uv.lock | 368 +++++++++++++++++++----- 2 files changed, 337 insertions(+), 79 deletions(-) diff --git a/tests/integration/test_api_endpoints.py b/tests/integration/test_api_endpoints.py index 20f57d55..fa36dc8b 100644 --- a/tests/integration/test_api_endpoints.py +++ b/tests/integration/test_api_endpoints.py @@ -18,14 +18,20 @@ async def wait_for_service_ready(client: httpx.AsyncClient, timeout_s: float = 3 while asyncio.get_event_loop().time() < deadline: try: r1 = await client.get("/auth/me") + if r1.status_code in (401, 403): + raise AssertionError(f"/auth/me returned {r1.status_code}: {r1.text}") if r1.status_code != 200: await asyncio.sleep(0.5) continue # match_all readiness probe; no embeddings r2 = await client.post("/search", json={"query": "*", "limit": 0}) + if r2.status_code in (401, 403): + raise AssertionError(f"/search returned {r2.status_code}: {r2.text}") if r2.status_code == 200: return last_err = r2.text + except AssertionError: + raise except Exception as e: last_err = str(e) await asyncio.sleep(0.5) @@ -48,14 +54,24 @@ async def test_upload_and_search_endpoint(tmp_path: Path, disable_langflow_inges # Clear cached modules so settings pick up env and router sees new flag for mod in [ "src.api.router", + "api.router", # Also clear the non-src path "src.api.connector_router", + "api.connector_router", "src.config.settings", + "config.settings", "src.auth_middleware", + "auth_middleware", "src.main", + "api", # Clear the api package itself + "src.api", ]: sys.modules.pop(mod, None) from src.main import create_app, startup_tasks - from src.config.settings import clients, INDEX_NAME + import src.api.router as upload_router + from src.config.settings import clients, INDEX_NAME, DISABLE_INGEST_WITH_LANGFLOW + + # Verify settings loaded correctly + print(f"Settings DISABLE_INGEST_WITH_LANGFLOW: {DISABLE_INGEST_WITH_LANGFLOW}") # Ensure a clean index before startup await clients.initialize() @@ -108,9 +124,9 @@ async def test_upload_and_search_endpoint(tmp_path: Path, disable_langflow_inges } upload_resp = await client.post("/upload", files=files) body = upload_resp.json() - # Router now returns 201 + task_id (async) regardless of mode assert upload_resp.status_code == 201, upload_resp.text - assert isinstance(body.get("task_id"), str) + assert body.get("status") in {"indexed", "unchanged"} + assert isinstance(body.get("id"), str) # Poll search for the specific content until it's indexed async def _wait_for_indexed(timeout_s: float = 30.0): @@ -162,14 +178,24 @@ async def test_router_upload_ingest_traditional(tmp_path: Path, disable_langflow import sys for mod in [ "src.api.router", + "api.router", # Also clear the non-src path "src.api.connector_router", + "api.connector_router", 
"src.config.settings", + "config.settings", "src.auth_middleware", + "auth_middleware", "src.main", + "api", # Clear the api package itself + "src.api", ]: sys.modules.pop(mod, None) from src.main import create_app, startup_tasks - from src.config.settings import clients, INDEX_NAME + import src.api.router as upload_router + from src.config.settings import clients, INDEX_NAME, DISABLE_INGEST_WITH_LANGFLOW + + # Verify settings loaded correctly + print(f"Settings DISABLE_INGEST_WITH_LANGFLOW: {DISABLE_INGEST_WITH_LANGFLOW}") # Ensure a clean index before startup await clients.initialize() @@ -211,10 +237,18 @@ async def test_router_upload_ingest_traditional(tmp_path: Path, disable_langflow ) } - resp = await client.post("/upload", files=files) + resp = await client.post("/router/upload_ingest", files=files) data = resp.json() - assert resp.status_code == 201, resp.text - assert isinstance(data.get("task_id"), str) + + print(f"data: {data}") + if disable_langflow_ingest: + assert resp.status_code == 201 or resp.status_code == 202, resp.text + assert data.get("status") in {"indexed", "unchanged"} + assert isinstance(data.get("id"), str) + else: + assert resp.status_code == 201 or resp.status_code == 202, resp.text + assert isinstance(data.get("task_id"), str) + assert data.get("file_count") == 1 finally: from src.config.settings import clients try: diff --git a/uv.lock b/uv.lock index c9bc6714..fd5164cb 100644 --- a/uv.lock +++ b/uv.lock @@ -5,7 +5,8 @@ resolution-markers = [ "sys_platform == 'darwin'", "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", + "sys_platform != 'darwin' and sys_platform != 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] [[package]] @@ -20,8 +21,9 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b1/72/ff3961c19ee395c3d30ac630ee77bfb0e1b46b87edc504d4f83bb4a89705/accelerate-1.10.1.tar.gz", hash = "sha256:3dea89e433420e4bfac0369cae7e36dcd6a56adfcfd38cdda145c6225eab5df8", size = 392446, upload-time = "2025-08-25T13:57:06.21Z" } wheels = [ @@ -293,7 +295,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", 
+ "sys_platform != 'darwin' and sys_platform != 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -312,6 +315,67 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "coverage" +version = "7.10.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/26/d22c300112504f5f9a9fd2297ce33c35f3d353e4aeb987c8419453b2a7c2/coverage-7.10.7.tar.gz", hash = "sha256:f4ab143ab113be368a3e9b795f9cd7906c5ef407d6173fe9675a902e1fffc239", size = 827704, upload-time = "2025-09-21T20:03:56.815Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/94/b765c1abcb613d103b64fcf10395f54d69b0ef8be6a0dd9c524384892cc7/coverage-7.10.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:981a651f543f2854abd3b5fcb3263aac581b18209be49863ba575de6edf4c14d", size = 218320, upload-time = "2025-09-21T20:01:56.629Z" }, + { url = "https://files.pythonhosted.org/packages/72/4f/732fff31c119bb73b35236dd333030f32c4bfe909f445b423e6c7594f9a2/coverage-7.10.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:73ab1601f84dc804f7812dc297e93cd99381162da39c47040a827d4e8dafe63b", size = 218575, upload-time = "2025-09-21T20:01:58.203Z" }, + { url = "https://files.pythonhosted.org/packages/87/02/ae7e0af4b674be47566707777db1aa375474f02a1d64b9323e5813a6cdd5/coverage-7.10.7-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:a8b6f03672aa6734e700bbcd65ff050fd19cddfec4b031cc8cf1c6967de5a68e", size = 249568, upload-time = "2025-09-21T20:01:59.748Z" }, + { url = "https://files.pythonhosted.org/packages/a2/77/8c6d22bf61921a59bce5471c2f1f7ac30cd4ac50aadde72b8c48d5727902/coverage-7.10.7-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10b6ba00ab1132a0ce4428ff68cf50a25efd6840a42cdf4239c9b99aad83be8b", size = 252174, upload-time = "2025-09-21T20:02:01.192Z" }, + { url = "https://files.pythonhosted.org/packages/b1/20/b6ea4f69bbb52dac0aebd62157ba6a9dddbfe664f5af8122dac296c3ee15/coverage-7.10.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c79124f70465a150e89340de5963f936ee97097d2ef76c869708c4248c63ca49", size = 253447, upload-time = "2025-09-21T20:02:02.701Z" }, + { url = "https://files.pythonhosted.org/packages/f9/28/4831523ba483a7f90f7b259d2018fef02cb4d5b90bc7c1505d6e5a84883c/coverage-7.10.7-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:69212fbccdbd5b0e39eac4067e20a4a5256609e209547d86f740d68ad4f04911", size = 249779, upload-time = "2025-09-21T20:02:04.185Z" }, + { url = "https://files.pythonhosted.org/packages/a7/9f/4331142bc98c10ca6436d2d620c3e165f31e6c58d43479985afce6f3191c/coverage-7.10.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7ea7c6c9d0d286d04ed3541747e6597cbe4971f22648b68248f7ddcd329207f0", size = 251604, upload-time = "2025-09-21T20:02:06.034Z" }, + { url = "https://files.pythonhosted.org/packages/ce/60/bda83b96602036b77ecf34e6393a3836365481b69f7ed7079ab85048202b/coverage-7.10.7-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:b9be91986841a75042b3e3243d0b3cb0b2434252b977baaf0cd56e960fe1e46f", size = 249497, upload-time = "2025-09-21T20:02:07.619Z" }, + { url = "https://files.pythonhosted.org/packages/5f/af/152633ff35b2af63977edd835d8e6430f0caef27d171edf2fc76c270ef31/coverage-7.10.7-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:b281d5eca50189325cfe1f365fafade89b14b4a78d9b40b05ddd1fc7d2a10a9c", size = 249350, upload-time = "2025-09-21T20:02:10.34Z" }, + { url = "https://files.pythonhosted.org/packages/9d/71/d92105d122bd21cebba877228990e1646d862e34a98bb3374d3fece5a794/coverage-7.10.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:99e4aa63097ab1118e75a848a28e40d68b08a5e19ce587891ab7fd04475e780f", size = 251111, upload-time = "2025-09-21T20:02:12.122Z" }, + { url = "https://files.pythonhosted.org/packages/a2/9e/9fdb08f4bf476c912f0c3ca292e019aab6712c93c9344a1653986c3fd305/coverage-7.10.7-cp313-cp313-win32.whl", hash = "sha256:dc7c389dce432500273eaf48f410b37886be9208b2dd5710aaf7c57fd442c698", size = 220746, upload-time = "2025-09-21T20:02:13.919Z" }, + { url = "https://files.pythonhosted.org/packages/b1/b1/a75fd25df44eab52d1931e89980d1ada46824c7a3210be0d3c88a44aaa99/coverage-7.10.7-cp313-cp313-win_amd64.whl", hash = "sha256:cac0fdca17b036af3881a9d2729a850b76553f3f716ccb0360ad4dbc06b3b843", size = 221541, upload-time = "2025-09-21T20:02:15.57Z" }, + { url = "https://files.pythonhosted.org/packages/14/3a/d720d7c989562a6e9a14b2c9f5f2876bdb38e9367126d118495b89c99c37/coverage-7.10.7-cp313-cp313-win_arm64.whl", hash = "sha256:4b6f236edf6e2f9ae8fcd1332da4e791c1b6ba0dc16a2dc94590ceccb482e546", size = 220170, upload-time = "2025-09-21T20:02:17.395Z" }, + { url = "https://files.pythonhosted.org/packages/bb/22/e04514bf2a735d8b0add31d2b4ab636fc02370730787c576bb995390d2d5/coverage-7.10.7-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a0ec07fd264d0745ee396b666d47cef20875f4ff2375d7c4f58235886cc1ef0c", size = 219029, upload-time = "2025-09-21T20:02:18.936Z" }, + { url = "https://files.pythonhosted.org/packages/11/0b/91128e099035ece15da3445d9015e4b4153a6059403452d324cbb0a575fa/coverage-7.10.7-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:dd5e856ebb7bfb7672b0086846db5afb4567a7b9714b8a0ebafd211ec7ce6a15", size = 219259, upload-time = "2025-09-21T20:02:20.44Z" }, + { url = "https://files.pythonhosted.org/packages/8b/51/66420081e72801536a091a0c8f8c1f88a5c4bf7b9b1bdc6222c7afe6dc9b/coverage-7.10.7-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f57b2a3c8353d3e04acf75b3fed57ba41f5c0646bbf1d10c7c282291c97936b4", size = 260592, upload-time = "2025-09-21T20:02:22.313Z" }, + { url = "https://files.pythonhosted.org/packages/5d/22/9b8d458c2881b22df3db5bb3e7369e63d527d986decb6c11a591ba2364f7/coverage-7.10.7-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1ef2319dd15a0b009667301a3f84452a4dc6fddfd06b0c5c53ea472d3989fbf0", size = 262768, upload-time = "2025-09-21T20:02:24.287Z" }, + { url = "https://files.pythonhosted.org/packages/f7/08/16bee2c433e60913c610ea200b276e8eeef084b0d200bdcff69920bd5828/coverage-7.10.7-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83082a57783239717ceb0ad584de3c69cf581b2a95ed6bf81ea66034f00401c0", size = 264995, upload-time = "2025-09-21T20:02:26.133Z" }, + { url = 
"https://files.pythonhosted.org/packages/20/9d/e53eb9771d154859b084b90201e5221bca7674ba449a17c101a5031d4054/coverage-7.10.7-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:50aa94fb1fb9a397eaa19c0d5ec15a5edd03a47bf1a3a6111a16b36e190cff65", size = 259546, upload-time = "2025-09-21T20:02:27.716Z" }, + { url = "https://files.pythonhosted.org/packages/ad/b0/69bc7050f8d4e56a89fb550a1577d5d0d1db2278106f6f626464067b3817/coverage-7.10.7-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:2120043f147bebb41c85b97ac45dd173595ff14f2a584f2963891cbcc3091541", size = 262544, upload-time = "2025-09-21T20:02:29.216Z" }, + { url = "https://files.pythonhosted.org/packages/ef/4b/2514b060dbd1bc0aaf23b852c14bb5818f244c664cb16517feff6bb3a5ab/coverage-7.10.7-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:2fafd773231dd0378fdba66d339f84904a8e57a262f583530f4f156ab83863e6", size = 260308, upload-time = "2025-09-21T20:02:31.226Z" }, + { url = "https://files.pythonhosted.org/packages/54/78/7ba2175007c246d75e496f64c06e94122bdb914790a1285d627a918bd271/coverage-7.10.7-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:0b944ee8459f515f28b851728ad224fa2d068f1513ef6b7ff1efafeb2185f999", size = 258920, upload-time = "2025-09-21T20:02:32.823Z" }, + { url = "https://files.pythonhosted.org/packages/c0/b3/fac9f7abbc841409b9a410309d73bfa6cfb2e51c3fada738cb607ce174f8/coverage-7.10.7-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4b583b97ab2e3efe1b3e75248a9b333bd3f8b0b1b8e5b45578e05e5850dfb2c2", size = 261434, upload-time = "2025-09-21T20:02:34.86Z" }, + { url = "https://files.pythonhosted.org/packages/ee/51/a03bec00d37faaa891b3ff7387192cef20f01604e5283a5fabc95346befa/coverage-7.10.7-cp313-cp313t-win32.whl", hash = "sha256:2a78cd46550081a7909b3329e2266204d584866e8d97b898cd7fb5ac8d888b1a", size = 221403, upload-time = "2025-09-21T20:02:37.034Z" }, + { url = "https://files.pythonhosted.org/packages/53/22/3cf25d614e64bf6d8e59c7c669b20d6d940bb337bdee5900b9ca41c820bb/coverage-7.10.7-cp313-cp313t-win_amd64.whl", hash = "sha256:33a5e6396ab684cb43dc7befa386258acb2d7fae7f67330ebb85ba4ea27938eb", size = 222469, upload-time = "2025-09-21T20:02:39.011Z" }, + { url = "https://files.pythonhosted.org/packages/49/a1/00164f6d30d8a01c3c9c48418a7a5be394de5349b421b9ee019f380df2a0/coverage-7.10.7-cp313-cp313t-win_arm64.whl", hash = "sha256:86b0e7308289ddde73d863b7683f596d8d21c7d8664ce1dee061d0bcf3fbb4bb", size = 220731, upload-time = "2025-09-21T20:02:40.939Z" }, + { url = "https://files.pythonhosted.org/packages/23/9c/5844ab4ca6a4dd97a1850e030a15ec7d292b5c5cb93082979225126e35dd/coverage-7.10.7-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b06f260b16ead11643a5a9f955bd4b5fd76c1a4c6796aeade8520095b75de520", size = 218302, upload-time = "2025-09-21T20:02:42.527Z" }, + { url = "https://files.pythonhosted.org/packages/f0/89/673f6514b0961d1f0e20ddc242e9342f6da21eaba3489901b565c0689f34/coverage-7.10.7-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:212f8f2e0612778f09c55dd4872cb1f64a1f2b074393d139278ce902064d5b32", size = 218578, upload-time = "2025-09-21T20:02:44.468Z" }, + { url = "https://files.pythonhosted.org/packages/05/e8/261cae479e85232828fb17ad536765c88dd818c8470aca690b0ac6feeaa3/coverage-7.10.7-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3445258bcded7d4aa630ab8296dea4d3f15a255588dd535f980c193ab6b95f3f", size = 249629, upload-time = "2025-09-21T20:02:46.503Z" }, + { url = 
"https://files.pythonhosted.org/packages/82/62/14ed6546d0207e6eda876434e3e8475a3e9adbe32110ce896c9e0c06bb9a/coverage-7.10.7-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bb45474711ba385c46a0bfe696c695a929ae69ac636cda8f532be9e8c93d720a", size = 252162, upload-time = "2025-09-21T20:02:48.689Z" }, + { url = "https://files.pythonhosted.org/packages/ff/49/07f00db9ac6478e4358165a08fb41b469a1b053212e8a00cb02f0d27a05f/coverage-7.10.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:813922f35bd800dca9994c5971883cbc0d291128a5de6b167c7aa697fcf59360", size = 253517, upload-time = "2025-09-21T20:02:50.31Z" }, + { url = "https://files.pythonhosted.org/packages/a2/59/c5201c62dbf165dfbc91460f6dbbaa85a8b82cfa6131ac45d6c1bfb52deb/coverage-7.10.7-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:93c1b03552081b2a4423091d6fb3787265b8f86af404cff98d1b5342713bdd69", size = 249632, upload-time = "2025-09-21T20:02:51.971Z" }, + { url = "https://files.pythonhosted.org/packages/07/ae/5920097195291a51fb00b3a70b9bbd2edbfe3c84876a1762bd1ef1565ebc/coverage-7.10.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:cc87dd1b6eaf0b848eebb1c86469b9f72a1891cb42ac7adcfbce75eadb13dd14", size = 251520, upload-time = "2025-09-21T20:02:53.858Z" }, + { url = "https://files.pythonhosted.org/packages/b9/3c/a815dde77a2981f5743a60b63df31cb322c944843e57dbd579326625a413/coverage-7.10.7-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:39508ffda4f343c35f3236fe8d1a6634a51f4581226a1262769d7f970e73bffe", size = 249455, upload-time = "2025-09-21T20:02:55.807Z" }, + { url = "https://files.pythonhosted.org/packages/aa/99/f5cdd8421ea656abefb6c0ce92556709db2265c41e8f9fc6c8ae0f7824c9/coverage-7.10.7-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:925a1edf3d810537c5a3abe78ec5530160c5f9a26b1f4270b40e62cc79304a1e", size = 249287, upload-time = "2025-09-21T20:02:57.784Z" }, + { url = "https://files.pythonhosted.org/packages/c3/7a/e9a2da6a1fc5d007dd51fca083a663ab930a8c4d149c087732a5dbaa0029/coverage-7.10.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2c8b9a0636f94c43cd3576811e05b89aa9bc2d0a85137affc544ae5cb0e4bfbd", size = 250946, upload-time = "2025-09-21T20:02:59.431Z" }, + { url = "https://files.pythonhosted.org/packages/ef/5b/0b5799aa30380a949005a353715095d6d1da81927d6dbed5def2200a4e25/coverage-7.10.7-cp314-cp314-win32.whl", hash = "sha256:b7b8288eb7cdd268b0304632da8cb0bb93fadcfec2fe5712f7b9cc8f4d487be2", size = 221009, upload-time = "2025-09-21T20:03:01.324Z" }, + { url = "https://files.pythonhosted.org/packages/da/b0/e802fbb6eb746de006490abc9bb554b708918b6774b722bb3a0e6aa1b7de/coverage-7.10.7-cp314-cp314-win_amd64.whl", hash = "sha256:1ca6db7c8807fb9e755d0379ccc39017ce0a84dcd26d14b5a03b78563776f681", size = 221804, upload-time = "2025-09-21T20:03:03.4Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e8/71d0c8e374e31f39e3389bb0bd19e527d46f00ea8571ec7ec8fd261d8b44/coverage-7.10.7-cp314-cp314-win_arm64.whl", hash = "sha256:097c1591f5af4496226d5783d036bf6fd6cd0cbc132e071b33861de756efb880", size = 220384, upload-time = "2025-09-21T20:03:05.111Z" }, + { url = "https://files.pythonhosted.org/packages/62/09/9a5608d319fa3eba7a2019addeacb8c746fb50872b57a724c9f79f146969/coverage-7.10.7-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:a62c6ef0d50e6de320c270ff91d9dd0a05e7250cac2a800b7784bae474506e63", size = 219047, upload-time = "2025-09-21T20:03:06.795Z" }, + { url = 
"https://files.pythonhosted.org/packages/f5/6f/f58d46f33db9f2e3647b2d0764704548c184e6f5e014bef528b7f979ef84/coverage-7.10.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:9fa6e4dd51fe15d8738708a973470f67a855ca50002294852e9571cdbd9433f2", size = 219266, upload-time = "2025-09-21T20:03:08.495Z" }, + { url = "https://files.pythonhosted.org/packages/74/5c/183ffc817ba68e0b443b8c934c8795553eb0c14573813415bd59941ee165/coverage-7.10.7-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8fb190658865565c549b6b4706856d6a7b09302c797eb2cf8e7fe9dabb043f0d", size = 260767, upload-time = "2025-09-21T20:03:10.172Z" }, + { url = "https://files.pythonhosted.org/packages/0f/48/71a8abe9c1ad7e97548835e3cc1adbf361e743e9d60310c5f75c9e7bf847/coverage-7.10.7-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:affef7c76a9ef259187ef31599a9260330e0335a3011732c4b9effa01e1cd6e0", size = 262931, upload-time = "2025-09-21T20:03:11.861Z" }, + { url = "https://files.pythonhosted.org/packages/84/fd/193a8fb132acfc0a901f72020e54be5e48021e1575bb327d8ee1097a28fd/coverage-7.10.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e16e07d85ca0cf8bafe5f5d23a0b850064e8e945d5677492b06bbe6f09cc699", size = 265186, upload-time = "2025-09-21T20:03:13.539Z" }, + { url = "https://files.pythonhosted.org/packages/b1/8f/74ecc30607dd95ad50e3034221113ccb1c6d4e8085cc761134782995daae/coverage-7.10.7-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:03ffc58aacdf65d2a82bbeb1ffe4d01ead4017a21bfd0454983b88ca73af94b9", size = 259470, upload-time = "2025-09-21T20:03:15.584Z" }, + { url = "https://files.pythonhosted.org/packages/0f/55/79ff53a769f20d71b07023ea115c9167c0bb56f281320520cf64c5298a96/coverage-7.10.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1b4fd784344d4e52647fd7857b2af5b3fbe6c239b0b5fa63e94eb67320770e0f", size = 262626, upload-time = "2025-09-21T20:03:17.673Z" }, + { url = "https://files.pythonhosted.org/packages/88/e2/dac66c140009b61ac3fc13af673a574b00c16efdf04f9b5c740703e953c0/coverage-7.10.7-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:0ebbaddb2c19b71912c6f2518e791aa8b9f054985a0769bdb3a53ebbc765c6a1", size = 260386, upload-time = "2025-09-21T20:03:19.36Z" }, + { url = "https://files.pythonhosted.org/packages/a2/f1/f48f645e3f33bb9ca8a496bc4a9671b52f2f353146233ebd7c1df6160440/coverage-7.10.7-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a2d9a3b260cc1d1dbdb1c582e63ddcf5363426a1a68faa0f5da28d8ee3c722a0", size = 258852, upload-time = "2025-09-21T20:03:21.007Z" }, + { url = "https://files.pythonhosted.org/packages/bb/3b/8442618972c51a7affeead957995cfa8323c0c9bcf8fa5a027421f720ff4/coverage-7.10.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a3cc8638b2480865eaa3926d192e64ce6c51e3d29c849e09d5b4ad95efae5399", size = 261534, upload-time = "2025-09-21T20:03:23.12Z" }, + { url = "https://files.pythonhosted.org/packages/b2/dc/101f3fa3a45146db0cb03f5b4376e24c0aac818309da23e2de0c75295a91/coverage-7.10.7-cp314-cp314t-win32.whl", hash = "sha256:67f8c5cbcd3deb7a60b3345dffc89a961a484ed0af1f6f73de91705cc6e31235", size = 221784, upload-time = "2025-09-21T20:03:24.769Z" }, + { url = "https://files.pythonhosted.org/packages/4c/a1/74c51803fc70a8a40d7346660379e144be772bab4ac7bb6e6b905152345c/coverage-7.10.7-cp314-cp314t-win_amd64.whl", hash = "sha256:e1ed71194ef6dea7ed2d5cb5f7243d4bcd334bfb63e59878519be558078f848d", size = 222905, upload-time = 
"2025-09-21T20:03:26.93Z" }, + { url = "https://files.pythonhosted.org/packages/12/65/f116a6d2127df30bcafbceef0302d8a64ba87488bf6f73a6d8eebf060873/coverage-7.10.7-cp314-cp314t-win_arm64.whl", hash = "sha256:7fe650342addd8524ca63d77b2362b02345e5f1a093266787d210c70a50b471a", size = 220922, upload-time = "2025-09-21T20:03:28.672Z" }, + { url = "https://files.pythonhosted.org/packages/ec/16/114df1c291c22cac3b0c127a73e0af5c12ed7bbb6558d310429a0ae24023/coverage-7.10.7-py3-none-any.whl", hash = "sha256:f7941f6f2fe6dd6807a1208737b8a0cbcf1cc6d7b07d24998ad2d63590868260", size = 209952, upload-time = "2025-09-21T20:03:53.918Z" }, +] + [[package]] name = "cramjam" version = "2.11.0" @@ -456,7 +520,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", + "sys_platform != 'darwin' and sys_platform != 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] sdist = { url = "https://files.pythonhosted.org/packages/12/80/630b4b88364e9a8c8c5797f4602d0f76ef820909ee32f0bacb9f90654042/dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", size = 186976, upload-time = "2025-04-16T00:41:48.867Z" } wheels = [ @@ -570,10 +635,13 @@ dependencies = [ { name = "pydantic" }, { name = "rtree" }, { name = "safetensors", extra = ["torch"] }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, - { name = "torchvision", version = "0.22.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torchvision", version = "0.22.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.22.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, { name = "tqdm" }, { name = "transformers" }, ] @@ -621,7 +689,8 @@ source = { 
registry = "https://pypi.org/simple" } resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", + "sys_platform != 'darwin' and sys_platform != 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] dependencies = [ { name = "docling", marker = "sys_platform != 'darwin'" }, @@ -726,10 +795,13 @@ dependencies = [ { name = "scikit-image" }, { name = "scipy" }, { name = "shapely" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, - { name = "torchvision", version = "0.22.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torchvision", version = "0.22.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.22.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/bb/84/4a2cab0e6adde6a85e7ba543862e5fc0250c51f3ac721a078a55cdcff250/easyocr-1.7.2-py3-none-any.whl", hash = "sha256:5be12f9b0e595d443c9c3d10b0542074b50f0ec2d98b141a109cd961fd1c177c", size = 2870178, upload-time = "2024-09-24T11:34:43.554Z" }, @@ -945,7 +1017,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", + "sys_platform != 'darwin' and sys_platform != 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] sdist = { url = "https://files.pythonhosted.org/packages/00/f7/27f15d41f0ed38e8fcc488584b57e902b331da7f7c6dcda53721b15838fc/fsspec-2025.5.1.tar.gz", hash = 
"sha256:2e55e47a540b91843b755e83ded97c6e897fa0942b11490113f09e9c443c2475", size = 303033, upload-time = "2025-05-24T12:03:23.792Z" } wheels = [ @@ -1266,7 +1339,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", + "sys_platform != 'darwin' and sys_platform != 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] dependencies = [ { name = "filelock", marker = "sys_platform != 'darwin'" }, @@ -1339,6 +1413,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, ] +[[package]] +name = "iniconfig" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -1962,7 +2045,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", + "sys_platform != 'darwin' and sys_platform != 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] dependencies = [ { name = "dill", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'darwin'" }, @@ -2282,7 +2366,7 @@ wheels = [ [[package]] name = "openrag" -version = "0.1.14.dev3" +version = "0.1.15" source = { editable = "." 
} dependencies = [ { name = "agentd" }, @@ -2307,11 +2391,37 @@ dependencies = [ { name = "structlog" }, { name = "textual" }, { name = "textual-fspicker" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, { name = "uvicorn" }, ] +[package.optional-dependencies] +torch = [ + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torchvision", version = "0.22.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.22.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, +] +torch-cu128 = [ + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torchvision", version = "0.22.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.22.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "pytest-cov" }, + { name = "pytest-mock" }, +] + [package.metadata] requires-dist = [ { name = "agentd", specifier = ">=0.2.2" }, @@ -2336,10 +2446,25 @@ requires-dist = [ { name = "structlog", specifier = ">=25.4.0" }, { name = "textual", specifier = ">=0.45.0" }, { name = "textual-fspicker", specifier = ">=0.6.0" }, - { name = "torch", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'", specifier = ">=2.7.1" }, - { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'", 
specifier = ">=2.7.1", index = "https://download.pytorch.org/whl/cu128" }, + { name = "torch", marker = "sys_platform == 'linux' and extra == 'torch'" }, + { name = "torch", marker = "sys_platform == 'linux' and extra == 'torch-cu128'", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "openrag", extra = "torch-cu128" } }, + { name = "torch", marker = "sys_platform != 'linux' and extra == 'torch'", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "openrag", extra = "torch" } }, + { name = "torch", marker = "sys_platform != 'linux' and extra == 'torch-cu128'" }, + { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'torch'" }, + { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'torch-cu128'", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "openrag", extra = "torch-cu128" } }, + { name = "torchvision", marker = "sys_platform != 'linux' and extra == 'torch'", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "openrag", extra = "torch" } }, + { name = "torchvision", marker = "sys_platform != 'linux' and extra == 'torch-cu128'" }, { name = "uvicorn", specifier = ">=0.35.0" }, ] +provides-extras = ["torch", "torch-cu128"] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=8" }, + { name = "pytest-asyncio", specifier = ">=0.21.0" }, + { name = "pytest-cov", specifier = ">=4.0.0" }, + { name = "pytest-mock", specifier = ">=3.12.0" }, +] [[package]] name = "opensearch-py" @@ -2836,6 +2961,60 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/6b/2706497c86e8d69fb76afe5ea857fe1794621aa0f3b1d863feb953fe0f22/pypdfium2-4.30.1-py3-none-win_arm64.whl", hash = "sha256:c2b6d63f6d425d9416c08d2511822b54b8e3ac38e639fc41164b1d75584b3a8c", size = 2814810, upload-time = "2024-12-19T19:28:09.857Z" }, ] +[[package]] +name = "pytest" +version = "8.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, +] + +[[package]] +name = "pytest-asyncio" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/86/9e3c5f48f7b7b638b216e4b9e645f54d199d7abbbab7a64a13b4e12ba10f/pytest_asyncio-1.2.0.tar.gz", hash = "sha256:c609a64a2a8768462d0c99811ddb8bd2583c33fd33cf7f21af1c142e824ffb57", size = 50119, upload-time = "2025-09-12T07:33:53.816Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/93/2fa34714b7a4ae72f2f8dad66ba17dd9a2c793220719e736dda28b7aec27/pytest_asyncio-1.2.0-py3-none-any.whl", hash = "sha256:8e17ae5e46d8e7efe51ab6494dd2010f4ca8dae51652aa3c8d55acf50bfb2e99", size = 15095, upload-time = "2025-09-12T07:33:52.639Z" }, +] + +[[package]] +name = 
"pytest-cov" +version = "7.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coverage" }, + { name = "pluggy" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5e/f7/c933acc76f5208b3b00089573cf6a2bc26dc80a8aece8f52bb7d6b1855ca/pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1", size = 54328, upload-time = "2025-09-09T10:57:02.113Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, +] + +[[package]] +name = "pytest-mock" +version = "3.15.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/68/14/eb014d26be205d38ad5ad20d9a80f7d201472e08167f0bb4361e251084a9/pytest_mock-3.15.1.tar.gz", hash = "sha256:1849a238f6f396da19762269de72cb1814ab44416fa73a8686deac10b0d87a0f", size = 34036, upload-time = "2025-09-16T16:37:27.081Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/cc/06253936f4a7fa2e0f48dfe6d851d9c56df896a9ab09ac019d70b760619c/pytest_mock-3.15.1-py3-none-any.whl", hash = "sha256:0a25e2eb88fe5168d535041d09a4529a188176ae608a6d249ee65abc0949630d", size = 10095, upload-time = "2025-09-16T16:37:25.734Z" }, +] + [[package]] name = "python-bidi" version = "0.6.6" @@ -3261,8 +3440,9 @@ wheels = [ [package.optional-dependencies] torch = [ { name = "numpy" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] [[package]] @@ -3586,13 +3766,15 @@ name = "torch" version = "2.7.1+cu128" source = { registry = "https://download.pytorch.org/whl/cu128" } resolution-markers = [ + "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] dependencies = [ - { name = "filelock", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "fsspec", version = "2025.5.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "jinja2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "networkx", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "filelock", marker = "sys_platform == 'linux'" }, + { name = "fsspec", version = "2025.5.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "jinja2", marker = 
"sys_platform == 'linux'" }, + { name = "networkx", marker = "sys_platform == 'linux'" }, { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -3607,86 +3789,128 @@ dependencies = [ { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "sympy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "sys_platform == 'linux'" }, + { name = "sympy", marker = "sys_platform == 'linux'" }, + { name = "triton", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] wheels = [ + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:d56d29a6ad7758ba5173cc2b0c51c93e126e2b0a918e874101dc66545283967f" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9560425f9ea1af1791507e8ca70d5b9ecf62fed7ca226a95fcd58d0eb2cca78f" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:f112465fdf42eb1297c6dddda1a8b7f411914428b704e1b8a47870c52e290909" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c355db49c218ada70321d5c5c9bb3077312738b99113c8f3723ef596b554a7b9" }, ] [[package]] name = "torch" version = "2.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://download.pytorch.org/whl/cpu" } resolution-markers = [ "sys_platform == 'darwin'", - "platform_machine == 'aarch64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ - { name = "filelock", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "filelock", marker = "sys_platform == 'darwin'" }, { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "fsspec", version = "2025.5.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, - { name = "jinja2", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, - { name = "networkx", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, - { name = "setuptools", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, - { name = "sympy", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, - { name = "typing-extensions", marker = 
"platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "jinja2", marker = "sys_platform == 'darwin'" }, + { name = "networkx", marker = "sys_platform == 'darwin'" }, + { name = "setuptools", marker = "sys_platform == 'darwin'" }, + { name = "sympy", marker = "sys_platform == 'darwin'" }, + { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/10/4e/469ced5a0603245d6a19a556e9053300033f9c5baccf43a3d25ba73e189e/torch-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2b2f96814e0345f5a5aed9bf9734efa913678ed19caf6dc2cddb7930672d6128", size = 101936856, upload-time = "2025-08-06T14:54:01.526Z" }, - { url = "https://files.pythonhosted.org/packages/16/82/3948e54c01b2109238357c6f86242e6ecbf0c63a1af46906772902f82057/torch-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:65616ca8ec6f43245e1f5f296603e33923f4c30f93d65e103d9e50c25b35150b", size = 887922844, upload-time = "2025-08-06T14:55:50.78Z" }, - { url = "https://files.pythonhosted.org/packages/e3/54/941ea0a860f2717d86a811adf0c2cd01b3983bdd460d0803053c4e0b8649/torch-2.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:659df54119ae03e83a800addc125856effda88b016dfc54d9f65215c3975be16", size = 241330968, upload-time = "2025-08-06T14:54:45.293Z" }, - { url = "https://files.pythonhosted.org/packages/de/69/8b7b13bba430f5e21d77708b616f767683629fc4f8037564a177d20f90ed/torch-2.8.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:1a62a1ec4b0498930e2543535cf70b1bef8c777713de7ceb84cd79115f553767", size = 73915128, upload-time = "2025-08-06T14:54:34.769Z" }, - { url = "https://files.pythonhosted.org/packages/15/0e/8a800e093b7f7430dbaefa80075aee9158ec22e4c4fc3c1a66e4fb96cb4f/torch-2.8.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:83c13411a26fac3d101fe8035a6b0476ae606deb8688e904e796a3534c197def", size = 102020139, upload-time = "2025-08-06T14:54:39.047Z" }, - { url = "https://files.pythonhosted.org/packages/4a/15/5e488ca0bc6162c86a33b58642bc577c84ded17c7b72d97e49b5833e2d73/torch-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8f0a9d617a66509ded240add3754e462430a6c1fc5589f86c17b433dd808f97a", size = 887990692, upload-time = "2025-08-06T14:56:18.286Z" }, - { url = "https://files.pythonhosted.org/packages/b4/a8/6a04e4b54472fc5dba7ca2341ab219e529f3c07b6941059fbf18dccac31f/torch-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a7242b86f42be98ac674b88a4988643b9bc6145437ec8f048fea23f72feb5eca", size = 241603453, upload-time = "2025-08-06T14:55:22.945Z" }, - { url = "https://files.pythonhosted.org/packages/04/6e/650bb7f28f771af0cb791b02348db8b7f5f64f40f6829ee82aa6ce99aabe/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7b677e17f5a3e69fdef7eb3b9da72622f8d322692930297e4ccb52fefc6c8211", size = 73632395, upload-time = "2025-08-06T14:55:28.645Z" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:fbe2e149c5174ef90d29a5f84a554dfaf28e003cb4f61fa2c8c024c17ec7ca58" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:057efd30a6778d2ee5e2374cd63a63f63311aa6f33321e627c655df60abdd390" }, +] + +[[package]] +name = "torch" +version = "2.8.0+cpu" +source = { registry = "https://download.pytorch.org/whl/cpu" } +resolution-markers = [ + "sys_platform != 'darwin' and sys_platform != 'linux'", +] +dependencies = [ + { name = "filelock", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name 
= "fsspec", version = "2025.5.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "jinja2", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "networkx", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "setuptools", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "sympy", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "typing-extensions", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:e1ee1b2346ade3ea90306dfbec7e8ff17bc220d344109d189ae09078333b0856" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-win_arm64.whl", hash = "sha256:64c187345509f2b1bb334feed4666e2c781ca381874bde589182f81247e61f88" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:6d93a7165419bc4b2b907e859ccab0dea5deeab261448ae9a5ec5431f14c0e64" }, ] [[package]] name = "torchvision" version = "0.22.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://download.pytorch.org/whl/cu128" } resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", ] dependencies = [ - { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pillow", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/b0/3cffd6a285b5ffee3fe4a31caff49e350c98c5963854474d1c4f7a51dea5/torchvision-0.22.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:7ee682be589bb1a002b7704f06b8ec0b89e4b9068f48e79307d2c6e937a9fdf4", size = 7485894, upload-time = "2025-06-04T17:43:01.371Z" }, - { url = "https://files.pythonhosted.org/packages/94/8b/04c6b15f8c29b39f0679589753091cec8b192ab296d4fdaf9055544c4ec9/torchvision-0.22.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:ef46e065502f7300ad6abc98554131c35dc4c837b978d91306658f1a65c00baa", size = 7658543, upload-time = "2025-06-04T17:42:46.064Z" }, + { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:75f519ebe412ced95d727c71c30c68084cc6fd36347b88f338e88ff9d07a3ac8" }, + { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:f6565fd22e04e51f9600f34a3a20b120ee9f5a73161bfcb79c826225054aa44e" }, +] + +[[package]] +name = "torchvision" +version = "0.22.1+cu128" +source = { registry = "https://download.pytorch.org/whl/cu128" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine != 'aarch64' and platform_machine != 'x86_64' 
and sys_platform == 'linux'", +] +dependencies = [ + { name = "numpy", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:bc4fef193917b51db6b409acd3ffdec9286d877baac0aee5dcfbb72592d00bfc" }, + { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:02faf51fbf5070592768fa935327d13a484b745faef38b0fee01d85cfb35f5bc" }, ] [[package]] name = "torchvision" version = "0.23.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://download.pytorch.org/whl/cpu" } resolution-markers = [ "sys_platform == 'darwin'", - "platform_machine == 'aarch64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ - { name = "numpy", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, - { name = "pillow", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "numpy", marker = "sys_platform == 'darwin'" }, + { name = "pillow", marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/91/37/45a5b9407a7900f71d61b2b2f62db4b7c632debca397f205fdcacb502780/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600", size = 1856886, upload-time = "2025-08-06T14:58:05.491Z" }, - { url = "https://files.pythonhosted.org/packages/ac/da/a06c60fc84fc849377cf035d3b3e9a1c896d52dbad493b963c0f1cdd74d0/torchvision-0.23.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2f7fd6c15f3697e80627b77934f77705f3bc0e98278b989b2655de01f6903e1d", size = 2353112, upload-time = "2025-08-06T14:58:26.265Z" }, - { url = "https://files.pythonhosted.org/packages/a0/27/5ce65ba5c9d3b7d2ccdd79892ab86a2f87ac2ca6638f04bb0280321f1a9c/torchvision-0.23.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:a76fafe113b2977be3a21bf78f115438c1f88631d7a87203acb3dd6ae55889e6", size = 8627658, upload-time = "2025-08-06T14:58:15.999Z" }, - { url = "https://files.pythonhosted.org/packages/1f/e4/028a27b60aa578a2fa99d9d7334ff1871bb17008693ea055a2fdee96da0d/torchvision-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:07d069cb29691ff566e3b7f11f20d91044f079e1dbdc9d72e0655899a9b06938", size = 1600749, upload-time = "2025-08-06T14:58:10.719Z" }, - { url = "https://files.pythonhosted.org/packages/05/35/72f91ad9ac7c19a849dedf083d347dc1123f0adeb401f53974f84f1d04c8/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9", size = 2047192, upload-time = "2025-08-06T14:58:11.813Z" }, - { url = 
"https://files.pythonhosted.org/packages/1d/9d/406cea60a9eb9882145bcd62a184ee61e823e8e1d550cdc3c3ea866a9445/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2a3299d2b1d5a7aed2d3b6ffb69c672ca8830671967eb1cee1497bacd82fe47b", size = 2359295, upload-time = "2025-08-06T14:58:17.469Z" }, - { url = "https://files.pythonhosted.org/packages/2b/f4/34662f71a70fa1e59de99772142f22257ca750de05ccb400b8d2e3809c1d/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:76bc4c0b63d5114aa81281390f8472a12a6a35ce9906e67ea6044e5af4cab60c", size = 8800474, upload-time = "2025-08-06T14:58:22.53Z" }, - { url = "https://files.pythonhosted.org/packages/6e/f5/b5a2d841a8d228b5dbda6d524704408e19e7ca6b7bb0f24490e081da1fa1/torchvision-0.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b9e2dabf0da9c8aa9ea241afb63a8f3e98489e706b22ac3f30416a1be377153b", size = 1527667, upload-time = "2025-08-06T14:58:14.446Z" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9" }, +] + +[[package]] +name = "torchvision" +version = "0.23.0+cpu" +source = { registry = "https://download.pytorch.org/whl/cpu" } +resolution-markers = [ + "sys_platform != 'darwin' and sys_platform != 'linux'", +] +dependencies = [ + { name = "numpy", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "pillow", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:82928788025170c62e7df1120dcdc0cd175bfc31c08374613ce6d1a040bc0cda" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:8d6a47e23d7896f0ef9aa7ea7179eb6324e82438aa66d19884c2020d0646b104" }, ] [[package]] @@ -3728,7 +3952,7 @@ name = "triton" version = "3.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/74/1f/dfb531f90a2d367d914adfee771babbd3f1a5b26c3f5fbc458dee21daa78/triton-3.3.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b89d846b5a4198317fec27a5d3a609ea96b6d557ff44b56c23176546023c4240", size = 155673035, upload-time = "2025-05-29T23:40:02.468Z" }, From bde95a58701456a8e913db791b721db02c54f9e9 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 04:02:39 -0400 Subject: [PATCH 043/113] fix tests --- tests/integration/test_api_endpoints.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_api_endpoints.py b/tests/integration/test_api_endpoints.py index fa36dc8b..1d325a1b 100644 --- a/tests/integration/test_api_endpoints.py +++ b/tests/integration/test_api_endpoints.py @@ -64,15 +64,16 @@ async def test_upload_and_search_endpoint(tmp_path: Path, disable_langflow_inges "src.main", "api", # Clear the api package 
itself "src.api", + "services", # Clear services that import clients + "src.services", + "services.search_service", + "src.services.search_service", ]: sys.modules.pop(mod, None) from src.main import create_app, startup_tasks import src.api.router as upload_router from src.config.settings import clients, INDEX_NAME, DISABLE_INGEST_WITH_LANGFLOW - # Verify settings loaded correctly - print(f"Settings DISABLE_INGEST_WITH_LANGFLOW: {DISABLE_INGEST_WITH_LANGFLOW}") - # Ensure a clean index before startup await clients.initialize() try: @@ -188,15 +189,16 @@ async def test_router_upload_ingest_traditional(tmp_path: Path, disable_langflow "src.main", "api", # Clear the api package itself "src.api", + "services", # Clear services that import clients + "src.services", + "services.search_service", + "src.services.search_service", ]: sys.modules.pop(mod, None) from src.main import create_app, startup_tasks import src.api.router as upload_router from src.config.settings import clients, INDEX_NAME, DISABLE_INGEST_WITH_LANGFLOW - # Verify settings loaded correctly - print(f"Settings DISABLE_INGEST_WITH_LANGFLOW: {DISABLE_INGEST_WITH_LANGFLOW}") - # Ensure a clean index before startup await clients.initialize() try: From 5ace89ded5eb41617422547c34d67601730c2773 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 10:18:05 -0400 Subject: [PATCH 044/113] big runners for integration-tests --- .github/workflows/test-integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 46bbe977..e20a5b70 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -8,7 +8,7 @@ on: jobs: tests: - runs-on: ubuntu-latest + runs-on: [self-hosted, linux, ARM64, langflow-ai-arm64-2] env: # Prefer repository/environment variable first, then secret, then a sane fallback OPENSEARCH_PASSWORD: ${{ vars.OPENSEARCH_PASSWORD || secrets.OPENSEARCH_PASSWORD || 'OpenRag#2025!' 
}} From af1163e449121ba81b09c1fb66c0bd27e75104c2 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 10:20:36 -0400 Subject: [PATCH 045/113] remove sudo disk cleanup --- .github/workflows/test-integration.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index e20a5b70..44a2abbf 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -16,11 +16,11 @@ jobs: steps: - run: df -h - - name: "node-cleanup" - run: | - sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL - sudo docker image prune --all --force - sudo docker builder prune -a + #- name: "node-cleanup" + #run: | + # sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL + # sudo docker image prune --all --force + # sudo docker builder prune -a - run: df -h - name: Checkout uses: actions/checkout@v4 From 1b04e044d7d20ee69dd2448fd24d8cc807277dad Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Tue, 7 Oct 2025 10:27:04 -0400 Subject: [PATCH 046/113] remove-preview-admonition --- docs/docs/_partial-external-preview.mdx | 4 ---- docs/docs/core-components/agents.mdx | 3 --- docs/docs/core-components/ingestion.mdx | 3 --- docs/docs/core-components/knowledge.mdx | 3 --- docs/docs/get-started/docker.mdx | 3 --- docs/docs/get-started/install.mdx | 3 --- docs/docs/get-started/quickstart.mdx | 3 --- docs/docs/get-started/tui.mdx | 4 ---- docs/docs/get-started/what-is-openrag.mdx | 4 ---- docs/docs/support/troubleshoot.mdx | 3 --- 10 files changed, 33 deletions(-) delete mode 100644 docs/docs/_partial-external-preview.mdx diff --git a/docs/docs/_partial-external-preview.mdx b/docs/docs/_partial-external-preview.mdx deleted file mode 100644 index 8563720c..00000000 --- a/docs/docs/_partial-external-preview.mdx +++ /dev/null @@ -1,4 +0,0 @@ -:::info -OpenRAG is is currently in public preview. -Development is ongoing, and the features and functionality are subject to change. -::: \ No newline at end of file diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx index 3ee4617b..70ac31d0 100644 --- a/docs/docs/core-components/agents.mdx +++ b/docs/docs/core-components/agents.mdx @@ -7,9 +7,6 @@ import Icon from "@site/src/components/icon/icon"; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx'; -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - OpenRAG leverages Langflow's Agent component to power the OpenRAG OpenSearch Agent flow. diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx index d3ce81b0..a2d0fbdd 100644 --- a/docs/docs/core-components/ingestion.mdx +++ b/docs/docs/core-components/ingestion.mdx @@ -7,9 +7,6 @@ import Icon from "@site/src/components/icon/icon"; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx'; -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - OpenRAG uses [Docling](https://docling-project.github.io/docling/) for its document ingestion pipeline. 
More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling-serve` process on your local machine and runs Docling ingestion through an API service. diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx index d2a74ca4..0959c495 100644 --- a/docs/docs/core-components/knowledge.mdx +++ b/docs/docs/core-components/knowledge.mdx @@ -7,9 +7,6 @@ import Icon from "@site/src/components/icon/icon"; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx'; -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - OpenRAG uses [OpenSearch](https://docs.opensearch.org/latest/) for its vector-backed knowledge store. This is a specialized database for storing and retrieving embeddings, which helps your Agent efficiently find relevant information. diff --git a/docs/docs/get-started/docker.mdx b/docs/docs/get-started/docker.mdx index f7ec730b..eee2e866 100644 --- a/docs/docs/get-started/docker.mdx +++ b/docs/docs/get-started/docker.mdx @@ -4,9 +4,6 @@ slug: /get-started/docker --- import PartialOnboarding from '@site/docs/_partial-onboarding.mdx'; -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - There are two different Docker Compose files. They deploy the same applications and containers, but to different environments. diff --git a/docs/docs/get-started/install.mdx b/docs/docs/get-started/install.mdx index 1759e813..82fe9bf8 100644 --- a/docs/docs/get-started/install.mdx +++ b/docs/docs/get-started/install.mdx @@ -6,9 +6,6 @@ slug: /install import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import PartialOnboarding from '@site/docs/_partial-onboarding.mdx'; -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - [Install the OpenRAG Python wheel](#install-python-wheel), and then run the [OpenRAG Terminal User Interface(TUI)](#setup) to start your OpenRAG deployment with a guided setup process. diff --git a/docs/docs/get-started/quickstart.mdx b/docs/docs/get-started/quickstart.mdx index 838ad006..c2f4b3a5 100644 --- a/docs/docs/get-started/quickstart.mdx +++ b/docs/docs/get-started/quickstart.mdx @@ -6,9 +6,6 @@ slug: /quickstart import Icon from "@site/src/components/icon/icon"; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - Get started with OpenRAG by loading your knowledge, swapping out your language model, and then chatting with the OpenRAG API. diff --git a/docs/docs/get-started/tui.mdx b/docs/docs/get-started/tui.mdx index f3cfe51e..0a27a1e8 100644 --- a/docs/docs/get-started/tui.mdx +++ b/docs/docs/get-started/tui.mdx @@ -3,10 +3,6 @@ title: Terminal User Interface (TUI) commands slug: /get-started/tui --- -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - - The OpenRAG Terminal User Interface (TUI) allows you to set up, configure, and monitor your OpenRAG deployment directly from the terminal, on any operating system. ![OpenRAG TUI Interface](@site/static/img/OpenRAG_TUI_2025-09-10T13_04_11_757637.svg) diff --git a/docs/docs/get-started/what-is-openrag.mdx b/docs/docs/get-started/what-is-openrag.mdx index 18c01482..7d2340d0 100644 --- a/docs/docs/get-started/what-is-openrag.mdx +++ b/docs/docs/get-started/what-is-openrag.mdx @@ -3,10 +3,6 @@ title: What is OpenRAG? 
slug: / --- -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - - OpenRAG is an open-source package for building agentic RAG systems. It supports integration with a wide range of orchestration tools, vector databases, and LLM providers. diff --git a/docs/docs/support/troubleshoot.mdx b/docs/docs/support/troubleshoot.mdx index 9946db38..93599d04 100644 --- a/docs/docs/support/troubleshoot.mdx +++ b/docs/docs/support/troubleshoot.mdx @@ -5,9 +5,6 @@ slug: /support/troubleshoot import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -import PartialExternalPreview from '@site/docs/_partial-external-preview.mdx'; - - This page provides troubleshooting advice for issues you might encounter when using OpenRAG or contributing to OpenRAG. From bccbcf8d12fe61fcb73ed70746a5904fdeb36ddb Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 10:36:27 -0400 Subject: [PATCH 047/113] torch extra --- .github/workflows/test-integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 44a2abbf..e2afa334 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -34,7 +34,7 @@ jobs: run: uv python install 3.13 - name: Install dependencies - run: uv sync --group dev + run: uv sync --group dev --extra torch-cu128 - name: Run integration tests env: From 188aa7586680cc17c76d9b475c3de8377972ccbf Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 10:53:51 -0400 Subject: [PATCH 048/113] torch extra --- .github/workflows/test-integration.yml | 2 +- Dockerfile.backend | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index e2afa334..a46f911f 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -34,7 +34,7 @@ jobs: run: uv python install 3.13 - name: Install dependencies - run: uv sync --group dev --extra torch-cu128 + run: uv sync --group dev --extra torch - name: Run integration tests env: diff --git a/Dockerfile.backend b/Dockerfile.backend index d314eefe..5d9d84f4 100644 --- a/Dockerfile.backend +++ b/Dockerfile.backend @@ -18,7 +18,7 @@ WORKDIR /app # Copy Python dependencies COPY pyproject.toml uv.lock ./ -RUN uv sync --extra torch-cu128 +RUN uv sync # Copy sample document and warmup script for docling COPY documents/warmup_ocr.pdf ./ From ab6eb6e779f3f3a9d8904d89d3279e2dd4f73693 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 10:55:14 -0400 Subject: [PATCH 049/113] no torch --- .github/workflows/test-integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index a46f911f..44a2abbf 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -34,7 +34,7 @@ jobs: run: uv python install 3.13 - name: Install dependencies - run: uv sync --group dev --extra torch + run: uv sync --group dev - name: Run integration tests env: From c6907e104ae4a0d25fd21225031dfd38b102619a Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 10:56:34 -0400 Subject: [PATCH 050/113] test without dev dependencies --- .github/workflows/test-integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 44a2abbf..51b856b3 100644 --- 
a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -34,7 +34,7 @@ jobs: run: uv python install 3.13 - name: Install dependencies - run: uv sync --group dev + run: uv sync - name: Run integration tests env: From b8e8440397b87b914db0c7d6d8381ad7040c4d63 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 11:07:12 -0400 Subject: [PATCH 051/113] fix: add router back --- frontend/src/app/admin/page.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/app/admin/page.tsx b/frontend/src/app/admin/page.tsx index c8c9ecf8..a318d511 100644 --- a/frontend/src/app/admin/page.tsx +++ b/frontend/src/app/admin/page.tsx @@ -51,7 +51,7 @@ function AdminPage() { const formData = new FormData() formData.append("file", selectedFile) - const response = await fetch("/api/upload", { + const response = await fetch("/api/router/upload_ingest", { method: "POST", body: formData, }) From 65d7430fac2bb4c84db7995d37d9ee9428cb82dd Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 7 Oct 2025 11:08:38 -0400 Subject: [PATCH 052/113] fixes --- pyproject.toml | 20 +--- src/api/upload_utils.py | 47 -------- uv.lock | 240 +++++++++++++--------------------------- 3 files changed, 83 insertions(+), 224 deletions(-) delete mode 100644 src/api/upload_utils.py diff --git a/pyproject.toml b/pyproject.toml index bc8cb811..cbdd7be4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "pyjwt>=2.8.0", "python-multipart>=0.0.20", "starlette>=0.47.1", + "torch>=2.7.1", "uvicorn>=0.35.0", "boto3>=1.35.0", "psutil>=7.0.0", @@ -30,10 +31,6 @@ dependencies = [ "docling-serve>=1.4.1", ] -[project.optional-dependencies] -torch = ["torch", "torchvision"] -torch-cu128 = ["torch", "torchvision"] - [dependency-groups] dev = ["pytest>=8", "pytest-asyncio>=0.21.0", "pytest-mock>=3.12.0", "pytest-cov>=4.0.0"] @@ -46,20 +43,13 @@ package = true [tool.uv.sources] torch = [ - { index = "pytorch-cu128", extra = "torch-cu128", marker = "sys_platform == 'linux'" }, - { index = "pytorch-cpu", extra = "torch", marker = "sys_platform != 'linux'" } + { index = "pytorch-cu128", marker = "sys_platform == 'linux' and platform_machine == 'x86_64'" }, ] torchvision = [ - { index = "pytorch-cu128", extra = "torch-cu128", marker = "sys_platform == 'linux'" }, - { index = "pytorch-cpu", extra = "torch", marker = "sys_platform != 'linux'" } + { index = "pytorch-cu128", marker = "sys_platform == 'linux' and platform_machine == 'x86_64'" }, ] [[tool.uv.index]] -name = "pytorch-cu128" -url = "https://download.pytorch.org/whl/cu128" -explicit = true - -[[tool.uv.index]] -name = "pytorch-cpu" -url = "https://download.pytorch.org/whl/cpu" +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" explicit = true diff --git a/src/api/upload_utils.py b/src/api/upload_utils.py deleted file mode 100644 index f2479107..00000000 --- a/src/api/upload_utils.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import List - -from starlette.requests import Request - - -async def extract_user_context(request: Request) -> dict: - """Extract user/auth context from request.state. 
Honors no-auth mode.""" - from config.settings import is_no_auth_mode - - user = getattr(request.state, "user", None) - jwt_token = getattr(request.state, "jwt_token", None) - - if is_no_auth_mode(): - return { - "owner_user_id": None, - "owner_name": None, - "owner_email": None, - "jwt_token": None, - } - - return { - "owner_user_id": getattr(user, "user_id", None), - "owner_name": getattr(user, "name", None), - "owner_email": getattr(user, "email", None), - "jwt_token": jwt_token, - } - - -async def create_temp_files_from_form_files(upload_files: List) -> list[str]: - """Persist UploadFile items to temp files; return list of paths.""" - import tempfile - import os - - temp_file_paths: list[str] = [] - for upload_file in upload_files: - content = await upload_file.read() - safe_filename = ( - upload_file.filename.replace(" ", "_").replace("/", "_") - if getattr(upload_file, "filename", None) - else "uploaded" - ) - fd, temp_path = tempfile.mkstemp(suffix=f"_{safe_filename}") - with os.fdopen(fd, "wb") as temp_file: - temp_file.write(content) - temp_file_paths.append(temp_path) - return temp_file_paths - diff --git a/uv.lock b/uv.lock index fd5164cb..8b795659 100644 --- a/uv.lock +++ b/uv.lock @@ -2,11 +2,10 @@ version = 1 revision = 2 requires-python = ">=3.13" resolution-markers = [ - "sys_platform == 'darwin'", - "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "sys_platform != 'darwin' and sys_platform != 'linux'", - "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", + "sys_platform == 'darwin'", ] [[package]] @@ -21,9 +20,8 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b1/72/ff3961c19ee395c3d30ac630ee77bfb0e1b46b87edc504d4f83bb4a89705/accelerate-1.10.1.tar.gz", hash = "sha256:3dea89e433420e4bfac0369cae7e36dcd6a56adfcfd38cdda145c6225eab5df8", size = 392446, upload-time = "2025-08-25T13:57:06.21Z" } wheels = [ @@ -293,10 +291,9 @@ name = "click" version = "8.2.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "sys_platform != 'darwin' and sys_platform != 'linux'", - "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 
'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -518,10 +515,9 @@ name = "dill" version = "0.4.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "sys_platform != 'darwin' and sys_platform != 'linux'", - "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] sdist = { url = "https://files.pythonhosted.org/packages/12/80/630b4b88364e9a8c8c5797f4602d0f76ef820909ee32f0bacb9f90654042/dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", size = 186976, upload-time = "2025-04-16T00:41:48.867Z" } wheels = [ @@ -635,13 +631,10 @@ dependencies = [ { name = "pydantic" }, { name = "rtree" }, { name = "safetensors", extra = ["torch"] }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "torchvision", version = "0.22.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.22.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "torchvision", version = "0.22.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, { name = "tqdm" }, { name = "transformers" }, ] @@ -687,10 +680,9 @@ name = "docling-mcp" version = "1.1.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "sys_platform != 'darwin' and sys_platform != 'linux'", - "platform_machine != 'aarch64' and platform_machine != 
'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ { name = "docling", marker = "sys_platform != 'darwin'" }, @@ -795,13 +787,10 @@ dependencies = [ { name = "scikit-image" }, { name = "scipy" }, { name = "shapely" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "torchvision", version = "0.22.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.22.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "torchvision", version = "0.22.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/bb/84/4a2cab0e6adde6a85e7ba543862e5fc0250c51f3ac721a078a55cdcff250/easyocr-1.7.2-py3-none-any.whl", hash = "sha256:5be12f9b0e595d443c9c3d10b0542074b50f0ec2d98b141a109cd961fd1c177c", size = 2870178, upload-time = "2024-09-24T11:34:43.554Z" }, @@ -1015,10 +1004,9 @@ name = "fsspec" version = "2025.5.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "sys_platform != 'darwin' and sys_platform != 'linux'", - "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] sdist = { url = "https://files.pythonhosted.org/packages/00/f7/27f15d41f0ed38e8fcc488584b57e902b331da7f7c6dcda53721b15838fc/fsspec-2025.5.1.tar.gz", hash = "sha256:2e55e47a540b91843b755e83ded97c6e897fa0942b11490113f09e9c443c2475", size = 303033, upload-time = "2025-05-24T12:03:23.792Z" } wheels = [ @@ -1337,10 +1325,9 @@ name = 
"huggingface-hub" version = "0.33.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "sys_platform != 'darwin' and sys_platform != 'linux'", - "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ { name = "filelock", marker = "sys_platform != 'darwin'" }, @@ -2043,10 +2030,9 @@ name = "multiprocess" version = "0.70.18" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "sys_platform != 'darwin' and sys_platform != 'linux'", - "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ { name = "dill", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'darwin'" }, @@ -2391,29 +2377,11 @@ dependencies = [ { name = "structlog" }, { name = "textual" }, { name = "textual-fspicker" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, { name = "uvicorn" }, ] -[package.optional-dependencies] -torch = [ - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "torchvision", version = "0.22.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.22.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, -] -torch-cu128 = [ - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.8.0+cpu", source = { registry = 
"https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "torchvision", version = "0.22.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.22.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, -] - [package.dev-dependencies] dev = [ { name = "pytest" }, @@ -2446,17 +2414,10 @@ requires-dist = [ { name = "structlog", specifier = ">=25.4.0" }, { name = "textual", specifier = ">=0.45.0" }, { name = "textual-fspicker", specifier = ">=0.6.0" }, - { name = "torch", marker = "sys_platform == 'linux' and extra == 'torch'" }, - { name = "torch", marker = "sys_platform == 'linux' and extra == 'torch-cu128'", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "openrag", extra = "torch-cu128" } }, - { name = "torch", marker = "sys_platform != 'linux' and extra == 'torch'", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "openrag", extra = "torch" } }, - { name = "torch", marker = "sys_platform != 'linux' and extra == 'torch-cu128'" }, - { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'torch'" }, - { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'torch-cu128'", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "openrag", extra = "torch-cu128" } }, - { name = "torchvision", marker = "sys_platform != 'linux' and extra == 'torch'", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "openrag", extra = "torch" } }, - { name = "torchvision", marker = "sys_platform != 'linux' and extra == 'torch-cu128'" }, + { name = "torch", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'", specifier = ">=2.7.1" }, + { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'", specifier = ">=2.7.1", index = "https://download.pytorch.org/whl/cu128" }, { name = "uvicorn", specifier = ">=0.35.0" }, ] -provides-extras = ["torch", "torch-cu128"] [package.metadata.requires-dev] dev = [ @@ -3440,9 +3401,8 @@ wheels = [ [package.optional-dependencies] torch = [ { name = "numpy" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, ] [[package]] @@ -3766,15 +3726,13 @@ name = "torch" version = "2.7.1+cu128" 
source = { registry = "https://download.pytorch.org/whl/cu128" } resolution-markers = [ - "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", - "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] dependencies = [ - { name = "filelock", marker = "sys_platform == 'linux'" }, - { name = "fsspec", version = "2025.5.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, - { name = "jinja2", marker = "sys_platform == 'linux'" }, - { name = "networkx", marker = "sys_platform == 'linux'" }, + { name = "filelock", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "fsspec", version = "2025.5.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "jinja2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "networkx", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -3789,128 +3747,86 @@ dependencies = [ { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setuptools", marker = "sys_platform == 'linux'" }, - { name = "sympy", marker = "sys_platform == 'linux'" }, - { name = "triton", marker = "sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "sys_platform == 'linux'" }, + { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "sympy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:d56d29a6ad7758ba5173cc2b0c51c93e126e2b0a918e874101dc66545283967f" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9560425f9ea1af1791507e8ca70d5b9ecf62fed7ca226a95fcd58d0eb2cca78f" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:f112465fdf42eb1297c6dddda1a8b7f411914428b704e1b8a47870c52e290909" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c355db49c218ada70321d5c5c9bb3077312738b99113c8f3723ef596b554a7b9" }, ] [[package]] name = "torch" version = "2.8.0" -source = { registry = "https://download.pytorch.org/whl/cpu" } +source = { registry = "https://pypi.org/simple" } resolution-markers = [ + "platform_machine == 'aarch64' and sys_platform == 'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and 
sys_platform != 'linux')", "sys_platform == 'darwin'", ] dependencies = [ - { name = "filelock", marker = "sys_platform == 'darwin'" }, + { name = "filelock", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "jinja2", marker = "sys_platform == 'darwin'" }, - { name = "networkx", marker = "sys_platform == 'darwin'" }, - { name = "setuptools", marker = "sys_platform == 'darwin'" }, - { name = "sympy", marker = "sys_platform == 'darwin'" }, - { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, + { name = "fsspec", version = "2025.5.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "jinja2", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "networkx", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "setuptools", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "sympy", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "typing-extensions", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:fbe2e149c5174ef90d29a5f84a554dfaf28e003cb4f61fa2c8c024c17ec7ca58" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:057efd30a6778d2ee5e2374cd63a63f63311aa6f33321e627c655df60abdd390" }, -] - -[[package]] -name = "torch" -version = "2.8.0+cpu" -source = { registry = "https://download.pytorch.org/whl/cpu" } -resolution-markers = [ - "sys_platform != 'darwin' and sys_platform != 'linux'", -] -dependencies = [ - { name = "filelock", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "fsspec", version = "2025.5.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "jinja2", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "networkx", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "setuptools", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "sympy", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "typing-extensions", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, -] -wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:e1ee1b2346ade3ea90306dfbec7e8ff17bc220d344109d189ae09078333b0856" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313-win_arm64.whl", hash = "sha256:64c187345509f2b1bb334feed4666e2c781ca381874bde589182f81247e61f88" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:6d93a7165419bc4b2b907e859ccab0dea5deeab261448ae9a5ec5431f14c0e64" }, + { url = "https://files.pythonhosted.org/packages/10/4e/469ced5a0603245d6a19a556e9053300033f9c5baccf43a3d25ba73e189e/torch-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2b2f96814e0345f5a5aed9bf9734efa913678ed19caf6dc2cddb7930672d6128", size = 101936856, upload-time = "2025-08-06T14:54:01.526Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/82/3948e54c01b2109238357c6f86242e6ecbf0c63a1af46906772902f82057/torch-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:65616ca8ec6f43245e1f5f296603e33923f4c30f93d65e103d9e50c25b35150b", size = 887922844, upload-time = "2025-08-06T14:55:50.78Z" }, + { url = "https://files.pythonhosted.org/packages/e3/54/941ea0a860f2717d86a811adf0c2cd01b3983bdd460d0803053c4e0b8649/torch-2.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:659df54119ae03e83a800addc125856effda88b016dfc54d9f65215c3975be16", size = 241330968, upload-time = "2025-08-06T14:54:45.293Z" }, + { url = "https://files.pythonhosted.org/packages/de/69/8b7b13bba430f5e21d77708b616f767683629fc4f8037564a177d20f90ed/torch-2.8.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:1a62a1ec4b0498930e2543535cf70b1bef8c777713de7ceb84cd79115f553767", size = 73915128, upload-time = "2025-08-06T14:54:34.769Z" }, + { url = "https://files.pythonhosted.org/packages/15/0e/8a800e093b7f7430dbaefa80075aee9158ec22e4c4fc3c1a66e4fb96cb4f/torch-2.8.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:83c13411a26fac3d101fe8035a6b0476ae606deb8688e904e796a3534c197def", size = 102020139, upload-time = "2025-08-06T14:54:39.047Z" }, + { url = "https://files.pythonhosted.org/packages/4a/15/5e488ca0bc6162c86a33b58642bc577c84ded17c7b72d97e49b5833e2d73/torch-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8f0a9d617a66509ded240add3754e462430a6c1fc5589f86c17b433dd808f97a", size = 887990692, upload-time = "2025-08-06T14:56:18.286Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a8/6a04e4b54472fc5dba7ca2341ab219e529f3c07b6941059fbf18dccac31f/torch-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a7242b86f42be98ac674b88a4988643b9bc6145437ec8f048fea23f72feb5eca", size = 241603453, upload-time = "2025-08-06T14:55:22.945Z" }, + { url = "https://files.pythonhosted.org/packages/04/6e/650bb7f28f771af0cb791b02348db8b7f5f64f40f6829ee82aa6ce99aabe/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7b677e17f5a3e69fdef7eb3b9da72622f8d322692930297e4ccb52fefc6c8211", size = 73632395, upload-time = "2025-08-06T14:55:28.645Z" }, ] [[package]] name = "torchvision" version = "0.22.1" -source = { registry = "https://download.pytorch.org/whl/cu128" } -resolution-markers = [ - "platform_machine == 'aarch64' and sys_platform == 'linux'", -] -dependencies = [ - { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, -] -wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:75f519ebe412ced95d727c71c30c68084cc6fd36347b88f338e88ff9d07a3ac8" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:f6565fd22e04e51f9600f34a3a20b120ee9f5a73161bfcb79c826225054aa44e" }, -] - -[[package]] -name = "torchvision" -version = "0.22.1+cu128" -source = { registry = "https://download.pytorch.org/whl/cu128" } +source = { registry = "https://pypi.org/simple" } resolution-markers = [ "platform_machine == 'x86_64' and sys_platform == 'linux'", - "platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] dependencies = [ - { name = 
"numpy", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "pillow", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.7.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:bc4fef193917b51db6b409acd3ffdec9286d877baac0aee5dcfbb72592d00bfc" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:02faf51fbf5070592768fa935327d13a484b745faef38b0fee01d85cfb35f5bc" }, + { url = "https://files.pythonhosted.org/packages/8d/b0/3cffd6a285b5ffee3fe4a31caff49e350c98c5963854474d1c4f7a51dea5/torchvision-0.22.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:7ee682be589bb1a002b7704f06b8ec0b89e4b9068f48e79307d2c6e937a9fdf4", size = 7485894, upload-time = "2025-06-04T17:43:01.371Z" }, + { url = "https://files.pythonhosted.org/packages/94/8b/04c6b15f8c29b39f0679589753091cec8b192ab296d4fdaf9055544c4ec9/torchvision-0.22.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:ef46e065502f7300ad6abc98554131c35dc4c837b978d91306658f1a65c00baa", size = 7658543, upload-time = "2025-06-04T17:42:46.064Z" }, ] [[package]] name = "torchvision" version = "0.23.0" -source = { registry = "https://download.pytorch.org/whl/cpu" } +source = { registry = "https://pypi.org/simple" } resolution-markers = [ + "platform_machine == 'aarch64' and sys_platform == 'linux'", + "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", "sys_platform == 'darwin'", ] dependencies = [ - { name = "numpy", marker = "sys_platform == 'darwin'" }, - { name = "pillow", marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.8.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "numpy", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "pillow", marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9" }, -] - -[[package]] -name = "torchvision" -version = "0.23.0+cpu" -source = { registry = "https://download.pytorch.org/whl/cpu" } -resolution-markers = [ - "sys_platform != 'darwin' and sys_platform != 'linux'", -] -dependencies = [ - { name = "numpy", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = 
"pillow", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, -] -wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:82928788025170c62e7df1120dcdc0cd175bfc31c08374613ce6d1a040bc0cda" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:8d6a47e23d7896f0ef9aa7ea7179eb6324e82438aa66d19884c2020d0646b104" }, + { url = "https://files.pythonhosted.org/packages/91/37/45a5b9407a7900f71d61b2b2f62db4b7c632debca397f205fdcacb502780/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600", size = 1856886, upload-time = "2025-08-06T14:58:05.491Z" }, + { url = "https://files.pythonhosted.org/packages/ac/da/a06c60fc84fc849377cf035d3b3e9a1c896d52dbad493b963c0f1cdd74d0/torchvision-0.23.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2f7fd6c15f3697e80627b77934f77705f3bc0e98278b989b2655de01f6903e1d", size = 2353112, upload-time = "2025-08-06T14:58:26.265Z" }, + { url = "https://files.pythonhosted.org/packages/a0/27/5ce65ba5c9d3b7d2ccdd79892ab86a2f87ac2ca6638f04bb0280321f1a9c/torchvision-0.23.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:a76fafe113b2977be3a21bf78f115438c1f88631d7a87203acb3dd6ae55889e6", size = 8627658, upload-time = "2025-08-06T14:58:15.999Z" }, + { url = "https://files.pythonhosted.org/packages/1f/e4/028a27b60aa578a2fa99d9d7334ff1871bb17008693ea055a2fdee96da0d/torchvision-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:07d069cb29691ff566e3b7f11f20d91044f079e1dbdc9d72e0655899a9b06938", size = 1600749, upload-time = "2025-08-06T14:58:10.719Z" }, + { url = "https://files.pythonhosted.org/packages/05/35/72f91ad9ac7c19a849dedf083d347dc1123f0adeb401f53974f84f1d04c8/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9", size = 2047192, upload-time = "2025-08-06T14:58:11.813Z" }, + { url = "https://files.pythonhosted.org/packages/1d/9d/406cea60a9eb9882145bcd62a184ee61e823e8e1d550cdc3c3ea866a9445/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2a3299d2b1d5a7aed2d3b6ffb69c672ca8830671967eb1cee1497bacd82fe47b", size = 2359295, upload-time = "2025-08-06T14:58:17.469Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f4/34662f71a70fa1e59de99772142f22257ca750de05ccb400b8d2e3809c1d/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:76bc4c0b63d5114aa81281390f8472a12a6a35ce9906e67ea6044e5af4cab60c", size = 8800474, upload-time = "2025-08-06T14:58:22.53Z" }, + { url = "https://files.pythonhosted.org/packages/6e/f5/b5a2d841a8d228b5dbda6d524704408e19e7ca6b7bb0f24490e081da1fa1/torchvision-0.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b9e2dabf0da9c8aa9ea241afb63a8f3e98489e706b22ac3f30416a1be377153b", size = 1527667, upload-time = "2025-08-06T14:58:14.446Z" }, ] [[package]] @@ -3952,7 +3868,7 @@ name = "triton" version = "3.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "setuptools", marker = "sys_platform == 'linux'" }, + { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/74/1f/dfb531f90a2d367d914adfee771babbd3f1a5b26c3f5fbc458dee21daa78/triton-3.3.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b89d846b5a4198317fec27a5d3a609ea96b6d557ff44b56c23176546023c4240", size = 155673035, upload-time = "2025-05-29T23:40:02.468Z" }, From 16faac6ac0b91fd6ddee3f0ef6c066eea9acc3f0 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Tue, 7 Oct 2025 11:27:41 -0400 Subject: [PATCH 053/113] remove-ui-sync --- docs/docs/core-components/knowledge.mdx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx index d2a74ca4..9f604431 100644 --- a/docs/docs/core-components/knowledge.mdx +++ b/docs/docs/core-components/knowledge.mdx @@ -78,18 +78,6 @@ You can select multiples. The ingestion process may take some time, depending on the size of your documents. 4. When ingestion is complete, your documents are available in the Knowledge screen. -### Sync cloud connectors - -Your connected data sources are found in the
-  Quick Start   |  
+  Quickstart   |  
   TUI Interface   |  
   Docker Deployment   |  
   Development   |  
   Troubleshooting
-## Quick Start +## Quickstart + ### Prerequisites - Docker or Podman with Compose installed - Make (for development commands) -### 1. Environment Setup +### Install and start OpenRAG + +1. Set up development environment. ```bash # Clone and setup environment @@ -40,9 +43,7 @@ cd openrag make setup # Creates .env and installs dependencies ``` -### 2. Configure Environment - -Edit `.env` with your API keys and credentials: +2. Configure the `.env` file with your API keys and credentials. ```bash # Required @@ -54,9 +55,10 @@ LANGFLOW_CHAT_FLOW_ID=your_chat_flow_id LANGFLOW_INGEST_FLOW_ID=your_ingest_flow_id NUDGES_FLOW_ID=your_nudges_flow_id ``` -See extended configuration, including ingestion and optional variables: [docs/reference/configuration.mdx](docs/docs/reference/configuration.mdx) -### 3. Start OpenRAG +For extended configuration, including ingestion and optional variables, see [docs/reference/configuration.mdx](docs/docs/reference/configuration.mdx) + +3. Start OpenRAG. ```bash # Full stack with GPU support @@ -73,7 +75,7 @@ Access the services: - **OpenSearch**: http://localhost:9200 - **OpenSearch Dashboards**: http://localhost:5601 -With OpenRAG started, ingest and retrieve documents with the [OpenRAG Quickstart](/docs/get-started/quickstart.mdx). +With OpenRAG started, ingest and retrieve documents with the [OpenRAG Quickstart](docs/docs/get-started/quickstart.mdx). ## TUI interface @@ -93,14 +95,14 @@ uv sync uv run openrag ``` -For the full TUI guide, see [docs/get-started/tui.mdx](docs/docs/get-started/tui.mdx) +For the full TUI guide, see [TUI](docs/docs/get-started/tui.mdx). ## Docker Deployment -The repository includes two Docker Compose files. +The repository includes two Docker Compose `.yml` files. They deploy the same applications and containers, but to different environments. -- [`docker-compose.yml`](https://github.com/langflow-ai/openrag/blob/main/docker-compose.yml) is an OpenRAG deployment with GPU support for accelerated AI processing. +- [`docker-compose.yml`](https://github.com/langflow-ai/openrag/blob/main/docker-compose.yml) is an OpenRAG deployment for environments with GPU support. GPU support requires an NVIDIA GPU with CUDA support and compatible NVIDIA drivers installed on the OpenRAG host machine. - [`docker-compose-cpu.yml`](https://github.com/langflow-ai/openrag/blob/main/docker-compose-cpu.yml) is a CPU-only version of OpenRAG for systems without GPU support. Use this Docker compose file for environments where GPU drivers aren't available. @@ -123,11 +125,11 @@ For environments without GPU support, run: docker compose -f docker-compose-cpu.yml up -d ``` -For more information, see [docs/get-started/docker.mdx](docs/docs/get-started/docker.mdx) +For more information, see [Deploy with Docker](docs/docs/get-started/docker.mdx). ## Troubleshooting -For common issues and fixes, see [docs/support/troubleshoot.mdx](docs/docs/support/troubleshoot.mdx). +For common issues and fixes, see [Troubleshoot](docs/docs/support/troubleshoot.mdx). 
## Development From 4720dc6d4affb7ebc287f89401eb694f2263ee30 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Wed, 8 Oct 2025 11:59:36 -0400 Subject: [PATCH 102/113] tui-quickstart --- CONTRIBUTING.md | 42 ++++++++++++++++++++++----- README.md | 77 ++++++------------------------------------------- 2 files changed, 44 insertions(+), 75 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 19b01709..6b8cd832 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,20 +11,48 @@ Thank you for your interest in contributing to OpenRAG! This guide will help you - Python 3.13+ with uv package manager - Node.js 18+ and npm -### Environment Setup +### Set up OpenRAG for development + +1. Set up your development environment. ```bash -# Clone the repository -git clone +# Clone and setup environment +git clone https://github.com/langflow-ai/openrag.git cd openrag - -# Setup development environment make setup # Creates .env and installs dependencies ``` -### Configuration +2. Configure the `.env` file with your API keys and credentials. -Edit `.env` with your API keys and credentials. See the main README for required environment variables. +```bash +# Required +OPENAI_API_KEY=your_openai_api_key +OPENSEARCH_PASSWORD=your_secure_password +LANGFLOW_SUPERUSER=admin +LANGFLOW_SUPERUSER_PASSWORD=your_secure_password +LANGFLOW_CHAT_FLOW_ID=your_chat_flow_id +LANGFLOW_INGEST_FLOW_ID=your_ingest_flow_id +NUDGES_FLOW_ID=your_nudges_flow_id +``` + +For extended configuration, including ingestion and optional variables, see [docs/reference/configuration.mdx](docs/docs/reference/configuration.mdx). + +3. Start OpenRAG. + +```bash +# Full stack with GPU support +make dev + +# Or CPU only +make dev-cpu +``` + +Access the services: +- **Frontend**: http://localhost:3000 +- **Backend API**: http://localhost:8000 +- **Langflow**: http://localhost:7860 +- **OpenSearch**: http://localhost:9200 +- **OpenSearch Dashboards**: http://localhost:5601 ## ๐Ÿ”ง Development Commands diff --git a/README.md b/README.md index 3d15b53c..4c06e4a9 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ Ask DeepWiki -OpenRAG is a comprehensive Retrieval-Augmented Generation platform that enables intelligent document search and AI-powered conversations. Users can upload, process, and query documents through a chat interface backed by large language models and semantic search capabilities. The system utilizes Langflow for document ingestion, retrieval workflows, and intelligent nudges, providing a seamless RAG experience. Built with Starlette, Next.js, OpenSearch, and Langflow integration. +OpenRAG is a comprehensive Retrieval-Augmented Generation platform that enables intelligent document search and AI-powered conversations. Users can upload, process, and query documents through a chat interface backed by large language models and semantic search capabilities. The system utilizes Langflow for document ingestion, retrieval workflows, and intelligent nudges, providing a seamless RAG experience. Built with Starlette, Next.js, OpenSearch, and Langflow integration. +
Quickstart   |   @@ -26,67 +27,17 @@ OpenRAG is a comprehensive Retrieval-Augmented Generation platform that enables ## Quickstart +To get started quickly, use the OpenRAG Terminal User Interface (TUI) to manage your OpenRAG installation without complex command-line operations. -### Prerequisites - -- Docker or Podman with Compose installed -- Make (for development commands) - -### Install and start OpenRAG - -1. Set up development environment. +To launch OpenRAG with the TUI, do the following: +1. Clone the OpenRAG repository. ```bash -# Clone and setup environment git clone https://github.com/langflow-ai/openrag.git cd openrag -make setup # Creates .env and installs dependencies ``` -2. Configure the `.env` file with your API keys and credentials. - -```bash -# Required -OPENAI_API_KEY=your_openai_api_key -OPENSEARCH_PASSWORD=your_secure_password -LANGFLOW_SUPERUSER=admin -LANGFLOW_SUPERUSER_PASSWORD=your_secure_password -LANGFLOW_CHAT_FLOW_ID=your_chat_flow_id -LANGFLOW_INGEST_FLOW_ID=your_ingest_flow_id -NUDGES_FLOW_ID=your_nudges_flow_id -``` - -For extended configuration, including ingestion and optional variables, see [docs/reference/configuration.mdx](docs/docs/reference/configuration.mdx) - -3. Start OpenRAG. - -```bash -# Full stack with GPU support -make dev - -# Or CPU only -make dev-cpu -``` - -Access the services: -- **Frontend**: http://localhost:3000 -- **Backend API**: http://localhost:8000 -- **Langflow**: http://localhost:7860 -- **OpenSearch**: http://localhost:9200 -- **OpenSearch Dashboards**: http://localhost:5601 - -With OpenRAG started, ingest and retrieve documents with the [OpenRAG Quickstart](docs/docs/get-started/quickstart.mdx). - -## TUI interface - -OpenRAG includes a powerful Terminal User Interface (TUI) for easy setup, configuration, and monitoring. The TUI provides a user-friendly way to manage your OpenRAG installation without complex command-line operations. - -![OpenRAG TUI Interface](assets/OpenRAG_TUI_2025-09-10T13_04_11_757637.svg) - -### Launch OpenRAG with the TUI - -From the repository root, run: - +2. To start the TUI, from the repository root, run: ```bash # Install dependencies first uv sync @@ -95,6 +46,8 @@ uv sync uv run openrag ``` +The TUI opens and guides you through OpenRAG setup. + For the full TUI guide, see [TUI](docs/docs/get-started/tui.mdx). ## Docker Deployment @@ -133,16 +86,4 @@ For common issues and fixes, see [Troubleshoot](docs/docs/support/troubleshoot.m ## Development -For developers wanting to contribute to OpenRAG or set up a development environment, please see our comprehensive development guide: - -**[๐Ÿ“š See CONTRIBUTING.md for detailed development instructions](CONTRIBUTING.md)** - -### Quick Development Commands - -```bash -make help # See all available commands -make setup # Initial development setup -make infra # Start infrastructure services -make backend # Run backend locally -make frontend # Run frontend locally -``` \ No newline at end of file +For developers wanting to contribute to OpenRAG or set up a development environment, see [CONTRIBUTING.md](CONTRIBUTING.md). 
\ No newline at end of file From 19c86c8b72e4ef0701e1f6c8b9114dd65148bf30 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:02:46 -0400 Subject: [PATCH 103/113] style --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 4c06e4a9..b4beef2a 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ For the full TUI guide, see [TUI](docs/docs/get-started/tui.mdx). ## Docker Deployment -The repository includes two Docker Compose `.yml` files. +If you prefer to use Docker to run OpenRAG, the repository includes two Docker Compose `.yml` files. They deploy the same applications and containers, but to different environments. - [`docker-compose.yml`](https://github.com/langflow-ai/openrag/blob/main/docker-compose.yml) is an OpenRAG deployment for environments with GPU support. GPU support requires an NVIDIA GPU with CUDA support and compatible NVIDIA drivers installed on the OpenRAG host machine. @@ -67,16 +67,16 @@ cd openrag 2. Build and start all services. -For the GPU-accelerated deployment, run: -```bash -docker compose build -docker compose up -d -``` + For the GPU-accelerated deployment, run: + ```bash + docker compose build + docker compose up -d + ``` -For environments without GPU support, run: -```bash -docker compose -f docker-compose-cpu.yml up -d -``` + For environments without GPU support, run: + ```bash + docker compose -f docker-compose-cpu.yml up -d + ``` For more information, see [Deploy with Docker](docs/docs/get-started/docker.mdx). From 1b45813dfdbb16bf2dab0fb71a17396c77b32848 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:03:56 -0400 Subject: [PATCH 104/113] restating-headline --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b4beef2a..b4838cd8 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ OpenRAG is a comprehensive Retrieval-Augmented Generation platform that enables ## Quickstart -To get started quickly, use the OpenRAG Terminal User Interface (TUI) to manage your OpenRAG installation without complex command-line operations. +Use the OpenRAG Terminal User Interface (TUI) to manage your OpenRAG installation without complex command-line operations. To launch OpenRAG with the TUI, do the following: From e88601c05aa72f58d223fbea4b7dbf21788eff55 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:05:54 -0400 Subject: [PATCH 105/113] spacing --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b4838cd8..bfccdb6a 100644 --- a/README.md +++ b/README.md @@ -32,21 +32,21 @@ Use the OpenRAG Terminal User Interface (TUI) to manage your OpenRAG installatio To launch OpenRAG with the TUI, do the following: 1. Clone the OpenRAG repository. -```bash -git clone https://github.com/langflow-ai/openrag.git -cd openrag -``` + ```bash + git clone https://github.com/langflow-ai/openrag.git + cd openrag + ``` 2. To start the TUI, from the repository root, run: -```bash -# Install dependencies first -uv sync + ```bash + # Install dependencies first + uv sync + + # Launch the TUI + uv run openrag + ``` -# Launch the TUI -uv run openrag -``` - -The TUI opens and guides you through OpenRAG setup. + The TUI opens and guides you through OpenRAG setup. 
For the full TUI guide, see [TUI](docs/docs/get-started/tui.mdx). @@ -60,10 +60,10 @@ They deploy the same applications and containers, but to different environments. - [`docker-compose-cpu.yml`](https://github.com/langflow-ai/openrag/blob/main/docker-compose-cpu.yml) is a CPU-only version of OpenRAG for systems without GPU support. Use this Docker compose file for environments where GPU drivers aren't available. 1. Clone the OpenRAG repository. -```bash -git clone https://github.com/langflow-ai/openrag.git -cd openrag -``` + ```bash + git clone https://github.com/langflow-ai/openrag.git + cd openrag + ``` 2. Build and start all services. From 0dcfff15b309d22c8bb587def11b6fc5b1da9df8 Mon Sep 17 00:00:00 2001 From: phact Date: Wed, 8 Oct 2025 13:25:55 -0400 Subject: [PATCH 106/113] podman: flow directory security --- docker-compose-cpu.yml | 4 ++-- docker-compose.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose-cpu.yml b/docker-compose-cpu.yml index 40507c94..c0af8a01 100644 --- a/docker-compose-cpu.yml +++ b/docker-compose-cpu.yml @@ -74,7 +74,7 @@ services: volumes: - ./documents:/app/documents:Z - ./keys:/app/keys:Z - - ./flows:/app/flows:Z + - ./flows:/app/flows:U,z openrag-frontend: image: phact/openrag-frontend:${OPENRAG_VERSION:-latest} @@ -91,7 +91,7 @@ services: langflow: volumes: - - ./flows:/app/flows:Z + - ./flows:/app/flows:U,z image: phact/openrag-langflow:${LANGFLOW_VERSION:-latest} # build: # context: . diff --git a/docker-compose.yml b/docker-compose.yml index 4a68d210..df8a3228 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -73,7 +73,7 @@ services: volumes: - ./documents:/app/documents:Z - ./keys:/app/keys:Z - - ./flows:/app/flows:z + - ./flows:/app/flows:U,z gpus: all openrag-frontend: @@ -91,7 +91,7 @@ services: langflow: volumes: - - ./flows:/app/flows:z + - ./flows:/app/flows:U,z image: phact/openrag-langflow:${LANGFLOW_VERSION:-latest} # build: # context: . From c5b88b0201fbf65cd93072720627230353ef4a28 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:30:17 -0400 Subject: [PATCH 107/113] docling-requirement --- README.md | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index bfccdb6a..7d643727 100644 --- a/README.md +++ b/README.md @@ -59,13 +59,42 @@ They deploy the same applications and containers, but to different environments. - [`docker-compose-cpu.yml`](https://github.com/langflow-ai/openrag/blob/main/docker-compose-cpu.yml) is a CPU-only version of OpenRAG for systems without GPU support. Use this Docker compose file for environments where GPU drivers aren't available. +Both Docker deployments depend on `docling serve` to be running on port `5001` on the host machine. This is required to take advantage of[Mac MLX](https://opensource.apple.com/projects/mlx/) support for document processing. Installing OpenRAG with the TUI starts `docling serve` automatically, but for a Docker deployment you must manually start the `docling serve` process. + +Alternatively, set `DISABLE_INGEST_WITH_LANGFLOW=true` in your `.env` to use OpenRAG's built-in pipeline, which uses docling directly without requiring `docling serve`. + +To deploy OpenRAG with Docker: + 1. Clone the OpenRAG repository. ```bash git clone https://github.com/langflow-ai/openrag.git cd openrag ``` -2. Build and start all services. +2. Install dependencies. + ```bash + uv sync + ``` + +3. 
Start `docling serve` on the host machine. + ```bash + uv run python scripts/docling_ctl.py start --port 5001 + ``` + +4. Confirm `docling serve` is running. + ``` + uv run python scripts/docling_ctl.py status + ``` + + Successful result: + ```bash + Status: running + Endpoint: http://127.0.0.1:5001 + Docs: http://127.0.0.1:5001/docs + PID: 27746 + ``` + +5. Build and start all services. For the GPU-accelerated deployment, run: ```bash @@ -78,6 +107,23 @@ They deploy the same applications and containers, but to different environments. docker compose -f docker-compose-cpu.yml up -d ``` + The OpenRAG Docker Compose file starts five containers: + | Container Name | Default Address | Purpose | + |---|---|---| + | OpenRAG Backend | http://localhost:8000 | FastAPI server and core functionality. | + | OpenRAG Frontend | http://localhost:3000 | React web interface for users. | + | Langflow | http://localhost:7860 | AI workflow engine and flow management. | + | OpenSearch | http://localhost:9200 | Vector database for document storage. | + | OpenSearch Dashboards | http://localhost:5601 | Database administration interface. | + + You can now access the OpenRAG application at `http://localhost:3000`. + + To stop `docling serve`, run: + + ```bash + uv run python scripts/docling_ctl.py stop + ``` + For more information, see [Deploy with Docker](docs/docs/get-started/docker.mdx). ## Troubleshooting From a2608d0281dc503fdae93ec6b27f622836639dbb Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:34:01 -0400 Subject: [PATCH 108/113] style --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7d643727..a237b7ed 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ They deploy the same applications and containers, but to different environments. - [`docker-compose-cpu.yml`](https://github.com/langflow-ai/openrag/blob/main/docker-compose-cpu.yml) is a CPU-only version of OpenRAG for systems without GPU support. Use this Docker compose file for environments where GPU drivers aren't available. -Both Docker deployments depend on `docling serve` to be running on port `5001` on the host machine. This is required to take advantage of[Mac MLX](https://opensource.apple.com/projects/mlx/) support for document processing. Installing OpenRAG with the TUI starts `docling serve` automatically, but for a Docker deployment you must manually start the `docling serve` process. +Both Docker deployments depend on `docling serve` to be running on port `5001` on the host machine. This enables [Mac MLX](https://opensource.apple.com/projects/mlx/) support for document processing. Installing OpenRAG with the TUI starts `docling serve` automatically, but for a Docker deployment you must manually start the `docling serve` process. Alternatively, set `DISABLE_INGEST_WITH_LANGFLOW=true` in your `.env` to use OpenRAG's built-in pipeline, which uses docling directly without requiring `docling serve`. @@ -118,7 +118,7 @@ To deploy OpenRAG with Docker: You can now access the OpenRAG application at `http://localhost:3000`. 
- To stop `docling serve`, run: +To stop `docling serve`, run: ```bash uv run python scripts/docling_ctl.py stop From 6b8ff56e6fbe7726822b0512e086458855c44282 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:37:00 -0400 Subject: [PATCH 109/113] shorten-sentence --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a237b7ed..77ce3191 100644 --- a/README.md +++ b/README.md @@ -116,10 +116,10 @@ To deploy OpenRAG with Docker: | OpenSearch | http://localhost:9200 | Vector database for document storage. | | OpenSearch Dashboards | http://localhost:5601 | Database administration interface. | - You can now access the OpenRAG application at `http://localhost:3000`. - -To stop `docling serve`, run: +6. Access the OpenRAG application at `http://localhost:3000` and continue with the [Quickstart](docs/docs/get-started/quickstart.mdx). + To stop `docling serve`, run: + ```bash uv run python scripts/docling_ctl.py stop ``` From bd21001b6457ee9576a9d9eb4a6143d2e6c83590 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:45:45 -0400 Subject: [PATCH 110/113] remove-env-var-option --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 77ce3191..a7abbbe6 100644 --- a/README.md +++ b/README.md @@ -61,8 +61,6 @@ They deploy the same applications and containers, but to different environments. Both Docker deployments depend on `docling serve` to be running on port `5001` on the host machine. This enables [Mac MLX](https://opensource.apple.com/projects/mlx/) support for document processing. Installing OpenRAG with the TUI starts `docling serve` automatically, but for a Docker deployment you must manually start the `docling serve` process. -Alternatively, set `DISABLE_INGEST_WITH_LANGFLOW=true` in your `.env` to use OpenRAG's built-in pipeline, which uses docling directly without requiring `docling serve`. - To deploy OpenRAG with Docker: 1. Clone the OpenRAG repository. 
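Patches 107 through 110 make the Docker deployment depend on a host-side `docling serve` listening on port `5001`. A minimal pre-flight probe for that dependency could look like the sketch below; it assumes only the `/health` endpoint that the backend's health proxy (added in the next patch) polls, and the script itself is a hypothetical helper, not a file in the repository:

```python
# preflight_docling.py -- hypothetical helper, not part of the patch series.
# Probes the docling-serve /health endpoint, mirroring the same-origin
# health proxy in src/api/docling.py.
import sys

import httpx


def docling_is_healthy(host: str = "127.0.0.1", port: int = 5001) -> bool:
    """Return True if docling-serve answers GET /health with HTTP 200."""
    try:
        response = httpx.get(f"http://{host}:{port}/health", timeout=2.0)
        return response.status_code == 200
    except httpx.HTTPError:
        # Connection refused, DNS failures, and timeouts all land here.
        return False


if __name__ == "__main__":
    if docling_is_healthy():
        print("docling serve is healthy")
    else:
        sys.exit(
            "docling serve is not reachable on port 5001; start it with: "
            "uv run python scripts/docling_ctl.py start --port 5001"
        )
```

Running a check like this before `docker compose up -d` fails fast with a pointer to `scripts/docling_ctl.py`, instead of letting ingestion fail later inside the containers.
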
From a5aab8e92af25ba912a74206d0c60bf068f3df72 Mon Sep 17 00:00:00 2001 From: phact Date: Wed, 8 Oct 2025 13:56:52 -0400 Subject: [PATCH 111/113] podman fixes: bind docling-serve to 0.0, improve logging, support magic podman and docker hostnames --- .env.example | 3 +++ src/api/docling.py | 32 +++++++++++++++++++++-------- src/tui/managers/docling_manager.py | 23 +++++++++++++-------- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/.env.example b/.env.example index 8d412670..f790ce09 100644 --- a/.env.example +++ b/.env.example @@ -37,6 +37,9 @@ AWS_SECRET_ACCESS_KEY= # OPTIONAL url for openrag link to langflow in the UI LANGFLOW_PUBLIC_URL= +# OPTIONAL: Override host for docling service (for special networking setups) +# HOST_DOCKER_INTERNAL=host.containers.internal + # Langflow auth LANGFLOW_AUTO_LOGIN=False LANGFLOW_SUPERUSER= diff --git a/src/api/docling.py b/src/api/docling.py index 66b7777e..22b709ef 100644 --- a/src/api/docling.py +++ b/src/api/docling.py @@ -41,23 +41,36 @@ def determine_docling_host() -> str: """Determine the host address used for docling health checks.""" container_type = detect_container_environment() if container_type: + # Try HOST_DOCKER_INTERNAL env var first container_host = get_container_host() if container_host: logger.info("Using container-aware host '%s'", container_host) return container_host + # Try special hostnames (Docker Desktop and rootless podman) + import socket + for hostname in ["host.docker.internal", "host.containers.internal"]: + try: + socket.getaddrinfo(hostname, None) + logger.info("Using %s for container-to-host communication", hostname) + return hostname + except socket.gaierror: + logger.debug("%s not available", hostname) + + # Try gateway IP detection (Docker on Linux) gateway_ip = _get_gateway_ip_from_route() if gateway_ip: logger.info("Detected host gateway IP: %s", gateway_ip) return gateway_ip - # Either we're not inside a container or gateway detection failed. - fallback_ip = guess_host_ip_for_containers(logger=logger) - if container_type: + # Fallback to bridge IP + fallback_ip = guess_host_ip_for_containers(logger=logger) logger.info("Falling back to container bridge host %s", fallback_ip) - else: - logger.info("Running outside a container; using host %s", fallback_ip) - return fallback_ip + return fallback_ip + + # Running outside a container + logger.info("Running outside a container; using localhost") + return "localhost" # Detect the host IP once at startup @@ -70,10 +83,11 @@ async def health(request: Request) -> JSONResponse: Proxy health check to docling-serve. This allows the frontend to check docling status via same-origin request. 
""" + health_url = f"{DOCLING_SERVICE_URL}/health" try: async with httpx.AsyncClient() as client: response = await client.get( - f"{DOCLING_SERVICE_URL}/health", + health_url, timeout=2.0 ) @@ -83,6 +97,7 @@ async def health(request: Request) -> JSONResponse: "host": HOST_IP }) else: + logger.warning("Docling health check failed", url=health_url, status_code=response.status_code) return JSONResponse({ "status": "unhealthy", "message": f"Health check failed with status: {response.status_code}", @@ -90,13 +105,14 @@ async def health(request: Request) -> JSONResponse: }, status_code=503) except httpx.TimeoutException: + logger.warning("Docling health check timeout", url=health_url) return JSONResponse({ "status": "unhealthy", "message": "Connection timeout", "host": HOST_IP }, status_code=503) except Exception as e: - logger.error("Docling health check failed", error=str(e)) + logger.error("Docling health check failed", url=health_url, error=str(e)) return JSONResponse({ "status": "unhealthy", "message": str(e), diff --git a/src/tui/managers/docling_manager.py b/src/tui/managers/docling_manager.py index 7cb5d1e8..e58a5b1e 100644 --- a/src/tui/managers/docling_manager.py +++ b/src/tui/managers/docling_manager.py @@ -8,7 +8,6 @@ import threading import time from typing import Optional, Tuple, Dict, Any, List, AsyncIterator from utils.logging_config import get_logger -from utils.container_utils import guess_host_ip_for_containers logger = get_logger(__name__) @@ -32,7 +31,8 @@ class DoclingManager: self._process: Optional[subprocess.Popen] = None self._port = 5001 - self._host = guess_host_ip_for_containers(logger=logger) # Get appropriate host IP based on runtime + # Bind to all interfaces by default (can be overridden with DOCLING_BIND_HOST env var) + self._host = os.getenv('DOCLING_BIND_HOST', '0.0.0.0') self._running = False self._external_process = False @@ -150,16 +150,20 @@ class DoclingManager: else: pid = self._load_pid() + # Use localhost for display URLs when bound to 0.0.0.0 + display_host = "localhost" if self._host == "0.0.0.0" else self._host + return { "status": "running", "port": self._port, "host": self._host, - "endpoint": f"http://{self._host}:{self._port}", - "docs_url": f"http://{self._host}:{self._port}/docs", - "ui_url": f"http://{self._host}:{self._port}/ui", + "endpoint": f"http://{display_host}:{self._port}", + "docs_url": f"http://{display_host}:{self._port}/docs", + "ui_url": f"http://{display_host}:{self._port}/ui", "pid": pid } else: + display_host = "localhost" if self._host == "0.0.0.0" else self._host return { "status": "stopped", "port": self._port, @@ -176,10 +180,9 @@ class DoclingManager: return False, "Docling serve is already running" self._port = port - # Use provided host or the bridge IP we detected in __init__ + # Use provided host or keep default from __init__ if host is not None: self._host = host - # else: keep self._host as already set in __init__ # Check if port is already in use before trying to start import socket @@ -293,7 +296,8 @@ class DoclingManager: self._running = False return False, f"Docling serve process exited immediately (code: {return_code})" - return True, f"Docling serve starting on http://{host}:{port}" + display_host = "localhost" if self._host == "0.0.0.0" else self._host + return True, f"Docling serve starting on http://{display_host}:{port}" except FileNotFoundError: return False, "docling-serve not available. 
Please install: uv add docling-serve" @@ -454,7 +458,8 @@ class DoclingManager: async def follow_logs(self) -> AsyncIterator[str]: """Follow logs from the docling-serve process in real-time.""" # First yield status message and any existing logs - status_msg = f"Docling serve is running on http://{self._host}:{self._port}" + display_host = "localhost" if self._host == "0.0.0.0" else self._host + status_msg = f"Docling serve is running on http://{display_host}:{self._port}" with self._log_lock: if self._log_buffer: From 64512c79bedb12ac0612b8ca0e357567e23c5f58 Mon Sep 17 00:00:00 2001 From: Alex Leventer <3254549+alexleventer@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:21:04 -0700 Subject: [PATCH 112/113] Do not check in .DS_Store --- .DS_Store | Bin 8196 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index ca39b6e347a514559b8ebc1e3e1057a1eda59931..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMU2GIp6h7Zl=uF4L6k4`GSXYVxb%C}trTn`6LFKndTUyG`vfCNx$aJRc%_M#0P(ZPewg=?ku6DyeLtObCbDe z?z!i?bI+OY%gC!VGG(sjh z3RVY=@3{^LFv(D+Lpd&G0Od7h^?(qHu*86b^L$d6vrLC_TuKQiNH`&UGQtW4!P6O+ z49p2>DWg7$Koo)NBOq#bl?-xehzi%%@7`$6c5^&?E;njhp5NH`1!B`Grq8HUDiuxH z*E8&vdi<jJz~`Bx^}L7pUv&E%u+_L?Q|VKYdK~<09b~}CVTrW$8bv>Zox6U z0NW!c6jf0(di~f~%gW|dvZb}@L@GJ9y17x@x12bkDs>Gj*6z$4wnrTIlynB+2Z5?5 zIrsR~p4ncpQ+gKrt?b#QvS(_Qc3Zr+s!yvLu!dZx$2x9i^jRJ|w~2>4@@A2p>K}6L z!ieK#^qQfpJyfu>zS(Zu!kBflg-Q3QZ@G@Q%glQ$2YmCWFQx}QH}7`_=!~uxJaebX zV5?|$@eaNlo z6=l#H&gRYT64R=T*rX`It75%?Y&0z2El(!4DavlKSi52HIyJNOnceXRbG(4$tESyoj@S8|Uy2KEwrlf=}@czQ<+!fM4-PY<6r;tS&Y;7LP5BEt5XP z%GuFB(WlsQ9#4_?oE>4_#|l$sN8EAblcrDObW#IsOCqJ|P~@OZTcha}oLF+WUb|P*N#wK~cO<$J zx=541LfE(HUHXU#dx;5qnSP`zs6aL5U?J1<9xTIhG@%)5u^tlkr^&}o;x}L&QcpA^(G@iv7yo^`yDqh3uIFI)x(Q@-dTH^hw zX<6*aJGSc_;#ooH>2#26KC(*Uhvt9xplWSel}fH``@09#H6Bg!bz*>S>S&r=tzOjs z8*lvk|4n^nv{)2@C<3=u1W?h{)78OyD*v%Cd3&PPPV#ktFVTeExRe2eyPzCr8I Date: Wed, 8 Oct 2025 17:59:30 -0300 Subject: [PATCH 113/113] Added validation for api keys --- src/services/models_service.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/services/models_service.py b/src/services/models_service.py index 8c779940..01707dd2 100644 --- a/src/services/models_service.py +++ b/src/services/models_service.py @@ -242,6 +242,35 @@ class ModelsService: headers["Authorization"] = f"Bearer {api_key}" if project_id: headers["Project-ID"] = project_id + + # Validate credentials with a minimal completion request + async with httpx.AsyncClient() as client: + validation_url = f"{watson_endpoint}/ml/v1/text/generation" + validation_params = {"version": "2024-09-16"} + validation_payload = { + "input": "test", + "model_id": "ibm/granite-3-2b-instruct", + "project_id": project_id, + "parameters": { + "max_new_tokens": 1, + }, + } + + validation_response = await client.post( + validation_url, + headers=headers, + params=validation_params, + json=validation_payload, + timeout=10.0, + ) + + if validation_response.status_code != 200: + raise Exception( + f"Invalid credentials or endpoint: {validation_response.status_code} - {validation_response.text}" + ) + + logger.info("IBM Watson credentials validated successfully") + # Fetch foundation models using the correct endpoint models_url = f"{watson_endpoint}/ml/v1/foundation_model_specs"
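
The validation request above is deliberately tiny: a one-token generation against `ibm/granite-3-2b-instruct` exercises the full auth path before any models are listed. A standalone sketch of the same probe, reusing only the endpoint path, headers, payload, and `version` parameter visible in the diff (the helper function itself is hypothetical):

```python
# validate_watson.py -- hypothetical standalone version of the credential
# probe added in this patch; the endpoint path, version string, and
# model_id are copied from the diff above, not from IBM's documentation.
import asyncio

import httpx


async def watson_credentials_valid(endpoint: str, api_key: str, project_id: str) -> bool:
    """Issue a one-token generation request and report whether it succeeded."""
    headers = {"Authorization": f"Bearer {api_key}", "Project-ID": project_id}
    payload = {
        "input": "test",
        "model_id": "ibm/granite-3-2b-instruct",
        "project_id": project_id,
        # One token keeps the probe cheap while still exercising auth end to end.
        "parameters": {"max_new_tokens": 1},
    }
    async with httpx.AsyncClient() as client:
        response = await client.post(
            f"{endpoint}/ml/v1/text/generation",
            headers=headers,
            params={"version": "2024-09-16"},
            json=payload,
            timeout=10.0,
        )
    return response.status_code == 200


if __name__ == "__main__":
    # Placeholder values; substitute real credentials and a regional endpoint.
    ok = asyncio.run(
        watson_credentials_valid(
            "https://us-south.ml.cloud.ibm.com", "YOUR_API_KEY", "YOUR_PROJECT_ID"
        )
    )
    print("credentials valid:", ok)
```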