From 580c6f8e3282d66fc1949945bb8e8fda1af1a965 Mon Sep 17 00:00:00 2001 From: Lucas Oliveira Date: Sat, 20 Sep 2025 12:19:54 -0300 Subject: [PATCH] ingest documents only if user wants to --- src/api/settings.py | 54 +++++++++++++++++++++--------------- src/config/config_manager.py | 3 +- src/main.py | 3 +- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/src/api/settings.py b/src/api/settings.py index 3a577d92..47a83a09 100644 --- a/src/api/settings.py +++ b/src/api/settings.py @@ -198,8 +198,6 @@ async def update_settings(request, session_manager): allowed_fields = { "llm_model", "system_prompt", - "ocr", - "picture_descriptions", "chunk_size", "chunk_overlap", "doclingPresets", @@ -241,23 +239,6 @@ async def update_settings(request, session_manager): current_config.knowledge.doclingPresets = body["doclingPresets"] config_updated = True - if "ocr" in body: - if not isinstance(body["ocr"], bool): - return JSONResponse( - {"error": "ocr must be a boolean value"}, status_code=400 - ) - current_config.knowledge.ocr = body["ocr"] - config_updated = True - - if "picture_descriptions" in body: - if not isinstance(body["picture_descriptions"], bool): - return JSONResponse( - {"error": "picture_descriptions must be a boolean value"}, - status_code=400, - ) - current_config.knowledge.picture_descriptions = body["picture_descriptions"] - config_updated = True - if "chunk_size" in body: if not isinstance(body["chunk_size"], int) or body["chunk_size"] <= 0: return JSONResponse( @@ -402,13 +383,14 @@ async def onboarding(request, flows_service): current_config.provider.project_id = body["project_id"].strip() config_updated = True - # Handle sample_data (unused for now but validate) + # Handle sample_data + should_ingest_sample_data = False if "sample_data" in body: if not isinstance(body["sample_data"], bool): return JSONResponse( {"error": "sample_data must be a boolean value"}, status_code=400 ) - # Note: sample_data is accepted but not used as requested + should_ingest_sample_data = body["sample_data"] if not config_updated: return JSONResponse( @@ -445,16 +427,44 @@ async def onboarding(request, flows_service): except Exception as e: logger.error( - f"Error assigning model provider to flows", + "Error assigning model provider to flows", provider=provider, error=str(e), ) # Continue even if flow assignment fails - configuration was still saved + # Handle sample data ingestion if requested + if should_ingest_sample_data: + try: + # Import the function here to avoid circular imports + from main import ingest_default_documents_when_ready + + # Get services from the current app state + # We need to access the app instance to get services + app = request.scope.get("app") + if app and hasattr(app.state, "services"): + services = app.state.services + logger.info( + "Starting sample data ingestion as requested in onboarding" + ) + await ingest_default_documents_when_ready(services) + logger.info("Sample data ingestion completed successfully") + else: + logger.error( + "Could not access services for sample data ingestion" + ) + + except Exception as e: + logger.error( + "Failed to complete sample data ingestion", error=str(e) + ) + # Don't fail the entire onboarding process if sample data fails + return JSONResponse( { "message": "Onboarding configuration updated successfully", "edited": True, # Confirm that config is now marked as edited + "sample_data_ingested": should_ingest_sample_data, } ) else: diff --git a/src/config/config_manager.py b/src/config/config_manager.py index d2de63d3..055d48a7 100644 --- a/src/config/config_manager.py +++ b/src/config/config_manager.py @@ -25,8 +25,7 @@ class KnowledgeConfig: embedding_model: str = "text-embedding-3-small" chunk_size: int = 1000 chunk_overlap: int = 200 - ocr: bool = True - picture_descriptions: bool = False + doclingPresets: str = "standard" @dataclass diff --git a/src/main.py b/src/main.py index 4ac08889..e7cca718 100644 --- a/src/main.py +++ b/src/main.py @@ -392,7 +392,8 @@ async def startup_tasks(services): """Startup tasks""" logger.info("Starting startup tasks") await init_index() - await ingest_default_documents_when_ready(services) + # Sample data ingestion is now handled by the onboarding endpoint when sample_data=True + logger.info("Sample data ingestion moved to onboarding endpoint") async def initialize_services():