Ingest sample documents only if the user requests it during onboarding

2025-09-20 12:19:54 -03:00 · 2025-09-20 12:19:54 -03:00 · 580c6f8e32
commit 580c6f8e32
parent abb1ae0819
3 changed files with 35 additions and 25 deletions
--- a/src/api/settings.py
+++ b/src/api/settings.py
@@ -198,8 +198,6 @@ async def update_settings(request, session_manager):
        allowed_fields = {
            "llm_model",
            "system_prompt",
            "ocr",
            "picture_descriptions",
            "chunk_size",
            "chunk_overlap",
            "doclingPresets",
@@ -241,23 +239,6 @@ async def update_settings(request, session_manager):
            current_config.knowledge.doclingPresets = body["doclingPresets"]
            config_updated = True
        if "ocr" in body:
            if not isinstance(body["ocr"], bool):
                return JSONResponse(
                    {"error": "ocr must be a boolean value"}, status_code=400
                )
            current_config.knowledge.ocr = body["ocr"]
            config_updated = True
        if "picture_descriptions" in body:
            if not isinstance(body["picture_descriptions"], bool):
                return JSONResponse(
                    {"error": "picture_descriptions must be a boolean value"},
                    status_code=400,
                )
            current_config.knowledge.picture_descriptions = body["picture_descriptions"]
            config_updated = True
        if "chunk_size" in body:
            if not isinstance(body["chunk_size"], int) or body["chunk_size"] <= 0:
                return JSONResponse(
@@ -402,13 +383,14 @@ async def onboarding(request, flows_service):
            current_config.provider.project_id = body["project_id"].strip()
            config_updated = True
-        # Handle sample_data (unused for now but validate)
+        # Handle sample_data
        should_ingest_sample_data = False
        if "sample_data" in body:
            if not isinstance(body["sample_data"], bool):
                return JSONResponse(
                    {"error": "sample_data must be a boolean value"}, status_code=400
                )
-            # Note: sample_data is accepted but not used as requested
+            should_ingest_sample_data = body["sample_data"]
        if not config_updated:
            return JSONResponse(
@@ -445,16 +427,44 @@ async def onboarding(request, flows_service):
                except Exception as e:
                    logger.error(
-                        f"Error assigning model provider to flows",
+                        "Error assigning model provider to flows",
                        provider=provider,
                        error=str(e),
                    )
                    # Continue even if flow assignment fails - configuration was still saved
            # Handle sample data ingestion if requested
            if should_ingest_sample_data:
                try:
                    # Import the function here to avoid circular imports
                    from main import ingest_default_documents_when_ready
                    # Get services from the current app state
                    # We need to access the app instance to get services
                    app = request.scope.get("app")
                    if app and hasattr(app.state, "services"):
                        services = app.state.services
                        logger.info(
                            "Starting sample data ingestion as requested in onboarding"
                        )
                        await ingest_default_documents_when_ready(services)
                        logger.info("Sample data ingestion completed successfully")
                    else:
                        logger.error(
                            "Could not access services for sample data ingestion"
                        )
                except Exception as e:
                    logger.error(
                        "Failed to complete sample data ingestion", error=str(e)
                    )
                    # Don't fail the entire onboarding process if sample data fails
            return JSONResponse(
                {
                    "message": "Onboarding configuration updated successfully",
                    "edited": True,  # Confirm that config is now marked as edited
                    "sample_data_ingested": should_ingest_sample_data,
                }
            )
        else:
--- a/src/config/config_manager.py
+++ b/src/config/config_manager.py
@@ -25,8 +25,7 @@ class KnowledgeConfig:
    embedding_model: str = "text-embedding-3-small"
    chunk_size: int = 1000
    chunk_overlap: int = 200
-    ocr: bool = True
+    doclingPresets: str = "standard"
    picture_descriptions: bool = False
@dataclass
--- a/src/main.py
+++ b/src/main.py
@@ -392,7 +392,8 @@ async def startup_tasks(services):
    """Startup tasks"""
    logger.info("Starting startup tasks")
    await init_index()
-    await ingest_default_documents_when_ready(services)
+    # Sample data ingestion is now handled by the onboarding endpoint when sample_data=True
    logger.info("Sample data ingestion moved to onboarding endpoint")
 async def initialize_services():