Ingest sample documents only if the user requests it during onboarding

This commit is contained in:
Lucas Oliveira 2025-09-20 12:19:54 -03:00
parent abb1ae0819
commit 580c6f8e32
3 changed files with 35 additions and 25 deletions

View file

@ -198,8 +198,6 @@ async def update_settings(request, session_manager):
allowed_fields = {
"llm_model",
"system_prompt",
"ocr",
"picture_descriptions",
"chunk_size",
"chunk_overlap",
"doclingPresets",
@ -241,23 +239,6 @@ async def update_settings(request, session_manager):
current_config.knowledge.doclingPresets = body["doclingPresets"]
config_updated = True
if "ocr" in body:
if not isinstance(body["ocr"], bool):
return JSONResponse(
{"error": "ocr must be a boolean value"}, status_code=400
)
current_config.knowledge.ocr = body["ocr"]
config_updated = True
if "picture_descriptions" in body:
if not isinstance(body["picture_descriptions"], bool):
return JSONResponse(
{"error": "picture_descriptions must be a boolean value"},
status_code=400,
)
current_config.knowledge.picture_descriptions = body["picture_descriptions"]
config_updated = True
if "chunk_size" in body:
if not isinstance(body["chunk_size"], int) or body["chunk_size"] <= 0:
return JSONResponse(
@ -402,13 +383,14 @@ async def onboarding(request, flows_service):
current_config.provider.project_id = body["project_id"].strip()
config_updated = True
# Handle sample_data (unused for now but validate)
# Handle sample_data
should_ingest_sample_data = False
if "sample_data" in body:
if not isinstance(body["sample_data"], bool):
return JSONResponse(
{"error": "sample_data must be a boolean value"}, status_code=400
)
# Note: sample_data is accepted but not used as requested
should_ingest_sample_data = body["sample_data"]
if not config_updated:
return JSONResponse(
@ -445,16 +427,44 @@ async def onboarding(request, flows_service):
except Exception as e:
logger.error(
f"Error assigning model provider to flows",
"Error assigning model provider to flows",
provider=provider,
error=str(e),
)
# Continue even if flow assignment fails - configuration was still saved
# Handle sample data ingestion if requested
if should_ingest_sample_data:
try:
# Import the function here to avoid circular imports
from main import ingest_default_documents_when_ready
# Get services from the current app state
# We need to access the app instance to get services
app = request.scope.get("app")
if app and hasattr(app.state, "services"):
services = app.state.services
logger.info(
"Starting sample data ingestion as requested in onboarding"
)
await ingest_default_documents_when_ready(services)
logger.info("Sample data ingestion completed successfully")
else:
logger.error(
"Could not access services for sample data ingestion"
)
except Exception as e:
logger.error(
"Failed to complete sample data ingestion", error=str(e)
)
# Don't fail the entire onboarding process if sample data fails
return JSONResponse(
{
"message": "Onboarding configuration updated successfully",
"edited": True, # Confirm that config is now marked as edited
"sample_data_ingested": should_ingest_sample_data,
}
)
else:

View file

@ -25,8 +25,7 @@ class KnowledgeConfig:
embedding_model: str = "text-embedding-3-small"
chunk_size: int = 1000
chunk_overlap: int = 200
ocr: bool = True
picture_descriptions: bool = False
doclingPresets: str = "standard"
@dataclass

View file

@ -392,7 +392,8 @@ async def startup_tasks(services):
"""Startup tasks"""
logger.info("Starting startup tasks")
await init_index()
await ingest_default_documents_when_ready(services)
# Sample data ingestion is now handled by the onboarding endpoint when sample_data=True
logger.info("Sample data ingestion moved to onboarding endpoint")
async def initialize_services():