Ingest sample documents only if the user requests it
This commit is contained in:
parent
abb1ae0819
commit
580c6f8e32
3 changed files with 35 additions and 25 deletions
|
|
@ -198,8 +198,6 @@ async def update_settings(request, session_manager):
|
||||||
allowed_fields = {
|
allowed_fields = {
|
||||||
"llm_model",
|
"llm_model",
|
||||||
"system_prompt",
|
"system_prompt",
|
||||||
"ocr",
|
|
||||||
"picture_descriptions",
|
|
||||||
"chunk_size",
|
"chunk_size",
|
||||||
"chunk_overlap",
|
"chunk_overlap",
|
||||||
"doclingPresets",
|
"doclingPresets",
|
||||||
|
|
@ -241,23 +239,6 @@ async def update_settings(request, session_manager):
|
||||||
current_config.knowledge.doclingPresets = body["doclingPresets"]
|
current_config.knowledge.doclingPresets = body["doclingPresets"]
|
||||||
config_updated = True
|
config_updated = True
|
||||||
|
|
||||||
if "ocr" in body:
|
|
||||||
if not isinstance(body["ocr"], bool):
|
|
||||||
return JSONResponse(
|
|
||||||
{"error": "ocr must be a boolean value"}, status_code=400
|
|
||||||
)
|
|
||||||
current_config.knowledge.ocr = body["ocr"]
|
|
||||||
config_updated = True
|
|
||||||
|
|
||||||
if "picture_descriptions" in body:
|
|
||||||
if not isinstance(body["picture_descriptions"], bool):
|
|
||||||
return JSONResponse(
|
|
||||||
{"error": "picture_descriptions must be a boolean value"},
|
|
||||||
status_code=400,
|
|
||||||
)
|
|
||||||
current_config.knowledge.picture_descriptions = body["picture_descriptions"]
|
|
||||||
config_updated = True
|
|
||||||
|
|
||||||
if "chunk_size" in body:
|
if "chunk_size" in body:
|
||||||
if not isinstance(body["chunk_size"], int) or body["chunk_size"] <= 0:
|
if not isinstance(body["chunk_size"], int) or body["chunk_size"] <= 0:
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
|
|
@ -402,13 +383,14 @@ async def onboarding(request, flows_service):
|
||||||
current_config.provider.project_id = body["project_id"].strip()
|
current_config.provider.project_id = body["project_id"].strip()
|
||||||
config_updated = True
|
config_updated = True
|
||||||
|
|
||||||
# Handle sample_data (unused for now but validate)
|
# Handle sample_data
|
||||||
|
should_ingest_sample_data = False
|
||||||
if "sample_data" in body:
|
if "sample_data" in body:
|
||||||
if not isinstance(body["sample_data"], bool):
|
if not isinstance(body["sample_data"], bool):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
{"error": "sample_data must be a boolean value"}, status_code=400
|
{"error": "sample_data must be a boolean value"}, status_code=400
|
||||||
)
|
)
|
||||||
# Note: sample_data is accepted but not used as requested
|
should_ingest_sample_data = body["sample_data"]
|
||||||
|
|
||||||
if not config_updated:
|
if not config_updated:
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
|
|
@ -445,16 +427,44 @@ async def onboarding(request, flows_service):
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Error assigning model provider to flows",
|
"Error assigning model provider to flows",
|
||||||
provider=provider,
|
provider=provider,
|
||||||
error=str(e),
|
error=str(e),
|
||||||
)
|
)
|
||||||
# Continue even if flow assignment fails - configuration was still saved
|
# Continue even if flow assignment fails - configuration was still saved
|
||||||
|
|
||||||
|
# Handle sample data ingestion if requested
|
||||||
|
if should_ingest_sample_data:
|
||||||
|
try:
|
||||||
|
# Import the function here to avoid circular imports
|
||||||
|
from main import ingest_default_documents_when_ready
|
||||||
|
|
||||||
|
# Get services from the current app state
|
||||||
|
# We need to access the app instance to get services
|
||||||
|
app = request.scope.get("app")
|
||||||
|
if app and hasattr(app.state, "services"):
|
||||||
|
services = app.state.services
|
||||||
|
logger.info(
|
||||||
|
"Starting sample data ingestion as requested in onboarding"
|
||||||
|
)
|
||||||
|
await ingest_default_documents_when_ready(services)
|
||||||
|
logger.info("Sample data ingestion completed successfully")
|
||||||
|
else:
|
||||||
|
logger.error(
|
||||||
|
"Could not access services for sample data ingestion"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
"Failed to complete sample data ingestion", error=str(e)
|
||||||
|
)
|
||||||
|
# Don't fail the entire onboarding process if sample data fails
|
||||||
|
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
{
|
{
|
||||||
"message": "Onboarding configuration updated successfully",
|
"message": "Onboarding configuration updated successfully",
|
||||||
"edited": True, # Confirm that config is now marked as edited
|
"edited": True, # Confirm that config is now marked as edited
|
||||||
|
"sample_data_ingested": should_ingest_sample_data,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -25,8 +25,7 @@ class KnowledgeConfig:
|
||||||
embedding_model: str = "text-embedding-3-small"
|
embedding_model: str = "text-embedding-3-small"
|
||||||
chunk_size: int = 1000
|
chunk_size: int = 1000
|
||||||
chunk_overlap: int = 200
|
chunk_overlap: int = 200
|
||||||
ocr: bool = True
|
doclingPresets: str = "standard"
|
||||||
picture_descriptions: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
|
||||||
|
|
@ -392,7 +392,8 @@ async def startup_tasks(services):
|
||||||
"""Startup tasks"""
|
"""Startup tasks"""
|
||||||
logger.info("Starting startup tasks")
|
logger.info("Starting startup tasks")
|
||||||
await init_index()
|
await init_index()
|
||||||
await ingest_default_documents_when_ready(services)
|
# Sample data ingestion is now handled by the onboarding endpoint when sample_data=True
|
||||||
|
logger.info("Sample data ingestion moved to onboarding endpoint")
|
||||||
|
|
||||||
|
|
||||||
async def initialize_services():
|
async def initialize_services():
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue