openrag/src/api/settings.py

import json
import platform
import time
from starlette.responses import JSONResponse
from utils.container_utils import transform_localhost_url
from utils.logging_config import get_logger
from utils.telemetry import TelemetryClient, Category, MessageId
from config.settings import (
    DISABLE_INGEST_WITH_LANGFLOW,
    LANGFLOW_URL,
    LANGFLOW_CHAT_FLOW_ID,
    LANGFLOW_INGEST_FLOW_ID,
    LANGFLOW_PUBLIC_URL,
    LOCALHOST_URL,
    clients,
    get_openrag_config,
    config_manager,
    is_no_auth_mode,
)
from api.provider_validation import validate_provider_setup

logger = get_logger(__name__)


# Docling preset configurations
def get_docling_preset_configs(
    table_structure=False, ocr=False, picture_descriptions=False
):
    """Get docling preset configurations based on toggle settings

    Args:
        table_structure: Enable table structure parsing (default: False)
        ocr: Enable OCR for text extraction from images (default: False)
        picture_descriptions: Enable picture descriptions/captions (default: False)
    """
    is_macos = platform.system() == "Darwin"

    config = {
        "do_ocr": ocr,
        "ocr_engine": "ocrmac" if is_macos else "easyocr",
        "do_table_structure": table_structure,
        "do_picture_classification": picture_descriptions,
        "do_picture_description": picture_descriptions,
        "picture_description_local": {
            "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",
            "prompt": "Describe this image in a few sentences.",
        },
    }

    return config


async def get_settings(request, session_manager):
    """Get application settings"""
    try:
        openrag_config = get_openrag_config()

        knowledge_config = openrag_config.knowledge
        agent_config = openrag_config.agent

        # Return public settings that are safe to expose to frontend
        settings = {
            "langflow_url": LANGFLOW_URL,
            "flow_id": LANGFLOW_CHAT_FLOW_ID,
            "ingest_flow_id": LANGFLOW_INGEST_FLOW_ID,
            "langflow_public_url": LANGFLOW_PUBLIC_URL,
            "edited": openrag_config.edited,
            # OpenRAG configuration
            "providers": {
                "openai": {
                    "has_api_key": bool(openrag_config.providers.openai.api_key),
                    "configured": openrag_config.providers.openai.configured,
                    # Note: API key is not exposed for security
                },
                "anthropic": {
                    "has_api_key": bool(openrag_config.providers.anthropic.api_key),
                    "configured": openrag_config.providers.anthropic.configured,
                },
                "watsonx": {
                    "has_api_key": bool(openrag_config.providers.watsonx.api_key),
                    "endpoint": openrag_config.providers.watsonx.endpoint or None,
                    "project_id": openrag_config.providers.watsonx.project_id or None,
                    "configured": openrag_config.providers.watsonx.configured,
                },
                "ollama": {
                    "endpoint": openrag_config.providers.ollama.endpoint or None,
                    "configured": openrag_config.providers.ollama.configured,
                },
            },
            "knowledge": {
                "embedding_model": knowledge_config.embedding_model,
                "embedding_provider": knowledge_config.embedding_provider,
                "chunk_size": knowledge_config.chunk_size,
                "chunk_overlap": knowledge_config.chunk_overlap,
                "table_structure": knowledge_config.table_structure,
                "ocr": knowledge_config.ocr,
                "picture_descriptions": knowledge_config.picture_descriptions,
            },
            "agent": {
                "llm_model": agent_config.llm_model,
                "llm_provider": agent_config.llm_provider,
                "system_prompt": agent_config.system_prompt,
            },
            "localhost_url": LOCALHOST_URL,
        }

        # Only expose edit URLs when a public URL is configured
        if LANGFLOW_PUBLIC_URL and LANGFLOW_CHAT_FLOW_ID:
            settings["langflow_edit_url"] = (
                f"{LANGFLOW_PUBLIC_URL.rstrip('/')}/flow/{LANGFLOW_CHAT_FLOW_ID}"
            )

        if LANGFLOW_PUBLIC_URL and LANGFLOW_INGEST_FLOW_ID:
            settings["langflow_ingest_edit_url"] = (
                f"{LANGFLOW_PUBLIC_URL.rstrip('/')}/flow/{LANGFLOW_INGEST_FLOW_ID}"
            )

        # Fetch ingestion flow configuration to get actual component defaults
        if LANGFLOW_INGEST_FLOW_ID and openrag_config.edited:
            try:
                response = await clients.langflow_request(
                    "GET", f"/api/v1/flows/{LANGFLOW_INGEST_FLOW_ID}"
                )
                if response.status_code == 200:
                    flow_data = response.json()

                    # Extract component defaults (ingestion-specific settings only)
                    # Start with configured defaults
                    ingestion_defaults = {
                        "chunkSize": knowledge_config.chunk_size,
                        "chunkOverlap": knowledge_config.chunk_overlap,
                        "separator": "\\n",  # Keep hardcoded for now as it's not in config
                        "embeddingModel": knowledge_config.embedding_model,
                    }

                    if flow_data.get("data", {}).get("nodes"):
                        for node in flow_data["data"]["nodes"]:
                            node_template = (
                                node.get("data", {}).get("node", {}).get("template", {})
                            )

                            # Split Text component (SplitText-QIKhg)
                            if node.get("id") == "SplitText-QIKhg":
                                if node_template.get("chunk_size", {}).get("value"):
                                    ingestion_defaults["chunkSize"] = node_template[
                                        "chunk_size"
                                    ]["value"]
                                if node_template.get("chunk_overlap", {}).get("value"):
                                    ingestion_defaults["chunkOverlap"] = node_template[
                                        "chunk_overlap"
                                    ]["value"]
                                if node_template.get("separator", {}).get("value"):
                                    ingestion_defaults["separator"] = node_template[
                                        "separator"
                                    ]["value"]

                            # OpenAI Embeddings component (OpenAIEmbeddings-joRJ6)
                            elif node.get("id") == "OpenAIEmbeddings-joRJ6":
                                if node_template.get("model", {}).get("value"):
                                    ingestion_defaults["embeddingModel"] = (
                                        node_template["model"]["value"]
                                    )

                            # Note: OpenSearch component settings are not exposed for ingestion
                            # (search-related parameters like number_of_results, score_threshold
                            # are for retrieval, not ingestion)

                    settings["ingestion_defaults"] = ingestion_defaults

            except Exception as e:
                logger.warning(f"Failed to fetch ingestion flow defaults: {e}")
                # Continue without ingestion defaults

        return JSONResponse(settings)

    except Exception as e:
        return JSONResponse(
            {"error": f"Failed to retrieve settings: {str(e)}"}, status_code=500
        )


async def update_settings(request, session_manager):
    """Update application settings"""
    try:
        # Get current configuration
        current_config = get_openrag_config()

        # Check if config is marked as edited
        if not current_config.edited:
            return JSONResponse(
                {
                    "error": "Configuration must be marked as edited before updates are allowed"
                },
                status_code=403,
            )

        # Parse request body
        body = await request.json()

        # Validate allowed fields
        allowed_fields = {
            "llm_model",
            "llm_provider",
            "system_prompt",
            "chunk_size",
            "chunk_overlap",
            "table_structure",
            "ocr",
            "picture_descriptions",
            "embedding_model",
            "embedding_provider",
            # Provider-specific fields (structured as provider_name.field_name)
            "openai_api_key",
            "anthropic_api_key",
            "watsonx_api_key",
            "watsonx_endpoint",
            "watsonx_project_id",
            "ollama_endpoint",
        }

        # Check for invalid fields
        invalid_fields = set(body.keys()) - allowed_fields
        if invalid_fields:
            return JSONResponse(
                {
                    "error": f"Invalid fields: {', '.join(invalid_fields)}. Allowed fields: {', '.join(allowed_fields)}"
                },
                status_code=400,
            )

        # Validate types early before modifying config
        if "embedding_model" in body:
            if (
                not isinstance(body["embedding_model"], str)
                or not body["embedding_model"].strip()
            ):
                return JSONResponse(
                    {"error": "embedding_model must be a non-empty string"},
                    status_code=400,
                )

        if "table_structure" in body:
            if not isinstance(body["table_structure"], bool):
                return JSONResponse(
                    {"error": "table_structure must be a boolean"}, status_code=400
                )

        if "ocr" in body:
            if not isinstance(body["ocr"], bool):
                return JSONResponse({"error": "ocr must be a boolean"}, status_code=400)

        if "picture_descriptions" in body:
            if not isinstance(body["picture_descriptions"], bool):
                return JSONResponse(
                    {"error": "picture_descriptions must be a boolean"}, status_code=400
                )

        if "chunk_size" in body:
            if not isinstance(body["chunk_size"], int) or body["chunk_size"] <= 0:
                return JSONResponse(
                    {"error": "chunk_size must be a positive integer"}, status_code=400
                )

        if "chunk_overlap" in body:
            if not isinstance(body["chunk_overlap"], int) or body["chunk_overlap"] < 0:
                return JSONResponse(
                    {"error": "chunk_overlap must be a non-negative integer"},
                    status_code=400,
                )

        if "llm_provider" in body:
            if (
                not isinstance(body["llm_provider"], str)
                or not body["llm_provider"].strip()
            ):
                return JSONResponse(
                    {"error": "llm_provider must be a non-empty string"},
                    status_code=400,
                )
            if body["llm_provider"] not in ["openai", "anthropic", "watsonx", "ollama"]:
                return JSONResponse(
                    {"error": "llm_provider must be one of: openai, anthropic, watsonx, ollama"},
                    status_code=400,
                )

        if "embedding_provider" in body:
            if (
                not isinstance(body["embedding_provider"], str)
                or not body["embedding_provider"].strip()
            ):
                return JSONResponse(
                    {"error": "embedding_provider must be a non-empty string"},
                    status_code=400,
                )
            # Anthropic doesn't have embeddings
            if body["embedding_provider"] not in ["openai", "watsonx", "ollama"]:
                return JSONResponse(
                    {"error": "embedding_provider must be one of: openai, watsonx, ollama"},
                    status_code=400,
                )

        # Validate provider-specific fields
        for key in ["openai_api_key", "anthropic_api_key", "watsonx_api_key"]:
            if key in body and not isinstance(body[key], str):
                return JSONResponse(
                    {"error": f"{key} must be a string"}, status_code=400
                )

        for key in ["watsonx_endpoint", "ollama_endpoint"]:
            if key in body:
                if not isinstance(body[key], str) or not body[key].strip():
                    return JSONResponse(
                        {"error": f"{key} must be a non-empty string"}, status_code=400
                    )

        if "watsonx_project_id" in body:
            if (
                not isinstance(body["watsonx_project_id"], str)
                or not body["watsonx_project_id"].strip()
            ):
                return JSONResponse(
                    {"error": "watsonx_project_id must be a non-empty string"}, status_code=400
                )

        # Validate provider setup if provider-related fields are being updated
        # Do this BEFORE modifying any config
        provider_fields = [
            "llm_provider",
            "embedding_provider",
            "llm_model",
            "embedding_model",
            "openai_api_key",
            "anthropic_api_key",
            "watsonx_api_key",
            "watsonx_endpoint",
            "watsonx_project_id",
            "ollama_endpoint",
        ]
        should_validate = any(field in body for field in provider_fields)

        if should_validate:
            try:
                logger.info("Running provider validation before modifying config")

                # Validate LLM provider if being changed
                if "llm_provider" in body or "llm_model" in body:
                    llm_provider = body.get("llm_provider", current_config.agent.llm_provider)
                    llm_model = body.get("llm_model", current_config.agent.llm_model)

                    # Get the provider config (with any updates from the request)
                    llm_provider_config = current_config.providers.get_provider_config(llm_provider)

                    # Apply any updates from the request
                    api_key = getattr(llm_provider_config, "api_key", None)
                    endpoint = getattr(llm_provider_config, "endpoint", None)
                    project_id = getattr(llm_provider_config, "project_id", None)

                    if f"{llm_provider}_api_key" in body and body[f"{llm_provider}_api_key"].strip():
                        api_key = body[f"{llm_provider}_api_key"]
                    if f"{llm_provider}_endpoint" in body:
                        endpoint = body[f"{llm_provider}_endpoint"]
                    if f"{llm_provider}_project_id" in body:
                        project_id = body[f"{llm_provider}_project_id"]

                    await validate_provider_setup(
                        provider=llm_provider,
                        api_key=api_key,
                        llm_model=llm_model,
                        endpoint=endpoint,
                        project_id=project_id,
                    )
                    logger.info(f"LLM provider validation successful for {llm_provider}")

                # Validate embedding provider if being changed
                if "embedding_provider" in body or "embedding_model" in body:
                    embedding_provider = body.get("embedding_provider", current_config.knowledge.embedding_provider)
                    embedding_model = body.get("embedding_model", current_config.knowledge.embedding_model)

                    # Get the provider config (with any updates from the request)
                    embedding_provider_config = current_config.providers.get_provider_config(embedding_provider)

                    # Apply any updates from the request
                    api_key = getattr(embedding_provider_config, "api_key", None)
                    endpoint = getattr(embedding_provider_config, "endpoint", None)
                    project_id = getattr(embedding_provider_config, "project_id", None)

                    if f"{embedding_provider}_api_key" in body and body[f"{embedding_provider}_api_key"].strip():
                        api_key = body[f"{embedding_provider}_api_key"]
                    if f"{embedding_provider}_endpoint" in body:
                        endpoint = body[f"{embedding_provider}_endpoint"]
                    if f"{embedding_provider}_project_id" in body:
                        project_id = body[f"{embedding_provider}_project_id"]

                    await validate_provider_setup(
                        provider=embedding_provider,
                        api_key=api_key,
                        embedding_model=embedding_model,
                        endpoint=endpoint,
                        project_id=project_id,
                    )
                    logger.info(f"Embedding provider validation successful for {embedding_provider}")

            except Exception as e:
                logger.error(f"Provider validation failed: {str(e)}")
                return JSONResponse({"error": f"{str(e)}"}, status_code=400)

        # Update configuration
        # Only reached if validation passed or wasn't needed
        config_updated = False

        # Update agent settings
        if "llm_model" in body:
            old_model = current_config.agent.llm_model
            current_config.agent.llm_model = body["llm_model"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_LLM_MODEL
            )
            logger.info(f"LLM model changed from {old_model} to {body['llm_model']}")

        if "llm_provider" in body:
            old_provider = current_config.agent.llm_provider
            current_config.agent.llm_provider = body["llm_provider"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_LLM_PROVIDER
            )
            logger.info(f"LLM provider changed from {old_provider} to {body['llm_provider']}")

        if "system_prompt" in body:
            current_config.agent.system_prompt = body["system_prompt"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_SYSTEM_PROMPT
            )

            # Also update the chat flow with the new system prompt
            try:
                flows_service = _get_flows_service()
                await _update_langflow_system_prompt(current_config, flows_service)
            except Exception as e:
                logger.error(f"Failed to update chat flow system prompt: {str(e)}")
                # Don't fail the entire settings update if flow update fails
                # The config will still be saved

        # Update knowledge settings
        if "embedding_model" in body:
            old_model = current_config.knowledge.embedding_model
            new_embedding_model = body["embedding_model"].strip()
            current_config.knowledge.embedding_model = new_embedding_model
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_EMBED_MODEL
            )
            logger.info(f"Embedding model changed from {old_model} to {new_embedding_model}")

        if "embedding_provider" in body:
            old_provider = current_config.knowledge.embedding_provider
            current_config.knowledge.embedding_provider = body["embedding_provider"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_EMBED_PROVIDER
            )
            logger.info(f"Embedding provider changed from {old_provider} to {body['embedding_provider']}")

        if "table_structure" in body:
            current_config.knowledge.table_structure = body["table_structure"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_DOCLING_UPDATED
            )

            # Also update the flow with the new docling settings
            try:
                flows_service = _get_flows_service()
                await _update_langflow_docling_settings(current_config, flows_service)
            except Exception as e:
                logger.error(f"Failed to update docling settings in flow: {str(e)}")

        if "ocr" in body:
            current_config.knowledge.ocr = body["ocr"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_DOCLING_UPDATED
            )

            # Also update the flow with the new docling settings
            try:
                flows_service = _get_flows_service()
                await _update_langflow_docling_settings(current_config, flows_service)
            except Exception as e:
                logger.error(f"Failed to update docling settings in flow: {str(e)}")

        if "picture_descriptions" in body:
            current_config.knowledge.picture_descriptions = body["picture_descriptions"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_DOCLING_UPDATED
            )

            # Also update the flow with the new docling settings
            try:
                flows_service = _get_flows_service()
                await _update_langflow_docling_settings(current_config, flows_service)
            except Exception as e:
                logger.error(f"Failed to update docling settings in flow: {str(e)}")

        if "chunk_size" in body:
            current_config.knowledge.chunk_size = body["chunk_size"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_CHUNK_UPDATED
            )

            # Also update the ingest flow with the new chunk size
            try:
                flows_service = _get_flows_service()
                await flows_service.update_ingest_flow_chunk_size(body["chunk_size"])
                logger.info(
                    f"Successfully updated ingest flow chunk size to {body['chunk_size']}"
                )
            except Exception as e:
                logger.error(f"Failed to update ingest flow chunk size: {str(e)}")
                # Don't fail the entire settings update if flow update fails
                # The config will still be saved

        if "chunk_overlap" in body:
            current_config.knowledge.chunk_overlap = body["chunk_overlap"]
            config_updated = True
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_CHUNK_UPDATED
            )

            # Also update the ingest flow with the new chunk overlap
            try:
                flows_service = _get_flows_service()
                await flows_service.update_ingest_flow_chunk_overlap(
                    body["chunk_overlap"]
                )
                logger.info(
                    f"Successfully updated ingest flow chunk overlap to {body['chunk_overlap']}"
                )
            except Exception as e:
                logger.error(f"Failed to update ingest flow chunk overlap: {str(e)}")
                # Don't fail the entire settings update if flow update fails
                # The config will still be saved

        # Update provider-specific settings
        provider_updated = False
        if "openai_api_key" in body and body["openai_api_key"].strip():
            current_config.providers.openai.api_key = body["openai_api_key"].strip()
            current_config.providers.openai.configured = True
            config_updated = True
            provider_updated = True

        if "anthropic_api_key" in body and body["anthropic_api_key"].strip():
            current_config.providers.anthropic.api_key = body["anthropic_api_key"]
            current_config.providers.anthropic.configured = True
            config_updated = True
            provider_updated = True

        if "watsonx_api_key" in body and body["watsonx_api_key"].strip():
            current_config.providers.watsonx.api_key = body["watsonx_api_key"]
            current_config.providers.watsonx.configured = True
            config_updated = True
            provider_updated = True

        if "watsonx_endpoint" in body:
            current_config.providers.watsonx.endpoint = body["watsonx_endpoint"].strip()
            current_config.providers.watsonx.configured = True
            config_updated = True
            provider_updated = True

        if "watsonx_project_id" in body:
            current_config.providers.watsonx.project_id = body["watsonx_project_id"].strip()
            current_config.providers.watsonx.configured = True
            config_updated = True
            provider_updated = True

        if "ollama_endpoint" in body:
            current_config.providers.ollama.endpoint = body["ollama_endpoint"].strip()
            current_config.providers.ollama.configured = True
            config_updated = True
            provider_updated = True

        if provider_updated:
            await TelemetryClient.send_event(
                Category.SETTINGS_OPERATIONS,
                MessageId.ORB_SETTINGS_PROVIDER_CREDS
            )

        if not config_updated:
            return JSONResponse(
                {"error": "No valid fields provided for update"}, status_code=400
            )

        # Save the updated configuration
        if not config_manager.save_config_file(current_config):
            return JSONResponse(
                {"error": "Failed to save configuration"}, status_code=500
            )

        # Update Langflow global variables and model values if provider settings changed
        provider_fields_to_check = [
            "llm_provider", "embedding_provider",
            "openai_api_key", "anthropic_api_key",
            "watsonx_api_key", "watsonx_endpoint", "watsonx_project_id",
            "ollama_endpoint"
        ]

        await clients.refresh_patched_client()

        if any(key in body for key in provider_fields_to_check):
            try:
                flows_service = _get_flows_service()

                # Update global variables
                await _update_langflow_global_variables(current_config)

                # Update LLM client credentials when embedding selection changes
                if "embedding_provider" in body or "embedding_model" in body:
                    await _update_mcp_servers_with_provider_credentials(
                        current_config, session_manager
                    )

                # Update model values if provider or model changed
                if "llm_provider" in body or "llm_model" in body or "embedding_provider" in body or "embedding_model" in body:
                    await _update_langflow_model_values(current_config, flows_service)

            except Exception as e:
                logger.error(f"Failed to update Langflow settings: {str(e)}")
                # Don't fail the entire settings update if Langflow update fails
                # The config was still saved


        logger.info(
            "Configuration updated successfully", updated_fields=list(body.keys())
        )
        await TelemetryClient.send_event(
            Category.SETTINGS_OPERATIONS,
            MessageId.ORB_SETTINGS_UPDATED
        )
        return JSONResponse({"message": "Configuration updated successfully"})

    except Exception as e:
        logger.error("Failed to update settings", error=str(e))
        await TelemetryClient.send_event(
            Category.SETTINGS_OPERATIONS,
            MessageId.ORB_SETTINGS_UPDATE_FAILED
        )
        return JSONResponse(
            {"error": f"Failed to update settings: {str(e)}"}, status_code=500
        )


async def onboarding(request, flows_service, session_manager=None):
    """Handle onboarding configuration setup"""
    try:
        await TelemetryClient.send_event(Category.ONBOARDING, MessageId.ORB_ONBOARD_START)

        # Get current configuration
        current_config = get_openrag_config()

        # Warn if config was already edited (onboarding being re-run)
        if current_config.edited:
            logger.warning(
                "Onboarding is being run although configuration was already edited before"
            )

        # Parse request body
        body = await request.json()

        # Validate allowed fields
        allowed_fields = {
            "llm_provider",
            "llm_model",
            "embedding_provider",
            "embedding_model",
            "sample_data",
            # Provider-specific fields
            "openai_api_key",
            "anthropic_api_key",
            "watsonx_api_key",
            "watsonx_endpoint",
            "watsonx_project_id",
            "ollama_endpoint",
        }

        # Check for invalid fields
        invalid_fields = set(body.keys()) - allowed_fields
        if invalid_fields:
            return JSONResponse(
                {
                    "error": f"Invalid fields: {', '.join(invalid_fields)}. Allowed fields: {', '.join(allowed_fields)}"
                },
                status_code=400,
            )

        # Update configuration
        config_updated = False

        # Update agent settings (LLM)
        llm_model_selected = None
        llm_provider_selected = None

        if "llm_model" in body:
            if not isinstance(body["llm_model"], str) or not body["llm_model"].strip():
                return JSONResponse(
                    {"error": "llm_model must be a non-empty string"}, status_code=400
                )
            llm_model_selected = body["llm_model"].strip()
            current_config.agent.llm_model = llm_model_selected
            config_updated = True
            await TelemetryClient.send_event(
                Category.ONBOARDING,
                MessageId.ORB_ONBOARD_LLM_MODEL,
                metadata={"llm_model": llm_model_selected}
            )
            logger.info(f"LLM model selected during onboarding: {llm_model_selected}")

        if "llm_provider" in body:
            if (
                not isinstance(body["llm_provider"], str)
                or not body["llm_provider"].strip()
            ):
                return JSONResponse(
                    {"error": "llm_provider must be a non-empty string"},
                    status_code=400,
                )
            if body["llm_provider"] not in ["openai", "anthropic", "watsonx", "ollama"]:
                return JSONResponse(
                    {"error": "llm_provider must be one of: openai, anthropic, watsonx, ollama"},
                    status_code=400,
                )
            llm_provider_selected = body["llm_provider"].strip()
            current_config.agent.llm_provider = llm_provider_selected
            config_updated = True
            await TelemetryClient.send_event(
                Category.ONBOARDING,
                MessageId.ORB_ONBOARD_LLM_PROVIDER,
                metadata={"llm_provider": llm_provider_selected}
            )
            logger.info(f"LLM provider selected during onboarding: {llm_provider_selected}")

        # Update knowledge settings (embedding)
        embedding_model_selected = None
        embedding_provider_selected = None

        if "embedding_model" in body and not DISABLE_INGEST_WITH_LANGFLOW:
            if (
                not isinstance(body["embedding_model"], str)
                or not body["embedding_model"].strip()
            ):
                return JSONResponse(
                    {"error": "embedding_model must be a non-empty string"},
                    status_code=400,
                )
            embedding_model_selected = body["embedding_model"].strip()
            current_config.knowledge.embedding_model = embedding_model_selected
            config_updated = True
            await TelemetryClient.send_event(
                Category.ONBOARDING,
                MessageId.ORB_ONBOARD_EMBED_MODEL,
                metadata={"embedding_model": embedding_model_selected}
            )
            logger.info(f"Embedding model selected during onboarding: {embedding_model_selected}")

        if "embedding_provider" in body:
            if (
                not isinstance(body["embedding_provider"], str)
                or not body["embedding_provider"].strip()
            ):
                return JSONResponse(
                    {"error": "embedding_provider must be a non-empty string"},
                    status_code=400,
                )
            # Anthropic doesn't have embeddings
            if body["embedding_provider"] not in ["openai", "watsonx", "ollama"]:
                return JSONResponse(
                    {"error": "embedding_provider must be one of: openai, watsonx, ollama"},
                    status_code=400,
                )
            embedding_provider_selected = body["embedding_provider"].strip()
            current_config.knowledge.embedding_provider = embedding_provider_selected
            config_updated = True
            await TelemetryClient.send_event(
                Category.ONBOARDING,
                MessageId.ORB_ONBOARD_EMBED_PROVIDER,
                metadata={"embedding_provider": embedding_provider_selected}
            )
            logger.info(f"Embedding provider selected during onboarding: {embedding_provider_selected}")

        # Update provider-specific credentials
        if "openai_api_key" in body and body["openai_api_key"].strip():
            current_config.providers.openai.api_key = body["openai_api_key"].strip()
            current_config.providers.openai.configured = True
            config_updated = True

        if "anthropic_api_key" in body and body["anthropic_api_key"].strip():
            current_config.providers.anthropic.api_key = body["anthropic_api_key"]
            current_config.providers.anthropic.configured = True
            config_updated = True

        if "watsonx_api_key" in body and body["watsonx_api_key"].strip():
            current_config.providers.watsonx.api_key = body["watsonx_api_key"]
            current_config.providers.watsonx.configured = True
            config_updated = True

        if "watsonx_endpoint" in body:
            if not isinstance(body["watsonx_endpoint"], str) or not body["watsonx_endpoint"].strip():
                return JSONResponse(
                    {"error": "watsonx_endpoint must be a non-empty string"}, status_code=400
                )
            current_config.providers.watsonx.endpoint = body["watsonx_endpoint"].strip()
            current_config.providers.watsonx.configured = True
            config_updated = True

        if "watsonx_project_id" in body:
            if (
                not isinstance(body["watsonx_project_id"], str)
                or not body["watsonx_project_id"].strip()
            ):
                return JSONResponse(
                    {"error": "watsonx_project_id must be a non-empty string"}, status_code=400
                )
            current_config.providers.watsonx.project_id = body["watsonx_project_id"].strip()
            current_config.providers.watsonx.configured = True
            config_updated = True

        if "ollama_endpoint" in body:
            if not isinstance(body["ollama_endpoint"], str) or not body["ollama_endpoint"].strip():
                return JSONResponse(
                    {"error": "ollama_endpoint must be a non-empty string"}, status_code=400
                )
            current_config.providers.ollama.endpoint = body["ollama_endpoint"].strip()
            current_config.providers.ollama.configured = True
            config_updated = True

        # Mark providers as configured if they were chosen during onboarding
        # Check LLM provider
        if "llm_provider" in body:
            llm_provider = body["llm_provider"].strip().lower()
            if llm_provider == "openai" and current_config.providers.openai.api_key:
                current_config.providers.openai.configured = True
                logger.info("Marked OpenAI as configured (chosen as LLM provider)")
            elif llm_provider == "anthropic" and current_config.providers.anthropic.api_key:
                current_config.providers.anthropic.configured = True
                logger.info("Marked Anthropic as configured (chosen as LLM provider)")
            elif llm_provider == "watsonx" and current_config.providers.watsonx.api_key and current_config.providers.watsonx.endpoint and current_config.providers.watsonx.project_id:
                current_config.providers.watsonx.configured = True
                logger.info("Marked WatsonX as configured (chosen as LLM provider)")
            elif llm_provider == "ollama" and current_config.providers.ollama.endpoint:
                current_config.providers.ollama.configured = True
                logger.info("Marked Ollama as configured (chosen as LLM provider)")

        # Check embedding provider
        if "embedding_provider" in body:
            embedding_provider = body["embedding_provider"].strip().lower()
            if embedding_provider == "openai" and current_config.providers.openai.api_key:
                current_config.providers.openai.configured = True
                logger.info("Marked OpenAI as configured (chosen as embedding provider)")
            elif embedding_provider == "watsonx" and current_config.providers.watsonx.api_key and current_config.providers.watsonx.endpoint and current_config.providers.watsonx.project_id:
                current_config.providers.watsonx.configured = True
                logger.info("Marked WatsonX as configured (chosen as embedding provider)")
            elif embedding_provider == "ollama" and current_config.providers.ollama.endpoint:
                current_config.providers.ollama.configured = True
                logger.info("Marked Ollama as configured (chosen as embedding provider)")

        # Handle sample_data
        should_ingest_sample_data = False
        if "sample_data" in body:
            if not isinstance(body["sample_data"], bool):
                return JSONResponse(
                    {"error": "sample_data must be a boolean value"}, status_code=400
                )
            should_ingest_sample_data = body["sample_data"]
            if should_ingest_sample_data:
                await TelemetryClient.send_event(
                    Category.ONBOARDING,
                    MessageId.ORB_ONBOARD_SAMPLE_DATA
                )
                logger.info("Sample data ingestion requested during onboarding")

        if not config_updated:
            return JSONResponse(
                {"error": "No valid fields provided for update"}, status_code=400
            )

        # Validate provider setup before initializing OpenSearch index
        # Use full validation with completion tests (test_completion=True) to ensure provider health during onboarding
        try:
            from api.provider_validation import validate_provider_setup

            # Validate LLM provider if set
            if "llm_provider" in body or "llm_model" in body:
                llm_provider = current_config.agent.llm_provider.lower()
                llm_provider_config = current_config.get_llm_provider_config()

                logger.info(f"Validating LLM provider setup for {llm_provider} (full validation with completion test)")
                await validate_provider_setup(
                    provider=llm_provider,
                    api_key=getattr(llm_provider_config, "api_key", None),
                    llm_model=current_config.agent.llm_model,
                    endpoint=getattr(llm_provider_config, "endpoint", None),
                    project_id=getattr(llm_provider_config, "project_id", None),
                    test_completion=True,  # Full validation with completion test - ensures provider health
                )
                logger.info(f"LLM provider setup validation completed successfully for {llm_provider}")

            # Validate embedding provider if set
            if "embedding_provider" in body or "embedding_model" in body:
                embedding_provider = current_config.knowledge.embedding_provider.lower()
                embedding_provider_config = current_config.get_embedding_provider_config()

                logger.info(f"Validating embedding provider setup for {embedding_provider} (full validation with completion test)")
                await validate_provider_setup(
                    provider=embedding_provider,
                    api_key=getattr(embedding_provider_config, "api_key", None),
                    embedding_model=current_config.knowledge.embedding_model,
                    endpoint=getattr(embedding_provider_config, "endpoint", None),
                    project_id=getattr(embedding_provider_config, "project_id", None),
                    test_completion=True,  # Full validation with completion test - ensures provider health
                )
                logger.info(f"Embedding provider setup validation completed successfully for {embedding_provider}")
        except Exception as e:
            logger.error(f"Provider validation failed: {str(e)}")
            return JSONResponse(
                {"error": str(e)},
                status_code=400,
            )

        # Set Langflow global variables and model values based on provider configuration
        try:
            # Check if any provider-related fields were provided
            provider_fields_provided = any(key in body for key in [
                "openai_api_key", "anthropic_api_key",
                "watsonx_api_key", "watsonx_endpoint", "watsonx_project_id",
                "ollama_endpoint"
            ])

            # Update global variables if any provider fields were provided
            # or if existing config has values (for OpenAI/Anthropic that might already be set)
            if (provider_fields_provided or
                current_config.providers.openai.api_key != "" or
                current_config.providers.anthropic.api_key != ""):
                await _update_langflow_global_variables(current_config)

            if "embedding_provider" in body or "embedding_model" in body:
                await _update_mcp_servers_with_provider_credentials(current_config, session_manager)

            # Update model values if provider or model fields were provided
            if "llm_provider" in body or "llm_model" in body or "embedding_provider" in body or "embedding_model" in body:
                await _update_langflow_model_values(current_config, flows_service)

        except Exception as e:
            logger.error(
                "Failed to set Langflow global variables and model values",
                error=str(e),
            )
            raise

        # Initialize the OpenSearch index if embedding model is configured
        if "embedding_model" in body or "embedding_provider" in body:
            try:
                # Import here to avoid circular imports
                from main import init_index

                logger.info(
                    "Initializing OpenSearch index after onboarding configuration"
                )
                await init_index()
                logger.info("OpenSearch index initialization completed successfully")
            except Exception as e:
                if isinstance(e, ValueError):
                    logger.error(
                        "Failed to initialize OpenSearch index after onboarding",
                        error=str(e),
                    )
                    return JSONResponse(
                        {
                            "error": str(e),
                            "edited": True,
                        },
                        status_code=400,
                    )
                logger.error(
                    "Failed to initialize OpenSearch index after onboarding",
                    error=str(e),
                )
                # Don't fail the entire onboarding process if index creation fails
                # The application can still work, but document operations may fail

            # Handle sample data ingestion if requested
            if should_ingest_sample_data:
                try:
                    # Import the function here to avoid circular imports
                    from main import ingest_default_documents_when_ready

                    # Get services from the current app state
                    # We need to access the app instance to get services
                    app = request.scope.get("app")
                    if app and hasattr(app.state, "services"):
                        services = app.state.services
                        logger.info(
                            "Starting sample data ingestion as requested in onboarding"
                        )
                        await ingest_default_documents_when_ready(services)
                        logger.info("Sample data ingestion completed successfully")
                    else:
                        logger.error(
                            "Could not access services for sample data ingestion"
                        )

                except Exception as e:
                    logger.error(
                        "Failed to complete sample data ingestion", error=str(e)
                    )
                    # Don't fail the entire onboarding process if sample data fails

        if config_manager.save_config_file(current_config):
            updated_fields = [
                k for k in body.keys() if k != "sample_data"
            ]  # Exclude sample_data from log
            logger.info(
                "Onboarding configuration updated successfully",
                updated_fields=updated_fields,
            )

            # Mark config as edited and send telemetry with model information
            current_config.edited = True

            # Build metadata with selected models
            onboarding_metadata = {}
            if llm_provider_selected:
                onboarding_metadata["llm_provider"] = llm_provider_selected
            if llm_model_selected:
                onboarding_metadata["llm_model"] = llm_model_selected
            if embedding_provider_selected:
                onboarding_metadata["embedding_provider"] = embedding_provider_selected
            if embedding_model_selected:
                onboarding_metadata["embedding_model"] = embedding_model_selected

            await TelemetryClient.send_event(
                Category.ONBOARDING,
                MessageId.ORB_ONBOARD_CONFIG_EDITED,
                metadata=onboarding_metadata
            )
            await TelemetryClient.send_event(
                Category.ONBOARDING,
                MessageId.ORB_ONBOARD_COMPLETE,
                metadata=onboarding_metadata
            )
            logger.info("Configuration marked as edited after onboarding")

        else:
            await TelemetryClient.send_event(
                Category.ONBOARDING,
                MessageId.ORB_ONBOARD_FAILED
            )
            return JSONResponse(
                {"error": "Failed to save configuration"}, status_code=500
            )

        # Refresh cached patched client so latest credentials take effect immediately
        await clients.refresh_patched_client()

        # Create OpenRAG Docs knowledge filter if sample data was ingested
        # Only create on embedding step to avoid duplicates (both LLM and embedding cards submit with sample_data)
        openrag_docs_filter_id = None
        if should_ingest_sample_data and ("embedding_provider" in body or "embedding_model" in body):
            try:
                openrag_docs_filter_id = await _create_openrag_docs_filter(
                    request, session_manager
                )
                if openrag_docs_filter_id:
                    logger.info(
                        "Created OpenRAG Docs knowledge filter",
                        filter_id=openrag_docs_filter_id,
                    )
            except Exception as e:
                logger.error(
                    "Failed to create OpenRAG Docs knowledge filter", error=str(e)
                )
                # Don't fail onboarding if filter creation fails

        return JSONResponse(
            {
                "message": "Onboarding configuration updated successfully",
                "edited": True,  # Confirm that config is now marked as edited
                "sample_data_ingested": should_ingest_sample_data,
                "openrag_docs_filter_id": openrag_docs_filter_id,
            }
        )

    except Exception as e:
        logger.error("Failed to update onboarding settings", error=str(e))
        await TelemetryClient.send_event(
            Category.ONBOARDING,
            MessageId.ORB_ONBOARD_FAILED
        )
        return JSONResponse(
            {"error": str(e)},
            status_code=500,
        )


async def _create_openrag_docs_filter(request, session_manager):
    """Create the OpenRAG Docs knowledge filter for onboarding"""
    import uuid
    import json
    from datetime import datetime

    # Get knowledge filter service from app state
    app = request.scope.get("app")
    if not app or not hasattr(app.state, "services"):
        logger.error("Could not access services for knowledge filter creation")
        return None

    knowledge_filter_service = app.state.services.get("knowledge_filter_service")
    if not knowledge_filter_service:
        logger.error("Knowledge filter service not available")
        return None

    # Get user and JWT token from request
    user = request.state.user
    jwt_token = session_manager.get_effective_jwt_token(user.user_id, request.state.jwt_token)

    # In no-auth mode, set owner to None so filter is visible to all users
    # In auth mode, use the actual user as owner
    if is_no_auth_mode():
        owner_user_id = None
    else:
        owner_user_id = user.user_id

    # Create the filter document
    filter_id = str(uuid.uuid4())
    query_data = json.dumps({
        "query": "",
        "filters": {
            "data_sources": ["openrag-documentation.pdf"],
            "document_types": ["*"],
            "owners": ["*"],
            "connector_types": ["*"],
        },
        "limit": 10,
        "scoreThreshold": 0,
        "color": "blue",
        "icon": "book",
    })

    filter_doc = {
        "id": filter_id,
        "name": "OpenRAG Docs",
        "description": "Filter for OpenRAG documentation",
        "query_data": query_data,
        "owner": owner_user_id,
        "allowed_users": [],
        "allowed_groups": [],
        "created_at": datetime.utcnow().isoformat(),
        "updated_at": datetime.utcnow().isoformat(),
    }

    result = await knowledge_filter_service.create_knowledge_filter(
        filter_doc, user_id=user.user_id, jwt_token=jwt_token
    )

    if result.get("success"):
        return filter_id
    else:
        logger.error("Failed to create OpenRAG Docs filter", error=result.get("error"))
        return None


def _get_flows_service():
    """Helper function to get flows service instance"""
    from services.flows_service import FlowsService

    return FlowsService()


async def _update_langflow_global_variables(config):
    """Update Langflow global variables for all configured providers"""
    try:
        # WatsonX global variables
        if config.providers.watsonx.api_key:
            await clients._create_langflow_global_variable(
                "WATSONX_API_KEY", config.providers.watsonx.api_key, modify=True
            )
            logger.info("Set WATSONX_API_KEY global variable in Langflow")

        if config.providers.watsonx.project_id:
            await clients._create_langflow_global_variable(
                "WATSONX_PROJECT_ID", config.providers.watsonx.project_id, modify=True
            )
            logger.info("Set WATSONX_PROJECT_ID global variable in Langflow")

        # OpenAI global variables
        if config.providers.openai.api_key:
            await clients._create_langflow_global_variable(
                "OPENAI_API_KEY", config.providers.openai.api_key, modify=True
            )
            logger.info("Set OPENAI_API_KEY global variable in Langflow")

        # Anthropic global variables
        if config.providers.anthropic.api_key:
            await clients._create_langflow_global_variable(
                "ANTHROPIC_API_KEY", config.providers.anthropic.api_key, modify=True
            )
            logger.info("Set ANTHROPIC_API_KEY global variable in Langflow")

        # Ollama global variables
        if config.providers.ollama.endpoint:
            endpoint = transform_localhost_url(config.providers.ollama.endpoint)
            await clients._create_langflow_global_variable(
                "OLLAMA_BASE_URL", endpoint, modify=True
            )
            logger.info("Set OLLAMA_BASE_URL global variable in Langflow")

        if config.knowledge.embedding_model:
            await clients._create_langflow_global_variable(
                "SELECTED_EMBEDDING_MODEL", config.knowledge.embedding_model, modify=True
            )
            logger.info(
                f"Set SELECTED_EMBEDDING_MODEL global variable to {config.knowledge.embedding_model}"
            )

    except Exception as e:
        logger.error(f"Failed to update Langflow global variables: {str(e)}")
        raise


async def _update_mcp_servers_with_provider_credentials(config, session_manager = None):
    # Update MCP servers with provider credentials
    try:
        from services.langflow_mcp_service import LangflowMCPService
        from utils.langflow_headers import build_mcp_global_vars_from_config

        mcp_service = LangflowMCPService()

        # Build global vars using utility function
        mcp_global_vars = build_mcp_global_vars_from_config(config)

        # In no-auth mode, add the anonymous JWT token and user details
        if is_no_auth_mode() and session_manager:
            from session_manager import AnonymousUser

            # Create/get anonymous JWT for no-auth mode
            anonymous_jwt = session_manager.get_effective_jwt_token(None, None)
            if anonymous_jwt:
                mcp_global_vars["JWT"] = anonymous_jwt

            # Add anonymous user details
            anonymous_user = AnonymousUser()
            mcp_global_vars["OWNER"] = anonymous_user.user_id  # "anonymous"
            mcp_global_vars["OWNER_NAME"] = f'"{anonymous_user.name}"'  # "Anonymous User" (quoted)
            mcp_global_vars["OWNER_EMAIL"] = anonymous_user.email  # "anonymous@localhost"

            logger.debug("Added anonymous JWT and user details to MCP servers for no-auth mode")

        if mcp_global_vars:
            result = await mcp_service.update_mcp_servers_with_global_vars(mcp_global_vars)
            logger.info("Updated MCP servers with provider credentials after settings change", **result)

    except Exception as mcp_error:
        logger.warning(f"Failed to update MCP servers after settings change: {str(mcp_error)}")
        # Don't fail the entire settings update if MCP update fails


async def _update_langflow_model_values(config, flows_service):
    """Update model values across Langflow flows"""
    try:
        # Update LLM model values
        llm_provider = config.agent.llm_provider.lower()
        llm_provider_config = config.get_llm_provider_config()
        llm_endpoint = getattr(llm_provider_config, "endpoint", None)

        await flows_service.change_langflow_model_value(
            llm_provider,
            llm_model=config.agent.llm_model,
            endpoint=llm_endpoint,
        )
        logger.info(
            f"Successfully updated Langflow flows for LLM provider {llm_provider}"
        )

        # Update embedding model values
        embedding_provider = config.knowledge.embedding_provider.lower()
        embedding_provider_config = config.get_embedding_provider_config()
        embedding_endpoint = getattr(embedding_provider_config, "endpoint", None)

        await flows_service.change_langflow_model_value(
            embedding_provider,
            embedding_model=config.knowledge.embedding_model,
            endpoint=embedding_endpoint,
        )
        logger.info(
            f"Successfully updated Langflow flows for embedding provider {embedding_provider}"
        )

    except Exception as e:
        logger.error(f"Failed to update Langflow model values: {str(e)}")
        raise


async def _update_langflow_system_prompt(config, flows_service):
    """Update system prompt in chat flow"""
    try:
        llm_provider = config.agent.llm_provider.lower()
        await flows_service.update_chat_flow_system_prompt(
            config.agent.system_prompt, llm_provider
        )
        logger.info("Successfully updated chat flow system prompt")
    except Exception as e:
        logger.error(f"Failed to update chat flow system prompt: {str(e)}")
        raise


async def _update_langflow_docling_settings(config, flows_service):
    """Update docling settings in ingest flow"""
    try:
        preset_config = get_docling_preset_configs(
            table_structure=config.knowledge.table_structure,
            ocr=config.knowledge.ocr,
            picture_descriptions=config.knowledge.picture_descriptions,
        )
        await flows_service.update_flow_docling_preset("custom", preset_config)
        logger.info("Successfully updated docling settings in ingest flow")
    except Exception as e:
        logger.error(f"Failed to update docling settings: {str(e)}")
        raise


async def _update_langflow_chunk_settings(config, flows_service):
    """Update chunk size and overlap in ingest flow"""
    try:
        await flows_service.update_ingest_flow_chunk_size(config.knowledge.chunk_size)
        logger.info(f"Successfully updated ingest flow chunk size to {config.knowledge.chunk_size}")

        await flows_service.update_ingest_flow_chunk_overlap(config.knowledge.chunk_overlap)
        logger.info(f"Successfully updated ingest flow chunk overlap to {config.knowledge.chunk_overlap}")
    except Exception as e:
        logger.error(f"Failed to update chunk settings: {str(e)}")
        raise


async def reapply_all_settings(session_manager = None):
    """
    Reapply all current configuration settings to Langflow flows and global variables.
    This is called when flows are detected to have been reset.
    """
    try:
        config = get_openrag_config()
        flows_service = _get_flows_service()

        logger.info("Reapplying all settings to Langflow flows and global variables")

        if config.knowledge.embedding_model or config.knowledge.embedding_provider:
            await _update_mcp_servers_with_provider_credentials(config, session_manager)
        else:
            logger.info("No embedding model or provider configured, skipping MCP server update")

        # Update all Langflow settings using helper functions
        try:
            await _update_langflow_global_variables(config)
        except Exception as e:
            logger.error(f"Failed to update Langflow global variables: {str(e)}")
            # Continue with other updates even if global variables fail

        try:
            await _update_langflow_model_values(config, flows_service)
        except Exception as e:
            logger.error(f"Failed to update Langflow model values: {str(e)}")

        try:
            await _update_langflow_system_prompt(config, flows_service)
        except Exception as e:
            logger.error(f"Failed to update Langflow system prompt: {str(e)}")

        try:
            await _update_langflow_docling_settings(config, flows_service)
        except Exception as e:
            logger.error(f"Failed to update Langflow docling settings: {str(e)}")

        try:
            await _update_langflow_chunk_settings(config, flows_service)
        except Exception as e:
            logger.error(f"Failed to update Langflow chunk settings: {str(e)}")

        logger.info("Successfully reapplied all settings to Langflow flows")

    except Exception as e:
        logger.error(f"Failed to reapply settings: {str(e)}")
        raise


async def rollback_onboarding(request, session_manager, task_service):
    """Rollback onboarding configuration when sample data files fail.

    This will:
    1. Cancel all active tasks
    2. Delete successfully ingested knowledge documents
    3. Reset configuration to allow re-onboarding
    """
    try:
        # Get current configuration
        current_config = get_openrag_config()

        # Only allow rollback if config was marked as edited (onboarding completed)
        if not current_config.edited:
            return JSONResponse(
                {"error": "No onboarding configuration to rollback"}, status_code=400
            )

        user = request.state.user
        jwt_token = session_manager.get_effective_jwt_token(user.user_id, request.state.jwt_token)

        logger.info("Rolling back onboarding configuration due to file failures")

        # Get all tasks for the user
        all_tasks = task_service.get_all_tasks(user.user_id)

        cancelled_tasks = []
        deleted_files = []

        # Cancel all active tasks and collect successfully ingested files
        for task_data in all_tasks:
            task_id = task_data.get("task_id")
            task_status = task_data.get("status")

            # Cancel active tasks (pending, running, processing)
            if task_status in ["pending", "running", "processing"]:
                try:
                    success = await task_service.cancel_task(user.user_id, task_id)
                    if success:
                        cancelled_tasks.append(task_id)
                        logger.info(f"Cancelled task {task_id}")
                except Exception as e:
                    logger.error(f"Failed to cancel task {task_id}: {str(e)}")

            # For completed tasks, find successfully ingested files and delete them
            elif task_status == "completed":
                files = task_data.get("files", {})
                if isinstance(files, dict):
                    for file_path, file_info in files.items():
                        # Check if file was successfully ingested
                        if isinstance(file_info, dict):
                            file_status = file_info.get("status")
                            filename = file_info.get("filename") or file_path.split("/")[-1]

                            if file_status == "completed" and filename:
                                try:
                                    # Get user's OpenSearch client
                                    opensearch_client = session_manager.get_user_opensearch_client(
                                        user.user_id, jwt_token
                                    )

                                    # Delete documents by filename
                                    from utils.opensearch_queries import build_filename_delete_body
                                    from config.settings import INDEX_NAME

                                    delete_query = build_filename_delete_body(filename)

                                    result = await opensearch_client.delete_by_query(
                                        index=INDEX_NAME,
                                        body=delete_query,
                                        conflicts="proceed"
                                    )

                                    deleted_count = result.get("deleted", 0)
                                    if deleted_count > 0:
                                        deleted_files.append(filename)
                                        logger.info(f"Deleted {deleted_count} chunks for filename {filename}")
                                except Exception as e:
                                    logger.error(f"Failed to delete documents for {filename}: {str(e)}")

        # Clear embedding provider and model settings
        current_config.knowledge.embedding_provider = "openai"  # Reset to default
        current_config.knowledge.embedding_model = ""

        # Mark config as not edited so user can go through onboarding again
        current_config.edited = False

        # Save the rolled back configuration manually to avoid save_config_file setting edited=True
        try:
            import yaml
            config_file = config_manager.config_file

            # Ensure directory exists
            config_file.parent.mkdir(parents=True, exist_ok=True)

            # Save config with edited=False
            with open(config_file, "w") as f:
                yaml.dump(current_config.to_dict(), f, default_flow_style=False, indent=2)

            # Update cached config
            config_manager._config = current_config

            logger.info("Successfully saved rolled back configuration with edited=False")
        except Exception as e:
            logger.error(f"Failed to save rolled back configuration: {e}")
            return JSONResponse(
                {"error": "Failed to save rolled back configuration"}, status_code=500
            )

        logger.info(
            f"Successfully rolled back onboarding configuration. "
            f"Cancelled {len(cancelled_tasks)} tasks, deleted {len(deleted_files)} files"
        )
        await TelemetryClient.send_event(
            Category.ONBOARDING,
            MessageId.ORB_ONBOARD_ROLLBACK
        )

        return JSONResponse(
            {
                "message": "Onboarding configuration rolled back successfully",
                "cancelled_tasks": len(cancelled_tasks),
                "deleted_files": len(deleted_files),
            }
        )

    except Exception as e:
        logger.error("Failed to rollback onboarding configuration", error=str(e))
        return JSONResponse(
            {"error": f"Failed to rollback onboarding: {str(e)}"}, status_code=500
        )


async def update_docling_preset(request, session_manager):
    """Update docling settings in the ingest flow - deprecated endpoint, use /settings instead"""
    try:
        # Parse request body
        body = await request.json()

        # Support old preset-based API for backwards compatibility
        if "preset" in body:
            # Map old presets to new toggle settings
            preset_map = {
                "standard": {
                    "table_structure": False,
                    "ocr": False,
                    "picture_descriptions": False,
                },
                "ocr": {
                    "table_structure": False,
                    "ocr": True,
                    "picture_descriptions": False,
                },
                "picture_description": {
                    "table_structure": False,
                    "ocr": True,
                    "picture_descriptions": True,
                },
                "VLM": {
                    "table_structure": False,
                    "ocr": False,
                    "picture_descriptions": False,
                },
            }

            preset = body["preset"]
            if preset not in preset_map:
                return JSONResponse(
                    {
                        "error": f"Invalid preset '{preset}'. Valid presets: {', '.join(preset_map.keys())}"
                    },
                    status_code=400,
                )

            settings = preset_map[preset]
        else:
            # Support new toggle-based API
            settings = {
                "table_structure": body.get("table_structure", False),
                "ocr": body.get("ocr", False),
                "picture_descriptions": body.get("picture_descriptions", False),
            }

        # Get the preset configuration
        preset_config = get_docling_preset_configs(**settings)

        # Use the helper function to update the flow
        flows_service = _get_flows_service()
        await flows_service.update_flow_docling_preset("custom", preset_config)

        logger.info("Successfully updated docling settings in ingest flow")

        return JSONResponse(
            {
                "message": "Successfully updated docling settings",
                "settings": settings,
                "preset_config": preset_config,
            }
        )

    except Exception as e:
        logger.error("Failed to update docling settings", error=str(e))
        return JSONResponse(
            {"error": f"Failed to update docling settings: {str(e)}"}, status_code=500
        )