Added embedding configuration after onboarding
This commit is contained in:
parent
5dfb3dadf0
commit
27cff5650b
3 changed files with 123 additions and 6 deletions
|
|
@@ -556,6 +556,19 @@ async def onboarding(request, flows_service):
|
||||||
)
|
)
|
||||||
# Continue even if setting global variables fails
|
# Continue even if setting global variables fails
|
||||||
|
|
||||||
|
# Initialize the OpenSearch index now that we have the embedding model configured
|
||||||
|
try:
|
||||||
|
# Import here to avoid circular imports
|
||||||
|
from main import init_index
|
||||||
|
|
||||||
|
logger.info("Initializing OpenSearch index after onboarding configuration")
|
||||||
|
await init_index()
|
||||||
|
logger.info("OpenSearch index initialization completed successfully")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Failed to initialize OpenSearch index after onboarding", error=str(e))
|
||||||
|
# Don't fail the entire onboarding process if index creation fails
|
||||||
|
# The application can still work, but document operations may fail
|
||||||
|
|
||||||
# Handle sample data ingestion if requested
|
# Handle sample data ingestion if requested
|
||||||
if should_ingest_sample_data:
|
if should_ingest_sample_data:
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
24
src/main.py
24
src/main.py
|
|
@ -2,6 +2,7 @@
|
||||||
from connectors.langflow_connector_service import LangflowConnectorService
|
from connectors.langflow_connector_service import LangflowConnectorService
|
||||||
from connectors.service import ConnectorService
|
from connectors.service import ConnectorService
|
||||||
from services.flows_service import FlowsService
|
from services.flows_service import FlowsService
|
||||||
|
from utils.embeddings import create_dynamic_index_body
|
||||||
from utils.logging_config import configure_from_env, get_logger
|
from utils.logging_config import configure_from_env, get_logger
|
||||||
|
|
||||||
configure_from_env()
|
configure_from_env()
|
||||||
|
|
@ -52,11 +53,11 @@ from auth_middleware import optional_auth, require_auth
|
||||||
from config.settings import (
|
from config.settings import (
|
||||||
DISABLE_INGEST_WITH_LANGFLOW,
|
DISABLE_INGEST_WITH_LANGFLOW,
|
||||||
EMBED_MODEL,
|
EMBED_MODEL,
|
||||||
INDEX_BODY,
|
|
||||||
INDEX_NAME,
|
INDEX_NAME,
|
||||||
SESSION_SECRET,
|
SESSION_SECRET,
|
||||||
clients,
|
clients,
|
||||||
is_no_auth_mode,
|
is_no_auth_mode,
|
||||||
|
get_openrag_config,
|
||||||
)
|
)
|
||||||
from services.auth_service import AuthService
|
from services.auth_service import AuthService
|
||||||
from services.langflow_mcp_service import LangflowMCPService
|
from services.langflow_mcp_service import LangflowMCPService
|
||||||
|
|
@ -81,7 +82,6 @@ logger.info(
|
||||||
cuda_version=torch.version.cuda,
|
cuda_version=torch.version.cuda,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
async def wait_for_opensearch():
|
async def wait_for_opensearch():
|
||||||
"""Wait for OpenSearch to be ready with retries"""
|
"""Wait for OpenSearch to be ready with retries"""
|
||||||
max_retries = 30
|
max_retries = 30
|
||||||
|
|
@@ -132,12 +132,19 @@ async def init_index():
|
||||||
"""Initialize OpenSearch index and security roles"""
|
"""Initialize OpenSearch index and security roles"""
|
||||||
await wait_for_opensearch()
|
await wait_for_opensearch()
|
||||||
|
|
||||||
|
# Get the configured embedding model from user configuration
|
||||||
|
config = get_openrag_config()
|
||||||
|
embedding_model = config.knowledge.embedding_model
|
||||||
|
|
||||||
|
# Create dynamic index body based on the configured embedding model
|
||||||
|
dynamic_index_body = create_dynamic_index_body(embedding_model)
|
||||||
|
|
||||||
# Create documents index
|
# Create documents index
|
||||||
if not await clients.opensearch.indices.exists(index=INDEX_NAME):
|
if not await clients.opensearch.indices.exists(index=INDEX_NAME):
|
||||||
await clients.opensearch.indices.create(index=INDEX_NAME, body=INDEX_BODY)
|
await clients.opensearch.indices.create(index=INDEX_NAME, body=dynamic_index_body)
|
||||||
logger.info("Created OpenSearch index", index_name=INDEX_NAME)
|
logger.info("Created OpenSearch index", index_name=INDEX_NAME, embedding_model=embedding_model)
|
||||||
else:
|
else:
|
||||||
logger.info("Index already exists, skipping creation", index_name=INDEX_NAME)
|
logger.info("Index already exists, skipping creation", index_name=INDEX_NAME, embedding_model=embedding_model)
|
||||||
|
|
||||||
# Create knowledge filters index
|
# Create knowledge filters index
|
||||||
knowledge_filter_index_name = "knowledge_filters"
|
knowledge_filter_index_name = "knowledge_filters"
|
||||||
|
|
@ -391,7 +398,12 @@ async def _ingest_default_documents_openrag(services, file_paths):
|
||||||
async def startup_tasks(services):
|
async def startup_tasks(services):
|
||||||
"""Startup tasks"""
|
"""Startup tasks"""
|
||||||
logger.info("Starting startup tasks")
|
logger.info("Starting startup tasks")
|
||||||
await init_index()
|
# Only initialize basic OpenSearch connection, not the index
|
||||||
|
# Index will be created after onboarding when we know the embedding model
|
||||||
|
await wait_for_opensearch()
|
||||||
|
|
||||||
|
# Configure alerting security
|
||||||
|
await configure_alerting_security()
|
||||||
|
|
||||||
|
|
||||||
async def initialize_services():
|
async def initialize_services():
|
||||||
|
|
|
||||||
92
src/utils/embeddings.py
Normal file
92
src/utils/embeddings.py
Normal file
|
|
@@ -0,0 +1,92 @@
|
||||||
|
from utils.logging_config import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
def get_embedding_dimensions(model_name: str) -> int:
    """Return the vector dimensionality for a known embedding model.

    Args:
        model_name: Provider-qualified embedding model identifier, e.g.
            "text-embedding-3-small" or "ibm/slate-125m-english-rtrvr".

    Returns:
        The embedding dimension for the model, or 1536 (the OpenAI
        ada-002 / 3-small size) when the model is not recognized.
    """
    # Single lookup table instead of three per-provider dicts merged on
    # every call; grouping is kept as comments for readability.
    known_dimensions = {
        # OpenAI models
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
        # Ollama models (common embedding models)
        "nomic-embed-text": 768,
        "all-minilm": 384,
        "mxbai-embed-large": 1024,
        # watsonx.ai — IBM models
        "ibm/granite-embedding-107m-multilingual": 384,
        "ibm/granite-embedding-278m-multilingual": 1024,
        "ibm/slate-125m-english-rtrvr": 768,
        "ibm/slate-125m-english-rtrvr-v2": 768,
        "ibm/slate-30m-english-rtrvr": 384,
        "ibm/slate-30m-english-rtrvr-v2": 384,
        # watsonx.ai — third-party models
        "intfloat/multilingual-e5-large": 1024,
        "sentence-transformers/all-minilm-l6-v2": 384,
    }

    dimensions = known_dimensions.get(model_name)
    if dimensions is not None:
        # Structured kwargs logging to match the rest of the codebase
        # (e.g. logger.info("Created OpenSearch index", index_name=...)).
        logger.info(
            "Found dimensions for embedding model",
            model=model_name,
            dimensions=dimensions,
        )
        return dimensions

    # Fallback so index creation still succeeds for an unknown model;
    # vector writes will fail later if the real dimension differs.
    default_dimensions = 1536
    logger.warning(
        "Unknown embedding model, using default dimensions",
        model=model_name,
        default_dimensions=default_dimensions,
    )
    return default_dimensions
|
||||||
|
|
||||||
|
|
||||||
|
def create_dynamic_index_body(embedding_model: str) -> dict:
    """Build the OpenSearch index body sized for the configured model.

    The knn_vector field dimension is resolved via
    get_embedding_dimensions() so the index matches whatever embedding
    model the user selected during onboarding.
    """
    vector_dimensions = get_embedding_dimensions(embedding_model)

    # Disk-based ANN via the jvector engine over L2 distance; only the
    # dimension varies with the configured model.
    embedding_mapping = {
        "type": "knn_vector",
        "dimension": vector_dimensions,
        "method": {
            "name": "disk_ann",
            "engine": "jvector",
            "space_type": "l2",
            "parameters": {"ef_construction": 100, "m": 16},
        },
    }

    index_settings = {
        "index": {"knn": True},
        "number_of_shards": 1,
        "number_of_replicas": 1,
    }

    document_properties = {
        "document_id": {"type": "keyword"},
        "filename": {"type": "keyword"},
        "mimetype": {"type": "keyword"},
        "page": {"type": "integer"},
        "text": {"type": "text"},
        "chunk_embedding": embedding_mapping,
        "source_url": {"type": "keyword"},
        "connector_type": {"type": "keyword"},
        "owner": {"type": "keyword"},
        "allowed_users": {"type": "keyword"},
        "allowed_groups": {"type": "keyword"},
        "user_permissions": {"type": "object"},
        "group_permissions": {"type": "object"},
        "created_time": {"type": "date"},
        "modified_time": {"type": "date"},
        "indexed_time": {"type": "date"},
        "metadata": {"type": "object"},
    }

    return {
        "settings": index_settings,
        "mappings": {"properties": document_properties},
    }
|
||||||
Loading…
Add table
Reference in a new issue