Added embedding configuration after onboarding

This commit is contained in:
Lucas Oliveira 2025-09-25 16:48:30 -03:00
parent 5dfb3dadf0
commit 27cff5650b
3 changed files with 123 additions and 6 deletions

View file

@ -556,6 +556,19 @@ async def onboarding(request, flows_service):
)
# Continue even if setting global variables fails
# Initialize the OpenSearch index now that we have the embedding model configured
try:
# Import here to avoid circular imports
from main import init_index
logger.info("Initializing OpenSearch index after onboarding configuration")
await init_index()
logger.info("OpenSearch index initialization completed successfully")
except Exception as e:
logger.error("Failed to initialize OpenSearch index after onboarding", error=str(e))
# Don't fail the entire onboarding process if index creation fails
# The application can still work, but document operations may fail
# Handle sample data ingestion if requested
if should_ingest_sample_data:
try:

View file

@ -2,6 +2,7 @@
from connectors.langflow_connector_service import LangflowConnectorService
from connectors.service import ConnectorService
from services.flows_service import FlowsService
from utils.embeddings import create_dynamic_index_body
from utils.logging_config import configure_from_env, get_logger
configure_from_env()
@ -52,11 +53,11 @@ from auth_middleware import optional_auth, require_auth
from config.settings import (
DISABLE_INGEST_WITH_LANGFLOW,
EMBED_MODEL,
INDEX_BODY,
INDEX_NAME,
SESSION_SECRET,
clients,
is_no_auth_mode,
get_openrag_config,
)
from services.auth_service import AuthService
from services.langflow_mcp_service import LangflowMCPService
@ -81,7 +82,6 @@ logger.info(
cuda_version=torch.version.cuda,
)
async def wait_for_opensearch():
"""Wait for OpenSearch to be ready with retries"""
max_retries = 30
@ -132,12 +132,19 @@ async def init_index():
"""Initialize OpenSearch index and security roles"""
await wait_for_opensearch()
# Get the configured embedding model from user configuration
config = get_openrag_config()
embedding_model = config.knowledge.embedding_model
# Create dynamic index body based on the configured embedding model
dynamic_index_body = create_dynamic_index_body(embedding_model)
# Create documents index
if not await clients.opensearch.indices.exists(index=INDEX_NAME):
await clients.opensearch.indices.create(index=INDEX_NAME, body=INDEX_BODY)
logger.info("Created OpenSearch index", index_name=INDEX_NAME)
await clients.opensearch.indices.create(index=INDEX_NAME, body=dynamic_index_body)
logger.info("Created OpenSearch index", index_name=INDEX_NAME, embedding_model=embedding_model)
else:
logger.info("Index already exists, skipping creation", index_name=INDEX_NAME)
logger.info("Index already exists, skipping creation", index_name=INDEX_NAME, embedding_model=embedding_model)
# Create knowledge filters index
knowledge_filter_index_name = "knowledge_filters"
@ -391,7 +398,12 @@ async def _ingest_default_documents_openrag(services, file_paths):
async def startup_tasks(services):
"""Startup tasks"""
logger.info("Starting startup tasks")
await init_index()
# Only initialize basic OpenSearch connection, not the index
# Index will be created after onboarding when we know the embedding model
await wait_for_opensearch()
# Configure alerting security
await configure_alerting_security()
async def initialize_services():

92
src/utils/embeddings.py Normal file
View file

@ -0,0 +1,92 @@
from utils.logging_config import get_logger
logger = get_logger(__name__)
def get_embedding_dimensions(model_name: str) -> int:
    """Return the embedding vector dimensionality for a given model name.

    Looks the model up in a table of known OpenAI, Ollama, and IBM watsonx
    embedding models. Unknown models fall back to 1536 dimensions (the
    common OpenAI size) with a warning, so index creation never fails
    outright on an unrecognized model name.

    Args:
        model_name: Provider model identifier, e.g. "text-embedding-3-small".

    Returns:
        The number of dimensions of the model's embedding vectors.
    """
    # Single lookup table (previously three per-provider dicts merged on
    # every call); grouped by provider for readability.
    known_dimensions = {
        # OpenAI models
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
        # Ollama models (common embedding models)
        "nomic-embed-text": 768,
        "all-minilm": 384,
        "mxbai-embed-large": 1024,
        # Watson/IBM models
        "ibm/granite-embedding-107m-multilingual": 384,
        "ibm/granite-embedding-278m-multilingual": 1024,
        "ibm/slate-125m-english-rtrvr": 768,
        "ibm/slate-125m-english-rtrvr-v2": 768,
        "ibm/slate-30m-english-rtrvr": 384,
        "ibm/slate-30m-english-rtrvr-v2": 384,
        # Third Party Models (watsonx-hosted)
        "intfloat/multilingual-e5-large": 1024,
        "sentence-transformers/all-minilm-l6-v2": 384,
    }

    # Single .get instead of `in` test + subscript (avoids a double lookup).
    dimensions = known_dimensions.get(model_name)
    if dimensions is not None:
        logger.info(f"Found dimensions for model '{model_name}': {dimensions}")
        return dimensions

    # Default fallback for unknown models.
    default_dimensions = 1536
    logger.warning(
        f"Unknown embedding model '{model_name}', using default dimensions: {default_dimensions}"
    )
    return default_dimensions
def create_dynamic_index_body(embedding_model: str) -> dict:
    """Build the OpenSearch index body for the configured embedding model.

    The kNN vector field's dimension is derived from *embedding_model* so
    the index mapping always matches the model chosen during onboarding.

    Args:
        embedding_model: Name of the embedding model backing the index.

    Returns:
        A dict with "settings" and "mappings" suitable for index creation.
    """
    dimensions = get_embedding_dimensions(embedding_model)

    # Scalar fields preceding the vector field, in mapping order.
    leading_fields = [
        ("document_id", "keyword"),
        ("filename", "keyword"),
        ("mimetype", "keyword"),
        ("page", "integer"),
        ("text", "text"),
    ]
    # Permission, provenance, and timestamp fields following the vector.
    trailing_fields = [
        ("source_url", "keyword"),
        ("connector_type", "keyword"),
        ("owner", "keyword"),
        ("allowed_users", "keyword"),
        ("allowed_groups", "keyword"),
        ("user_permissions", "object"),
        ("group_permissions", "object"),
        ("created_time", "date"),
        ("modified_time", "date"),
        ("indexed_time", "date"),
        ("metadata", "object"),
    ]

    properties = {name: {"type": kind} for name, kind in leading_fields}
    properties["chunk_embedding"] = {
        "type": "knn_vector",
        "dimension": dimensions,
        "method": {
            "name": "disk_ann",
            "engine": "jvector",
            "space_type": "l2",
            "parameters": {"ef_construction": 100, "m": 16},
        },
    }
    properties.update({name: {"type": kind} for name, kind in trailing_fields})

    return {
        "settings": {
            "index": {"knn": True},
            "number_of_shards": 1,
            "number_of_replicas": 1,
        },
        "mappings": {"properties": properties},
    }