Added embedding configuration after onboarding

This commit is contained in:
Lucas Oliveira 2025-09-25 16:48:30 -03:00
parent 5dfb3dadf0
commit 27cff5650b
3 changed files with 123 additions and 6 deletions

View file

@ -556,6 +556,19 @@ async def onboarding(request, flows_service):
)
# Continue even if setting global variables fails
# Initialize the OpenSearch index now that we have the embedding model configured
try:
# Import here to avoid circular imports
from main import init_index
logger.info("Initializing OpenSearch index after onboarding configuration")
await init_index()
logger.info("OpenSearch index initialization completed successfully")
except Exception as e:
logger.error("Failed to initialize OpenSearch index after onboarding", error=str(e))
# Don't fail the entire onboarding process if index creation fails
# The application can still work, but document operations may fail
# Handle sample data ingestion if requested
if should_ingest_sample_data:
try:

View file

@ -2,6 +2,7 @@
from connectors.langflow_connector_service import LangflowConnectorService
from connectors.service import ConnectorService
from services.flows_service import FlowsService
from utils.embeddings import create_dynamic_index_body
from utils.logging_config import configure_from_env, get_logger
configure_from_env()
@ -52,11 +53,11 @@ from auth_middleware import optional_auth, require_auth
from config.settings import (
DISABLE_INGEST_WITH_LANGFLOW,
EMBED_MODEL,
INDEX_BODY,
INDEX_NAME,
SESSION_SECRET,
clients,
is_no_auth_mode,
get_openrag_config,
)
from services.auth_service import AuthService
from services.langflow_mcp_service import LangflowMCPService
@ -81,7 +82,6 @@ logger.info(
cuda_version=torch.version.cuda,
)
async def wait_for_opensearch():
"""Wait for OpenSearch to be ready with retries"""
max_retries = 30
@ -132,12 +132,19 @@ async def init_index():
"""Initialize OpenSearch index and security roles"""
await wait_for_opensearch()
# Get the configured embedding model from user configuration
config = get_openrag_config()
embedding_model = config.knowledge.embedding_model
# Create dynamic index body based on the configured embedding model
dynamic_index_body = create_dynamic_index_body(embedding_model)
# Create documents index
if not await clients.opensearch.indices.exists(index=INDEX_NAME):
await clients.opensearch.indices.create(index=INDEX_NAME, body=INDEX_BODY)
logger.info("Created OpenSearch index", index_name=INDEX_NAME)
await clients.opensearch.indices.create(index=INDEX_NAME, body=dynamic_index_body)
logger.info("Created OpenSearch index", index_name=INDEX_NAME, embedding_model=embedding_model)
else:
logger.info("Index already exists, skipping creation", index_name=INDEX_NAME)
logger.info("Index already exists, skipping creation", index_name=INDEX_NAME, embedding_model=embedding_model)
# Create knowledge filters index
knowledge_filter_index_name = "knowledge_filters"
@ -391,7 +398,12 @@ async def _ingest_default_documents_openrag(services, file_paths):
async def startup_tasks(services):
"""Startup tasks"""
logger.info("Starting startup tasks")
await init_index()
# Only initialize basic OpenSearch connection, not the index
# Index will be created after onboarding when we know the embedding model
await wait_for_opensearch()
# Configure alerting security
await configure_alerting_security()
async def initialize_services():

92
src/utils/embeddings.py Normal file
View file

@ -0,0 +1,92 @@
from utils.logging_config import get_logger
logger = get_logger(__name__)
def get_embedding_dimensions(model_name: str) -> int:
    """Return the embedding vector dimensionality for a given model name.

    Looks the model up in a table of known OpenAI, Ollama, and IBM watsonx
    embedding models. Unknown models fall back to 1536 dimensions (the
    common OpenAI size) with a warning, so index creation never fails
    outright on an unrecognized model name.

    Args:
        model_name: Provider model identifier, e.g. "text-embedding-3-small".

    Returns:
        The number of dimensions of the model's embedding vectors.
    """
    # Single lookup table (previously three per-provider dicts merged on
    # every call); grouped by provider for readability.
    known_dimensions = {
        # OpenAI models
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
        # Ollama models (common embedding models)
        "nomic-embed-text": 768,
        "all-minilm": 384,
        "mxbai-embed-large": 1024,
        # Watson/IBM models
        "ibm/granite-embedding-107m-multilingual": 384,
        "ibm/granite-embedding-278m-multilingual": 1024,
        "ibm/slate-125m-english-rtrvr": 768,
        "ibm/slate-125m-english-rtrvr-v2": 768,
        "ibm/slate-30m-english-rtrvr": 384,
        "ibm/slate-30m-english-rtrvr-v2": 384,
        # Third Party Models (watsonx-hosted)
        "intfloat/multilingual-e5-large": 1024,
        "sentence-transformers/all-minilm-l6-v2": 384,
    }

    # Single .get instead of `in` test + subscript (avoids a double lookup).
    dimensions = known_dimensions.get(model_name)
    if dimensions is not None:
        logger.info(f"Found dimensions for model '{model_name}': {dimensions}")
        return dimensions

    # Default fallback for unknown models.
    default_dimensions = 1536
    logger.warning(
        f"Unknown embedding model '{model_name}', using default dimensions: {default_dimensions}"
    )
    return default_dimensions
def create_dynamic_index_body(embedding_model: str) -> dict:
    """Build the OpenSearch index body for the configured embedding model.

    The kNN vector field's dimension is derived from *embedding_model* so
    the index mapping always matches the model chosen during onboarding.

    Args:
        embedding_model: Name of the embedding model backing the index.

    Returns:
        A dict with "settings" and "mappings" suitable for index creation.
    """
    dimensions = get_embedding_dimensions(embedding_model)

    # Scalar fields preceding the vector field, in mapping order.
    leading_fields = [
        ("document_id", "keyword"),
        ("filename", "keyword"),
        ("mimetype", "keyword"),
        ("page", "integer"),
        ("text", "text"),
    ]
    # Permission, provenance, and timestamp fields following the vector.
    trailing_fields = [
        ("source_url", "keyword"),
        ("connector_type", "keyword"),
        ("owner", "keyword"),
        ("allowed_users", "keyword"),
        ("allowed_groups", "keyword"),
        ("user_permissions", "object"),
        ("group_permissions", "object"),
        ("created_time", "date"),
        ("modified_time", "date"),
        ("indexed_time", "date"),
        ("metadata", "object"),
    ]

    properties = {name: {"type": kind} for name, kind in leading_fields}
    properties["chunk_embedding"] = {
        "type": "knn_vector",
        "dimension": dimensions,
        "method": {
            "name": "disk_ann",
            "engine": "jvector",
            "space_type": "l2",
            "parameters": {"ef_construction": 100, "m": 16},
        },
    }
    properties.update({name: {"type": kind} for name, kind in trailing_fields})

    return {
        "settings": {
            "index": {"knn": True},
            "number_of_shards": 1,
            "number_of_replicas": 1,
        },
        "mappings": {"properties": properties},
    }