WIP, non-langflow mode only

This commit is contained in:
phact 2025-10-10 09:43:37 -04:00
parent c6f2cf5945
commit 102c17d7ec
6 changed files with 424 additions and 17 deletions


@@ -113,6 +113,8 @@ INDEX_BODY = {
"mimetype": {"type": "keyword"},
"page": {"type": "integer"},
"text": {"type": "text"},
# Legacy field - kept for backward compatibility
# New documents will use chunk_embedding_{model_name} fields
"chunk_embedding": {
"type": "knn_vector",
"dimension": VECTOR_DIM,
@@ -123,6 +125,8 @@ INDEX_BODY = {
"parameters": {"ef_construction": 100, "m": 16},
},
},
# Track which embedding model was used for this chunk
"embedding_model": {"type": "keyword"},
"source_url": {"type": "keyword"},
"connector_type": {"type": "keyword"},
"owner": {"type": "keyword"},


@@ -156,15 +156,23 @@ class TaskProcessor:
owner_email: str = None,
file_size: int = None,
connector_type: str = "local",
embedding_model: str = None,
):
"""
Standard processing pipeline for non-Langflow processors:
docling conversion + embeddings + OpenSearch indexing.
Args:
embedding_model: Embedding model to use (defaults to EMBED_MODEL from settings)
"""
import datetime
from config.settings import INDEX_NAME, EMBED_MODEL, clients
from services.document_service import chunk_texts_for_embeddings
from utils.document_processing import extract_relevant
from utils.embedding_fields import get_embedding_field_name, ensure_embedding_field_exists
# Use provided embedding model or fall back to default
embedding_model = embedding_model or EMBED_MODEL
# Get user's OpenSearch client with JWT for OIDC auth
opensearch_client = self.document_service.session_manager.get_user_opensearch_client(
@@ -175,6 +183,18 @@ class TaskProcessor:
if await self.check_document_exists(file_hash, opensearch_client):
return {"status": "unchanged", "id": file_hash}
# Ensure the embedding field exists for this model
embedding_field_name = await ensure_embedding_field_exists(
opensearch_client, embedding_model, INDEX_NAME
)
logger.info(
"Processing document with embedding model",
embedding_model=embedding_model,
embedding_field=embedding_field_name,
file_hash=file_hash,
)
# Convert and extract
result = clients.converter.convert(file_path)
full_doc = result.document.export_to_dict()
@@ -188,7 +208,7 @@ class TaskProcessor:
for batch in text_batches:
resp = await clients.patched_async_client.embeddings.create(
-                model=EMBED_MODEL, input=batch
+                model=embedding_model, input=batch
)
embeddings.extend([d.embedding for d in resp.data])
@@ -202,7 +222,10 @@ class TaskProcessor:
"mimetype": slim_doc["mimetype"],
"page": chunk["page"],
"text": chunk["text"],
"chunk_embedding": vect,
# Store embedding in model-specific field
embedding_field_name: vect,
# Track which model was used
"embedding_model": embedding_model,
"file_size": file_size,
"connector_type": connector_type,
"indexed_time": datetime.datetime.now().isoformat(),


@@ -12,16 +12,33 @@ class SearchService:
self.session_manager = session_manager
@tool
-    async def search_tool(self, query: str) -> Dict[str, Any]:
+    async def search_tool(self, query: str, embedding_model: str = None) -> Dict[str, Any]:
"""
Use this tool to search for documents relevant to the query.
Args:
query (str): query string to search the corpus
embedding_model (str): Optional override for embedding model.
If not provided, uses EMBED_MODEL from config.
Returns:
dict (str, Any): {"results": [chunks]} on success
"""
from utils.embedding_fields import get_embedding_field_name
# Strategy: Use provided model, or default to EMBED_MODEL
# This assumes documents are embedded with EMBED_MODEL by default
# Future enhancement: Could auto-detect available models in corpus
embedding_model = embedding_model or EMBED_MODEL
embedding_field_name = get_embedding_field_name(embedding_model)
logger.info(
"Search with embedding model",
embedding_model=embedding_model,
embedding_field=embedding_field_name,
query_preview=query[:50] if query else None,
)
# Get authentication context from the current async context
user_id, jwt_token = get_auth_context()
# Get search filters, limit, and score threshold from context
@@ -37,12 +54,75 @@ class SearchService:
# Detect wildcard request ("*") to return global facets/stats without semantic search
is_wildcard_match_all = isinstance(query, str) and query.strip() == "*"
-        # Only embed when not doing match_all
+        # Get available embedding models from corpus
+        query_embeddings = {}
+        available_models = []
         if not is_wildcard_match_all:
-            resp = await clients.patched_async_client.embeddings.create(
-                model=EMBED_MODEL, input=[query]
-            )
+            # First, detect which embedding models exist in the corpus
+            opensearch_client = self.session_manager.get_user_opensearch_client(
+                user_id, jwt_token
+            )
try:
agg_query = {
"size": 0,
"aggs": {
"embedding_models": {
"terms": {
"field": "embedding_model",
"size": 10
}
}
}
}
agg_result = await opensearch_client.search(index=INDEX_NAME, body=agg_query)
buckets = agg_result.get("aggregations", {}).get("embedding_models", {}).get("buckets", [])
available_models = [b["key"] for b in buckets if b["key"]]
if not available_models:
# Fallback to configured model if no documents indexed yet
available_models = [embedding_model]
logger.info(
"Detected embedding models in corpus",
available_models=available_models,
model_counts={b["key"]: b["doc_count"] for b in buckets}
)
except Exception as e:
logger.warning("Failed to detect embedding models, using configured model", error=str(e))
available_models = [embedding_model]
# Parallelize embedding generation for all models
import asyncio
async def embed_with_model(model_name):
try:
resp = await clients.patched_async_client.embeddings.create(
model=model_name, input=[query]
)
return model_name, resp.data[0].embedding
except Exception as e:
logger.error(f"Failed to embed with model {model_name}", error=str(e))
return model_name, None
# Run all embeddings in parallel
embedding_results = await asyncio.gather(
*[embed_with_model(model) for model in available_models],
return_exceptions=True
)
# Collect successful embeddings
for result in embedding_results:
if isinstance(result, tuple) and result[1] is not None:
model_name, embedding = result
query_embeddings[model_name] = embedding
logger.info(
"Generated query embeddings",
models=list(query_embeddings.keys()),
query_preview=query[:50]
)
-            query_embedding = resp.data[0].embedding
# Build filter clauses
filter_clauses = []
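The model-detection step above parses a standard terms aggregation, so agg_result is expected to look roughly like this sketch (model names and counts illustrative):

# Sketch: expected shape of agg_result for the parsing above
agg_result = {
    "aggregations": {
        "embedding_models": {
            "buckets": [
                {"key": "text-embedding-3-small", "doc_count": 1200},
                {"key": "nomic-embed-text", "doc_count": 300},
            ]
        }
    }
}
# -> available_models == ["text-embedding-3-small", "nomic-embed-text"]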
@@ -80,17 +160,51 @@ class SearchService:
else:
query_block = {"match_all": {}}
else:
# Build multi-model KNN queries
knn_queries = []
embedding_fields_to_check = []
for model_name, embedding_vector in query_embeddings.items():
field_name = get_embedding_field_name(model_name)
embedding_fields_to_check.append(field_name)
knn_queries.append({
"knn": {
field_name: {
"vector": embedding_vector,
"k": 50,
"num_candidates": 1000,
}
}
})
# Build exists filter - doc must have at least one embedding field
exists_any_embedding = {
"bool": {
"should": [{"exists": {"field": f}} for f in embedding_fields_to_check],
"minimum_should_match": 1
}
}
# Add exists filter to existing filters
all_filters = [*filter_clauses, exists_any_embedding]
logger.debug(
"Building hybrid query with filters",
user_filters_count=len(filter_clauses),
total_filters_count=len(all_filters),
filter_types=[type(f).__name__ for f in all_filters]
)
# Hybrid search query structure (semantic + keyword)
# Use dis_max to pick best score across multiple embedding fields
query_block = {
"bool": {
"should": [
{
"knn": {
"chunk_embedding": {
"vector": query_embedding,
"k": 10,
"boost": 0.7,
}
"dis_max": {
"tie_breaker": 0.0, # Take only the best match, no blending
"boost": 0.7, # 70% weight for semantic search
"queries": knn_queries
}
},
{
@@ -99,12 +213,12 @@ class SearchService:
"fields": ["text^2", "filename^1.5"],
"type": "best_fields",
"fuzziness": "AUTO",
"boost": 0.3,
"boost": 0.3, # 30% weight for keyword search
}
},
],
"minimum_should_match": 1,
**({"filter": filter_clauses} if filter_clauses else {}),
"filter": all_filters,
}
}
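As a sanity check on the dis_max settings: with tie_breaker at 0.0 the clause scores as the single best sub-query, so a chunk that matches under several embedding fields is not over-counted. A minimal sketch of the scoring rule:

# dis_max: score = best sub-score + tie_breaker * sum of the remaining sub-scores
def dis_max_score(sub_scores: list[float], tie_breaker: float = 0.0) -> float:
    best = max(sub_scores)
    return best + tie_breaker * (sum(sub_scores) - best)

assert dis_max_score([0.8, 0.5]) == 0.8  # tie_breaker 0.0 keeps only the best match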
@@ -115,6 +229,7 @@ class SearchService:
"document_types": {"terms": {"field": "mimetype", "size": 10}},
"owners": {"terms": {"field": "owner_name.keyword", "size": 10}},
"connector_types": {"terms": {"field": "connector_type", "size": 10}},
"embedding_models": {"terms": {"field": "embedding_model", "size": 10}},
},
"_source": [
"filename",
@@ -127,6 +242,7 @@ class SearchService:
"owner_email",
"file_size",
"connector_type",
"embedding_model", # Include embedding model in results
"allowed_users",
"allowed_groups",
],
@@ -177,6 +293,7 @@ class SearchService:
"owner_email": hit["_source"].get("owner_email"),
"file_size": hit["_source"].get("file_size"),
"connector_type": hit["_source"].get("connector_type"),
"embedding_model": hit["_source"].get("embedding_model"), # Include in results
}
)
@@ -199,8 +316,13 @@ class SearchService:
filters: Dict[str, Any] = None,
limit: int = 10,
score_threshold: float = 0,
embedding_model: str = None,
) -> Dict[str, Any]:
"""Public search method for API endpoints"""
"""Public search method for API endpoints
Args:
embedding_model: Embedding model to use for search (defaults to EMBED_MODEL)
"""
# Set auth context if provided (for direct API calls)
from config.settings import is_no_auth_mode
@@ -220,4 +342,4 @@ class SearchService:
set_search_limit(limit)
set_score_threshold(score_threshold)
-        return await self.search_tool(query)
+        return await self.search_tool(query, embedding_model=embedding_model)
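A call site would then pass the override through this public entry point. A usage sketch; the method name search is assumed here, since only the parameters appear in this hunk:

# Hypothetical call; "search" is an assumed name for the public method above
results = await search_service.search(
    query="quarterly revenue",
    limit=5,
    embedding_model="text-embedding-3-small",  # optional override
)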


@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""
Migration script to add the embedding_model field to an existing OpenSearch index.
Run this once to add the field as a keyword; an existing field of the wrong type
cannot be changed in place and will require a reindex.
"""
import asyncio
import sys
from opensearchpy import AsyncOpenSearch
from opensearchpy._async.http_aiohttp import AIOHttpConnection
# Add the project src directory to the path so config imports resolve
sys.path.insert(0, '/home/tato/Desktop/openrag/src')
from config.settings import (
OPENSEARCH_HOST,
OPENSEARCH_PORT,
OPENSEARCH_USERNAME,
OPENSEARCH_PASSWORD,
INDEX_NAME,
)
from utils.logging_config import get_logger
logger = get_logger(__name__)
async def add_embedding_model_field():
"""Add embedding_model as keyword field to existing index"""
# Create admin OpenSearch client
client = AsyncOpenSearch(
hosts=[{"host": OPENSEARCH_HOST, "port": OPENSEARCH_PORT}],
connection_class=AIOHttpConnection,
scheme="https",
use_ssl=True,
verify_certs=False,
ssl_assert_fingerprint=None,
http_auth=(OPENSEARCH_USERNAME, OPENSEARCH_PASSWORD),
http_compress=True,
)
try:
# Check if index exists
exists = await client.indices.exists(index=INDEX_NAME)
if not exists:
logger.error(f"Index {INDEX_NAME} does not exist")
return False
# Get current mapping
mapping = await client.indices.get_mapping(index=INDEX_NAME)
current_props = mapping[INDEX_NAME]["mappings"].get("properties", {})
# Check if embedding_model field exists
if "embedding_model" in current_props:
current_type = current_props["embedding_model"].get("type")
logger.info(f"embedding_model field exists with type: {current_type}")
if current_type == "keyword":
logger.info("Field is already correct type (keyword)")
return True
else:
logger.warning(
f"Field exists with wrong type: {current_type}. "
"Cannot change field type on existing field. "
"You need to reindex or use a different field name."
)
return False
# Add the field as keyword
logger.info("Adding embedding_model field as keyword type")
new_mapping = {
"properties": {
"embedding_model": {"type": "keyword"}
}
}
response = await client.indices.put_mapping(
index=INDEX_NAME,
body=new_mapping
)
logger.info(f"Successfully added embedding_model field: {response}")
# Verify the change
updated_mapping = await client.indices.get_mapping(index=INDEX_NAME)
updated_props = updated_mapping[INDEX_NAME]["mappings"]["properties"]
if "embedding_model" in updated_props:
field_type = updated_props["embedding_model"].get("type")
logger.info(f"Verified: embedding_model field type is now: {field_type}")
return field_type == "keyword"
else:
logger.error("Field was not added successfully")
return False
except Exception as e:
logger.error(f"Error adding embedding_model field: {e}")
return False
finally:
await client.close()
if __name__ == "__main__":
success = asyncio.run(add_embedding_model_field())
sys.exit(0 if success else 1)
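When the field already exists as text, the warning path above fires and the only clean fix is a reindex into an index whose mapping is already correct. A sketch of that fallback, with an illustrative destination name:

# Sketch: reindex into a corrected index (destination name illustrative);
# the destination must be created first with embedding_model as keyword
await client.reindex(
    body={
        "source": {"index": INDEX_NAME},
        "dest": {"index": f"{INDEX_NAME}_v2"},
    },
    wait_for_completion=True,
)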


@@ -0,0 +1,150 @@
"""
Utility functions for managing dynamic embedding field names in OpenSearch.
This module provides helpers for:
- Normalizing embedding model names to valid OpenSearch field names
- Generating dynamic field names based on embedding models
- Ensuring embedding fields exist in the OpenSearch index
"""
from utils.logging_config import get_logger
logger = get_logger(__name__)
def normalize_model_name(model_name: str) -> str:
"""
Convert an embedding model name to a valid OpenSearch field suffix.
Examples:
- "text-embedding-3-small" -> "text_embedding_3_small"
- "nomic-embed-text:latest" -> "nomic_embed_text_latest"
- "ibm/slate-125m-english-rtrvr" -> "ibm_slate_125m_english_rtrvr"
Args:
model_name: The embedding model name (e.g., from OpenAI, Ollama, Watsonx)
Returns:
Normalized string safe for use as OpenSearch field name suffix
"""
normalized = model_name.lower()
# Replace common separators with underscores
normalized = normalized.replace("-", "_")
normalized = normalized.replace(":", "_")
normalized = normalized.replace("/", "_")
normalized = normalized.replace(".", "_")
# Remove any other non-alphanumeric characters
normalized = "".join(c if c.isalnum() or c == "_" else "_" for c in normalized)
# Remove duplicate underscores
while "__" in normalized:
normalized = normalized.replace("__", "_")
# Remove leading/trailing underscores
normalized = normalized.strip("_")
return normalized
def get_embedding_field_name(model_name: str) -> str:
"""
Get the OpenSearch field name for storing embeddings from a specific model.
Args:
model_name: The embedding model name
Returns:
Field name in format: chunk_embedding_{normalized_model_name}
Examples:
>>> get_embedding_field_name("text-embedding-3-small")
'chunk_embedding_text_embedding_3_small'
>>> get_embedding_field_name("nomic-embed-text")
'chunk_embedding_nomic_embed_text'
"""
normalized = normalize_model_name(model_name)
return f"chunk_embedding_{normalized}"
async def ensure_embedding_field_exists(
opensearch_client,
model_name: str,
index_name: str = None,
) -> str:
"""
Ensure that an embedding field for the specified model exists in the OpenSearch index.
If the field doesn't exist, it is added dynamically via the put-mapping API.
Args:
opensearch_client: OpenSearch client instance
model_name: The embedding model name
index_name: OpenSearch index name (defaults to INDEX_NAME from settings)
Returns:
The field name that was ensured to exist
Note:
    Errors while adding the mapping are logged but not raised; a real
    conflict will surface at indexing time.
"""
from config.settings import INDEX_NAME
from utils.embeddings import get_embedding_dimensions
if index_name is None:
index_name = INDEX_NAME
field_name = get_embedding_field_name(model_name)
dimensions = get_embedding_dimensions(model_name)
logger.info(
"Ensuring embedding field exists",
field_name=field_name,
model_name=model_name,
dimensions=dimensions,
)
# Define the field mapping
mapping = {
"properties": {
field_name: {
"type": "knn_vector",
"dimension": dimensions,
"method": {
"name": "disk_ann",
"engine": "jvector",
"space_type": "l2",
"parameters": {"ef_construction": 100, "m": 16},
},
}
}
}
try:
# Try to add the mapping
# OpenSearch will ignore if field already exists
await opensearch_client.indices.put_mapping(
index=index_name,
body=mapping
)
logger.info(
"Successfully ensured embedding field exists",
field_name=field_name,
model_name=model_name,
)
except Exception as e:
error_msg = str(e).lower()
# These are expected/safe errors when field already exists
if "already" in error_msg or "exists" in error_msg or "mapper_parsing_exception" in error_msg:
logger.debug(
"Embedding field already exists (expected)",
field_name=field_name,
model_name=model_name,
)
else:
logger.error(
"Failed to ensure embedding field exists",
field_name=field_name,
model_name=model_name,
error=str(e),
)
# Don't raise - field might already exist with different params
# Better to proceed and let indexing fail if there's a real issue
return field_name
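Putting the helpers together, a caller would typically do something like this sketch (assumes an async context, an existing client, and an illustrative 768-dim model; the index name is also illustrative):

# Sketch: ensure the field exists, then index a chunk into it
field = await ensure_embedding_field_exists(opensearch_client, "nomic-embed-text:latest")
# field == "chunk_embedding_nomic_embed_text_latest"
await opensearch_client.index(
    index="documents",  # illustrative index name
    body={
        "text": "example chunk",
        field: [0.0] * 768,  # illustrative vector of the model's dimension
        "embedding_model": "nomic-embed-text:latest",
    },
)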


@@ -40,6 +40,8 @@ def create_dynamic_index_body(embedding_model: str) -> dict:
"mimetype": {"type": "keyword"},
"page": {"type": "integer"},
"text": {"type": "text"},
# Legacy field - kept for backward compatibility
# New documents will use chunk_embedding_{model_name} fields
"chunk_embedding": {
"type": "knn_vector",
"dimension": dimensions,
@@ -50,6 +52,8 @@ def create_dynamic_index_body(embedding_model: str) -> dict:
"parameters": {"ef_construction": 100, "m": 16},
},
},
# Track which embedding model was used for this chunk
"embedding_model": {"type": "keyword"},
"source_url": {"type": "keyword"},
"connector_type": {"type": "keyword"},
"owner": {"type": "keyword"},