filter query optimization

This commit is contained in:
phact 2025-10-10 10:28:11 -04:00
parent b799d91c25
commit c8c719c7eb

View file

@ -64,7 +64,36 @@ class SearchService:
user_id, jwt_token
)
# Build filter clauses first so we can use them in model detection
filter_clauses = []
if filters:
# Map frontend filter names to backend field names
field_mapping = {
"data_sources": "filename",
"document_types": "mimetype",
"owners": "owner_name.keyword",
"connector_types": "connector_type",
}
for filter_key, values in filters.items():
if values is not None and isinstance(values, list):
# Map frontend key to backend field name
field_name = field_mapping.get(filter_key, filter_key)
if len(values) == 0:
# Empty array means "match nothing" - use impossible filter
filter_clauses.append(
{"term": {field_name: "__IMPOSSIBLE_VALUE__"}}
)
elif len(values) == 1:
# Single value filter
filter_clauses.append({"term": {field_name: values[0]}})
else:
# Multiple values filter
filter_clauses.append({"terms": {field_name: values}})
try:
# Build aggregation query with filters applied
agg_query = {
"size": 0,
"aggs": {
@ -76,6 +105,15 @@ class SearchService:
}
}
}
# Apply filters to model detection if any exist
if filter_clauses:
agg_query["query"] = {
"bool": {
"filter": filter_clauses
}
}
agg_result = await opensearch_client.search(index=INDEX_NAME, body=agg_query)
buckets = agg_result.get("aggregations", {}).get("embedding_models", {}).get("buckets", [])
available_models = [b["key"] for b in buckets if b["key"]]
@ -87,7 +125,8 @@ class SearchService:
logger.info(
"Detected embedding models in corpus",
available_models=available_models,
model_counts={b["key"]: b["doc_count"] for b in buckets}
model_counts={b["key"]: b["doc_count"] for b in buckets},
with_filters=len(filter_clauses) > 0
)
except Exception as e:
logger.warning("Failed to detect embedding models, using configured model", error=str(e))
@ -123,34 +162,34 @@ class SearchService:
models=list(query_embeddings.keys()),
query_preview=query[:50]
)
else:
# Wildcard query - no embedding needed
filter_clauses = []
if filters:
# Map frontend filter names to backend field names
field_mapping = {
"data_sources": "filename",
"document_types": "mimetype",
"owners": "owner_name.keyword",
"connector_types": "connector_type",
}
# Build filter clauses
filter_clauses = []
if filters:
# Map frontend filter names to backend field names
field_mapping = {
"data_sources": "filename",
"document_types": "mimetype",
"owners": "owner_name.keyword",
"connector_types": "connector_type",
}
for filter_key, values in filters.items():
if values is not None and isinstance(values, list):
# Map frontend key to backend field name
field_name = field_mapping.get(filter_key, filter_key)
for filter_key, values in filters.items():
if values is not None and isinstance(values, list):
# Map frontend key to backend field name
field_name = field_mapping.get(filter_key, filter_key)
if len(values) == 0:
# Empty array means "match nothing" - use impossible filter
filter_clauses.append(
{"term": {field_name: "__IMPOSSIBLE_VALUE__"}}
)
elif len(values) == 1:
# Single value filter
filter_clauses.append({"term": {field_name: values[0]}})
else:
# Multiple values filter
filter_clauses.append({"terms": {field_name: values}})
if len(values) == 0:
# Empty array means "match nothing" - use impossible filter
filter_clauses.append(
{"term": {field_name: "__IMPOSSIBLE_VALUE__"}}
)
elif len(values) == 1:
# Single value filter
filter_clauses.append({"term": {field_name: values[0]}})
else:
# Multiple values filter
filter_clauses.append({"terms": {field_name: values}})
# Build query body
if is_wildcard_match_all: