This PR implements the complete three-tier hierarchical retrieval architecture as specified in issue #11610, enabling production-grade RAG capabilities.

## Tier 1: Knowledge Base Routing
- Auto-route queries to relevant knowledge bases
- Per-KB retrieval parameters (KBRetrievalParams dataclass)
- Rule-based routing with keyword overlap scoring
- LLM-based routing with fallback to rule-based
- Configurable routing methods: auto, rule_based, llm_based, all

## Tier 2: Document Filtering
- Document-level metadata filtering within selected KBs
- Configurable metadata fields for filtering
- LLM-generated filter conditions
- Metadata similarity matching (fuzzy matching)
- Enhanced metadata generation for documents

## Tier 3: Chunk Refinement
- Parent-child chunking with summary mapping
- Custom prompts for keyword extraction
- LLM-based question generation for chunks
- Integration with existing retrieval pipeline

## Metadata Management (Batch CRUD)
- MetadataService with batch operations:
  - batch_get_metadata
  - batch_update_metadata
  - batch_delete_metadata_fields
  - batch_set_metadata_field
  - get_metadata_schema
  - search_by_metadata
  - get_metadata_statistics
  - copy_metadata
- REST API endpoints in metadata_app.py

## Integration
- HierarchicalConfig dataclass for configuration
- Integrated into Dealer class (search.py)
- Wired into agent retrieval tool
- Non-breaking: disabled by default

## Tests
- 48 unit tests covering all components
- Tests for config, routing, filtering, and metadata operations
398 lines
13 KiB
Python
398 lines
13 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
"""
|
|
Metadata Management Service for Hierarchical Retrieval.
|
|
|
|
Provides batch CRUD operations for document metadata to support:
|
|
- Efficient metadata filtering in Tier 2 of hierarchical retrieval
|
|
- Bulk metadata updates across multiple documents
|
|
- Metadata schema management per knowledge base
|
|
"""
|
|
|
|
import logging
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
|
|
from peewee import fn
|
|
|
|
from api.db.db_models import DB, Document
|
|
from api.db.services.document_service import DocumentService
|
|
|
|
|
|
class MetadataService:
    """
    Service for managing document metadata in batch operations.

    Supports the hierarchical retrieval architecture by providing
    efficient metadata management for document filtering.

    Every public method is a classmethod wrapped in
    ``DB.connection_context()``. Metadata is read from
    ``Document.meta_fields`` and written through
    ``DocumentService.update_meta_fields``.
    """

    # Operators accepted inside an operator-style condition of
    # ``search_by_metadata`` (``{"field": {operator: value}}``).
    _SEARCH_OPERATORS = ("equals", "contains", "starts_with", "in", "gt", "lt")

    # Limits applied to the per-field samples built by ``get_metadata_schema``.
    _MAX_SAMPLE_VALUES = 10
    _MAX_SAMPLE_LENGTH = 100

    @classmethod
    @DB.connection_context()
    def batch_get_metadata(
        cls,
        doc_ids: List[str],
        fields: Optional[List[str]] = None
    ) -> Dict[str, Dict[str, Any]]:
        """
        Get metadata for multiple documents.

        Args:
            doc_ids: List of document IDs
            fields: Optional list of specific metadata fields to retrieve;
                when empty or None, all fields are returned

        Returns:
            Dict mapping doc_id to ``{"doc_id", "doc_name", "metadata"}``.
            IDs that match no document are absent from the result.
        """
        if not doc_ids:
            return {}

        # A set gives O(1) membership tests while filtering each document.
        wanted = set(fields) if fields else None

        docs = Document.select(
            Document.id,
            Document.meta_fields,
            Document.name
        ).where(Document.id.in_(doc_ids))

        result = {}
        for doc in docs:
            meta = doc.meta_fields or {}
            if wanted is not None:
                # Filter to requested fields only
                meta = {k: v for k, v in meta.items() if k in wanted}
            result[doc.id] = {
                "doc_id": doc.id,
                "doc_name": doc.name,
                "metadata": meta
            }

        return result

    @classmethod
    @DB.connection_context()
    def batch_update_metadata(
        cls,
        updates: List[Dict[str, Any]],
        merge: bool = True
    ) -> Tuple[int, List[str]]:
        """
        Update metadata for multiple documents in batch.

        Args:
            updates: List of dicts with 'doc_id' and 'metadata' keys. Entries
                without a 'doc_id' are skipped (and logged); a missing or
                None 'metadata' is treated as an empty dict.
            merge: If True, merge with existing metadata; if False, replace

        Returns:
            Tuple of (success_count, list of failed doc_ids). In merge mode a
            doc_id that matches no document is reported as failed.
        """
        if not updates:
            return 0, []

        success_count = 0
        failed_ids = []

        for update in updates:
            doc_id = update.get("doc_id")
            if not doc_id:
                # There is no ID to report in failed_ids, so at least leave a trace.
                logging.warning("Skipping metadata update without a doc_id")
                continue

            try:
                new_metadata = update.get("metadata") or {}

                if merge:
                    doc = Document.get_or_none(Document.id == doc_id)
                    if doc is None:
                        # Nothing to merge into: do not count a write against a
                        # non-existent document as a success.
                        logging.error(
                            "Failed to update metadata for doc %s: document not found",
                            doc_id,
                        )
                        failed_ids.append(doc_id)
                        continue
                    # Build a new dict so the loaded model's value is not mutated.
                    new_metadata = {**(doc.meta_fields or {}), **new_metadata}

                DocumentService.update_meta_fields(doc_id, new_metadata)
                success_count += 1

            except Exception as e:
                # Best-effort batch: one bad document must not abort the rest.
                logging.exception("Failed to update metadata for doc %s: %s", doc_id, e)
                failed_ids.append(doc_id)

        logging.info(
            "Batch metadata update: %d succeeded, %d failed",
            success_count,
            len(failed_ids),
        )
        return success_count, failed_ids

    @classmethod
    @DB.connection_context()
    def batch_delete_metadata_fields(
        cls,
        doc_ids: List[str],
        fields: List[str]
    ) -> Tuple[int, List[str]]:
        """
        Delete specific metadata fields from multiple documents.

        Args:
            doc_ids: List of document IDs
            fields: List of metadata field names to delete

        Returns:
            Tuple of (success_count, list of failed doc_ids). Only documents
            that actually contained at least one of the fields are written
            and counted; documents that were not found or had none of the
            fields are neither counted nor reported as failed.
        """
        if not doc_ids or not fields:
            return 0, []

        success_count = 0
        failed_ids = []
        doomed = set(fields)

        docs = Document.select(
            Document.id,
            Document.meta_fields
        ).where(Document.id.in_(doc_ids))

        for doc in docs:
            try:
                meta = doc.meta_fields or {}
                remaining = {k: v for k, v in meta.items() if k not in doomed}

                # Skip the write when nothing was removed.
                if len(remaining) != len(meta):
                    DocumentService.update_meta_fields(doc.id, remaining)
                    success_count += 1

            except Exception as e:
                # Best-effort batch: keep going and report the failing document.
                logging.exception(
                    "Failed to delete metadata fields for doc %s: %s", doc.id, e
                )
                failed_ids.append(doc.id)

        return success_count, failed_ids

    @classmethod
    @DB.connection_context()
    def batch_set_metadata_field(
        cls,
        doc_ids: List[str],
        field_name: str,
        field_value: Any
    ) -> Tuple[int, List[str]]:
        """
        Set a specific metadata field to the same value for multiple documents.

        Useful for bulk categorization or tagging. Implemented as a merging
        ``batch_update_metadata`` call, so other fields are preserved.

        Args:
            doc_ids: List of document IDs
            field_name: Name of the metadata field
            field_value: Value to set

        Returns:
            Tuple of (success_count, list of failed doc_ids)
        """
        if not doc_ids or not field_name:
            return 0, []

        updates = [
            {"doc_id": doc_id, "metadata": {field_name: field_value}}
            for doc_id in doc_ids
        ]

        return cls.batch_update_metadata(updates, merge=True)

    @classmethod
    @DB.connection_context()
    def get_metadata_schema(cls, kb_id: str) -> Dict[str, Dict[str, Any]]:
        """
        Get the metadata schema for a knowledge base.

        Analyzes all documents in the KB to determine available
        metadata fields and their types/values.

        Args:
            kb_id: Knowledge base ID

        Returns:
            Dict mapping field names to field info:
            - "type": Python type name of the first value seen for the field
            - "sample_values": sorted list of up to 10 distinct values, each
              stringified and truncated to 100 characters
            - "count": number of documents that carry the field
        """
        schema = {}

        docs = Document.select(
            Document.meta_fields
        ).where(Document.kb_id == kb_id)

        for doc in docs:
            meta = doc.meta_fields or {}
            for field_name, field_value in meta.items():
                info = schema.get(field_name)
                if info is None:
                    info = schema[field_name] = {
                        "type": type(field_value).__name__,
                        "sample_values": set(),
                        "count": 0
                    }

                info["count"] += 1

                # Collect sample values (bounded); the set removes duplicates.
                if len(info["sample_values"]) < cls._MAX_SAMPLE_VALUES:
                    info["sample_values"].add(
                        str(field_value)[:cls._MAX_SAMPLE_LENGTH]
                    )

        # Convert sets to lists for JSON serialization; sorting makes the
        # output deterministic instead of depending on set iteration order.
        for info in schema.values():
            info["sample_values"] = sorted(info["sample_values"])

        return schema

    @staticmethod
    def _to_float(value: Any) -> Optional[float]:
        """Return ``value`` as a float, or None when it is missing or not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @classmethod
    def _normalize_filters(cls, filters: Optional[Dict[str, Any]]) -> List[Tuple[str, str, Any]]:
        """Flatten ``filters`` into (field_name, operator, value) checks that are ANDed."""
        checks = []
        for field_name, condition in (filters or {}).items():
            if not isinstance(condition, dict):
                # A bare value means simple (stringified) equality.
                checks.append((field_name, "equals", condition))
                continue
            for op, val in condition.items():
                if op in cls._SEARCH_OPERATORS:
                    checks.append((field_name, op, val))
                else:
                    # Unknown operators have always been ignored; keep that
                    # behavior but make it visible, once per search call.
                    logging.warning(
                        "Ignoring unsupported metadata filter operator %r on field %r",
                        op,
                        field_name,
                    )
        return checks

    @classmethod
    def _matches_condition(cls, op: str, doc_value: Any, val: Any) -> bool:
        """Evaluate one supported operator against one document's field value."""
        if op == "in":
            try:
                return doc_value in val
            except TypeError:
                # ``val`` is not a container (or cannot hold doc_value).
                return False

        if op in ("gt", "lt"):
            left = cls._to_float(doc_value)
            right = cls._to_float(val)
            # Missing or non-numeric operands never match; 0 is a valid value.
            if left is None or right is None:
                return False
            return left > right if op == "gt" else left < right

        if doc_value is None:
            # A missing field must not match via its "None" string form; it
            # only satisfies an explicit equality test against None.
            return op == "equals" and val is None

        if op == "equals":
            return str(doc_value) == str(val)
        if op == "contains":
            return str(val).lower() in str(doc_value).lower()
        if op == "starts_with":
            return str(doc_value).lower().startswith(str(val).lower())

        # Unreachable for checks produced by _normalize_filters.
        return True

    @classmethod
    @DB.connection_context()
    def search_by_metadata(
        cls,
        kb_id: str,
        filters: Dict[str, Any],
        limit: int = 100
    ) -> List[Dict[str, Any]]:
        """
        Search documents by metadata filters.

        All conditions must hold (logical AND), including multiple operators
        given for the same field. Supported operators: equals, contains,
        starts_with (both case-insensitive), in, gt, lt. Comparisons for
        equals/contains/starts_with are done on the string form of the values;
        gt/lt compare as floats and do not match when either side is not
        numeric. Unsupported operators are ignored with a warning.

        Filtering is performed in Python over the documents of the KB.

        Args:
            kb_id: Knowledge base ID
            filters: Dict of field_name -> value or {operator: value}
            limit: Maximum number of results; values <= 0 yield an empty list

        Returns:
            List of matching documents with their metadata
        """
        if limit <= 0:
            return []

        # Validate and flatten the filters once rather than per document.
        checks = cls._normalize_filters(filters)

        docs = Document.select(
            Document.id,
            Document.name,
            Document.meta_fields
        ).where(Document.kb_id == kb_id)

        results = []
        for doc in docs:
            meta = doc.meta_fields or {}

            if all(
                cls._matches_condition(op, meta.get(field_name), val)
                for field_name, op, val in checks
            ):
                results.append({
                    "doc_id": doc.id,
                    "doc_name": doc.name,
                    "metadata": meta
                })
                if len(results) >= limit:
                    break

        return results

    @classmethod
    @DB.connection_context()
    def get_metadata_statistics(cls, kb_id: str) -> Dict[str, Any]:
        """
        Get statistics about metadata usage in a knowledge base.

        Args:
            kb_id: Knowledge base ID

        Returns:
            Dict with keys "total_documents", "documents_with_metadata",
            "metadata_coverage" (ratio in [0, 1], 0 for an empty KB),
            "field_usage" (field name -> number of documents using it) and
            "unique_fields".
        """
        # Guard against a None scalar so the comparison below cannot fail.
        total_docs = Document.select(fn.COUNT(Document.id)).where(
            Document.kb_id == kb_id
        ).scalar() or 0

        docs_with_metadata = 0
        field_usage = {}

        docs = Document.select(Document.meta_fields).where(Document.kb_id == kb_id)

        for doc in docs:
            meta = doc.meta_fields or {}
            if not meta:
                continue
            docs_with_metadata += 1
            for field_name in meta:
                field_usage[field_name] = field_usage.get(field_name, 0) + 1

        return {
            "total_documents": total_docs,
            "documents_with_metadata": docs_with_metadata,
            "metadata_coverage": docs_with_metadata / total_docs if total_docs > 0 else 0,
            "field_usage": field_usage,
            "unique_fields": len(field_usage)
        }

    @classmethod
    @DB.connection_context()
    def copy_metadata(
        cls,
        source_doc_id: str,
        target_doc_ids: List[str],
        fields: Optional[List[str]] = None
    ) -> Tuple[int, List[str]]:
        """
        Copy metadata from one document to multiple target documents.

        The copied fields are merged into each target's existing metadata.

        Args:
            source_doc_id: Source document ID
            target_doc_ids: List of target document IDs
            fields: Optional list of specific fields to copy (all if None)

        Returns:
            Tuple of (success_count, list of failed doc_ids). If the source
            document does not exist, every target is reported as failed. If
            the source has nothing to copy, (0, []) is returned.
        """
        source_doc = Document.get_or_none(Document.id == source_doc_id)
        if source_doc is None:
            logging.warning(
                "Cannot copy metadata: source doc %s not found", source_doc_id
            )
            # Return a copy so the caller's list is never aliased by the result.
            return 0, list(target_doc_ids or [])

        source_meta = source_doc.meta_fields or {}

        if fields:
            wanted = set(fields)
            source_meta = {k: v for k, v in source_meta.items() if k in wanted}

        if not source_meta:
            return 0, []

        updates = [
            {"doc_id": doc_id, "metadata": source_meta.copy()}
            for doc_id in (target_doc_ids or [])
        ]

        return cls.batch_update_metadata(updates, merge=True)
|