ragflow/api/db/services/metadata_service.py
hsparks.codes d104f59e29 feat: Implement hierarchical retrieval architecture (#11610)
This PR implements the complete three-tier hierarchical retrieval architecture
as specified in issue #11610, enabling production-grade RAG capabilities.

## Tier 1: Knowledge Base Routing
- Auto-route queries to relevant knowledge bases
- Per-KB retrieval parameters (KBRetrievalParams dataclass)
- Rule-based routing with keyword overlap scoring
- LLM-based routing with fallback to rule-based
- Configurable routing methods: auto, rule_based, llm_based, all

## Tier 2: Document Filtering
- Document-level metadata filtering within selected KBs
- Configurable metadata fields for filtering
- LLM-generated filter conditions
- Metadata similarity matching (fuzzy matching)
- Enhanced metadata generation for documents

## Tier 3: Chunk Refinement
- Parent-child chunking with summary mapping
- Custom prompts for keyword extraction
- LLM-based question generation for chunks
- Integration with existing retrieval pipeline

## Metadata Management (Batch CRUD)
- MetadataService with batch operations:
  - batch_get_metadata
  - batch_update_metadata
  - batch_delete_metadata_fields
  - batch_set_metadata_field
  - get_metadata_schema
  - search_by_metadata
  - get_metadata_statistics
  - copy_metadata
- REST API endpoints in metadata_app.py

## Integration
- HierarchicalConfig dataclass for configuration
- Integrated into Dealer class (search.py)
- Wired into agent retrieval tool
- Non-breaking: disabled by default

## Tests
- 48 unit tests covering all components
- Tests for config, routing, filtering, and metadata operations
2025-12-09 07:32:00 +01:00

398 lines
13 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Metadata Management Service for Hierarchical Retrieval.
Provides batch CRUD operations for document metadata to support:
- Efficient metadata filtering in Tier 2 of hierarchical retrieval
- Bulk metadata updates across multiple documents
- Metadata schema management per knowledge base
"""
import logging
from typing import List, Dict, Any, Optional, Tuple
from peewee import fn
from api.db.db_models import DB, Document
from api.db.services.document_service import DocumentService
class MetadataService:
    """
    Service for managing document metadata in batch operations.

    Supports the hierarchical retrieval architecture (Tier 2: document
    filtering) by providing efficient metadata CRUD across many documents
    at once. All public methods run inside a DB connection context.
    """

    @classmethod
    @DB.connection_context()
    def batch_get_metadata(
        cls,
        doc_ids: List[str],
        fields: Optional[List[str]] = None
    ) -> Dict[str, Dict[str, Any]]:
        """
        Get metadata for multiple documents in a single query.

        Args:
            doc_ids: List of document IDs.
            fields: Optional list of specific metadata fields to retrieve;
                when given, each document's metadata is filtered to those keys.

        Returns:
            Dict mapping doc_id to {"doc_id", "doc_name", "metadata"}.
            IDs that do not exist are simply absent from the result.
        """
        if not doc_ids:
            return {}
        rows = Document.select(
            Document.id,
            Document.meta_fields,
            Document.name
        ).where(Document.id.in_(doc_ids))
        result: Dict[str, Dict[str, Any]] = {}
        for doc in rows:
            meta = doc.meta_fields or {}
            if fields:
                # Restrict to the explicitly requested fields only.
                meta = {k: v for k, v in meta.items() if k in fields}
            result[doc.id] = {
                "doc_id": doc.id,
                "doc_name": doc.name,
                "metadata": meta,
            }
        return result

    @classmethod
    @DB.connection_context()
    def batch_update_metadata(
        cls,
        updates: List[Dict[str, Any]],
        merge: bool = True
    ) -> Tuple[int, List[str]]:
        """
        Update metadata for multiple documents in batch.

        Args:
            updates: List of dicts with 'doc_id' and 'metadata' keys.
                Entries without a 'doc_id' are skipped silently.
            merge: If True, merge with existing metadata; if False, replace.

        Returns:
            Tuple of (success_count, list of failed doc_ids). Non-existent
            documents are reported as failures.
        """
        if not updates:
            return 0, []
        success_count = 0
        failed_ids: List[str] = []
        for update in updates:
            doc_id = update.get("doc_id")
            new_metadata = update.get("metadata", {})
            if not doc_id:
                continue
            try:
                doc = Document.get_or_none(Document.id == doc_id)
                if doc is None:
                    # Bug fix: previously a missing document was still
                    # "updated" and counted as a success.
                    failed_ids.append(doc_id)
                    continue
                if merge:
                    # Copy before merging so the fetched model's dict is
                    # not mutated as a side effect.
                    merged = dict(doc.meta_fields or {})
                    merged.update(new_metadata)
                    new_metadata = merged
                DocumentService.update_meta_fields(doc_id, new_metadata)
                success_count += 1
            except Exception as e:
                logging.error(f"Failed to update metadata for doc {doc_id}: {e}")
                failed_ids.append(doc_id)
        logging.info(f"Batch metadata update: {success_count} succeeded, {len(failed_ids)} failed")
        return success_count, failed_ids

    @classmethod
    @DB.connection_context()
    def batch_delete_metadata_fields(
        cls,
        doc_ids: List[str],
        fields: List[str]
    ) -> Tuple[int, List[str]]:
        """
        Delete specific metadata fields from multiple documents.

        Documents that do not contain any of the given fields are left
        untouched and still counted as successes.

        Args:
            doc_ids: List of document IDs.
            fields: List of metadata field names to delete.

        Returns:
            Tuple of (success_count, list of failed doc_ids).
        """
        if not doc_ids or not fields:
            return 0, []
        success_count = 0
        failed_ids: List[str] = []
        rows = Document.select(
            Document.id,
            Document.meta_fields
        ).where(Document.id.in_(doc_ids))
        for doc in rows:
            try:
                # Work on a copy so the model instance is not mutated
                # before the write succeeds.
                meta = dict(doc.meta_fields or {})
                modified = False
                for field in fields:
                    if field in meta:
                        del meta[field]
                        modified = True
                if modified:
                    DocumentService.update_meta_fields(doc.id, meta)
                success_count += 1
            except Exception as e:
                logging.error(f"Failed to delete metadata fields for doc {doc.id}: {e}")
                failed_ids.append(doc.id)
        return success_count, failed_ids

    @classmethod
    @DB.connection_context()
    def batch_set_metadata_field(
        cls,
        doc_ids: List[str],
        field_name: str,
        field_value: Any
    ) -> Tuple[int, List[str]]:
        """
        Set a specific metadata field to the same value for multiple documents.

        Useful for bulk categorization or tagging. Delegates to
        batch_update_metadata with merge semantics, so other fields
        are preserved.

        Args:
            doc_ids: List of document IDs.
            field_name: Name of the metadata field.
            field_value: Value to set.

        Returns:
            Tuple of (success_count, list of failed doc_ids).
        """
        if not doc_ids or not field_name:
            return 0, []
        updates = [
            {"doc_id": doc_id, "metadata": {field_name: field_value}}
            for doc_id in doc_ids
        ]
        return cls.batch_update_metadata(updates, merge=True)

    @classmethod
    @DB.connection_context()
    def get_metadata_schema(cls, kb_id: str) -> Dict[str, Dict[str, Any]]:
        """
        Get the metadata schema for a knowledge base.

        Analyzes all documents in the KB to determine available metadata
        fields, the Python type name of the first value seen per field,
        up to 10 sample values (stringified, truncated to 100 chars),
        and how many documents carry each field.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Dict mapping field names to {"type", "sample_values", "count"}.
        """
        schema: Dict[str, Dict[str, Any]] = {}
        rows = Document.select(
            Document.meta_fields
        ).where(Document.kb_id == kb_id)
        for doc in rows:
            meta = doc.meta_fields or {}
            for field_name, field_value in meta.items():
                entry = schema.setdefault(field_name, {
                    "type": type(field_value).__name__,
                    "sample_values": set(),
                    "count": 0,
                })
                entry["count"] += 1
                # Collect sample values (limit to 10 distinct per field).
                if len(entry["sample_values"]) < 10:
                    try:
                        entry["sample_values"].add(str(field_value)[:100])
                    except Exception:
                        # Unstringifiable values are skipped silently.
                        pass
        # Convert sets to lists for JSON serialization.
        for entry in schema.values():
            entry["sample_values"] = list(entry["sample_values"])
        return schema

    @staticmethod
    def _matches_condition(doc_value: Any, condition: Any) -> bool:
        """
        Evaluate one metadata filter condition against a document value.

        `condition` is either a plain value (string equality) or a
        single-operator dict, e.g. {"contains": "foo"}. Unknown or empty
        operator dicts match nothing, and numeric comparisons on
        non-numeric values return False instead of raising.
        """
        if not isinstance(condition, dict):
            # Simple equality on string form.
            return str(doc_value) == str(condition)
        if not condition:
            # Bug fix: an empty condition dict used to raise IndexError.
            return False
        op, val = next(iter(condition.items()))
        if op == "equals":
            return str(doc_value) == str(val)
        if op == "contains":
            # Bug fix: None used to stringify to "None" and could
            # falsely match substrings like "none".
            return doc_value is not None and str(val).lower() in str(doc_value).lower()
        if op == "starts_with":
            return doc_value is not None and str(doc_value).lower().startswith(str(val).lower())
        if op == "in":
            return doc_value in val
        if op in ("gt", "lt"):
            if doc_value is None:
                return False
            try:
                lhs, rhs = float(doc_value), float(val)
            except (TypeError, ValueError):
                # Bug fix: non-numeric values used to raise uncaught.
                return False
            return lhs > rhs if op == "gt" else lhs < rhs
        # Bug fix: an unknown operator used to leave the previous match
        # result in place, silently matching everything.
        return False

    @classmethod
    @DB.connection_context()
    def search_by_metadata(
        cls,
        kb_id: str,
        filters: Dict[str, Any],
        limit: int = 100
    ) -> List[Dict[str, Any]]:
        """
        Search documents by metadata filters.

        All filters must match (AND semantics). Matching is done in
        Python over the KB's documents, not pushed to the database.

        Args:
            kb_id: Knowledge base ID.
            filters: Dict of field_name -> value or {operator: value}.
                Supported operators: equals, contains, starts_with, in,
                gt, lt. Unknown operators never match.
            limit: Maximum number of results.

        Returns:
            List of matching documents with their metadata.
        """
        rows = Document.select(
            Document.id,
            Document.name,
            Document.meta_fields
        ).where(Document.kb_id == kb_id)
        results: List[Dict[str, Any]] = []
        for doc in rows:
            meta = doc.meta_fields or {}
            if all(
                cls._matches_condition(meta.get(field_name), condition)
                for field_name, condition in filters.items()
            ):
                results.append({
                    "doc_id": doc.id,
                    "doc_name": doc.name,
                    "metadata": meta,
                })
                if len(results) >= limit:
                    break
        return results

    @classmethod
    @DB.connection_context()
    def get_metadata_statistics(cls, kb_id: str) -> Dict[str, Any]:
        """
        Get statistics about metadata usage in a knowledge base.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Dict with total_documents, documents_with_metadata,
            metadata_coverage (0..1), field_usage (field -> doc count),
            and unique_fields.
        """
        # Guard against a None scalar on empty/odd backends.
        total_docs = Document.select(fn.COUNT(Document.id)).where(
            Document.kb_id == kb_id
        ).scalar() or 0
        docs_with_metadata = 0
        field_usage: Dict[str, int] = {}
        rows = Document.select(Document.meta_fields).where(Document.kb_id == kb_id)
        for doc in rows:
            meta = doc.meta_fields or {}
            if meta:
                docs_with_metadata += 1
                for field_name in meta:
                    field_usage[field_name] = field_usage.get(field_name, 0) + 1
        return {
            "total_documents": total_docs,
            "documents_with_metadata": docs_with_metadata,
            "metadata_coverage": docs_with_metadata / total_docs if total_docs > 0 else 0,
            "field_usage": field_usage,
            "unique_fields": len(field_usage),
        }

    @classmethod
    @DB.connection_context()
    def copy_metadata(
        cls,
        source_doc_id: str,
        target_doc_ids: List[str],
        fields: Optional[List[str]] = None
    ) -> Tuple[int, List[str]]:
        """
        Copy metadata from one document to multiple target documents.

        Args:
            source_doc_id: Source document ID.
            target_doc_ids: List of target document IDs.
            fields: Optional list of specific fields to copy (all if None).

        Returns:
            Tuple of (success_count, list of failed doc_ids). If the
            source document does not exist, every target is reported
            failed; if the source has no (matching) metadata, (0, [])
            is returned without touching any target.
        """
        source_doc = Document.get_or_none(Document.id == source_doc_id)
        if not source_doc:
            return 0, target_doc_ids
        source_meta = source_doc.meta_fields or {}
        if fields:
            source_meta = {k: v for k, v in source_meta.items() if k in fields}
        if not source_meta:
            return 0, []
        # Shallow-copy per target so later per-doc merges cannot alias
        # the same dict instance.
        updates = [
            {"doc_id": doc_id, "metadata": source_meta.copy()}
            for doc_id in target_doc_ids
        ]
        return cls.batch_update_metadata(updates, merge=True)