This PR implements the complete three-tier hierarchical retrieval architecture as specified in issue #11610, enabling production-grade RAG capabilities.

## Tier 1: Knowledge Base Routing
- Auto-route queries to relevant knowledge bases
- Per-KB retrieval parameters (KBRetrievalParams dataclass)
- Rule-based routing with keyword overlap scoring
- LLM-based routing with fallback to rule-based
- Configurable routing methods: auto, rule_based, llm_based, all

## Tier 2: Document Filtering
- Document-level metadata filtering within selected KBs
- Configurable metadata fields for filtering
- LLM-generated filter conditions
- Metadata similarity matching (fuzzy matching)
- Enhanced metadata generation for documents

## Tier 3: Chunk Refinement
- Parent-child chunking with summary mapping
- Custom prompts for keyword extraction
- LLM-based question generation for chunks
- Integration with existing retrieval pipeline

## Metadata Management (Batch CRUD)
- MetadataService with batch operations:
  - batch_get_metadata
  - batch_update_metadata
  - batch_delete_metadata_fields
  - batch_set_metadata_field
  - get_metadata_schema
  - search_by_metadata
  - get_metadata_statistics
  - copy_metadata
- REST API endpoints in metadata_app.py

## Integration
- HierarchicalConfig dataclass for configuration
- Integrated into Dealer class (search.py)
- Wired into agent retrieval tool
- Non-breaking: disabled by default

## Tests
- 48 unit tests covering all components
- Tests for config, routing, filtering, and metadata operations
378 lines
11 KiB
Python
378 lines
11 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
"""
|
|
Metadata Management API for Hierarchical Retrieval.
|
|
|
|
Provides REST endpoints for batch CRUD operations on document metadata,
|
|
supporting the hierarchical retrieval architecture's Tier 2 document filtering.
|
|
"""
|
|
|
|
from quart import request
|
|
from api.apps import current_user, login_required
|
|
from api.common.check_team_permission import check_kb_team_permission
|
|
from api.db.services.metadata_service import MetadataService
|
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
|
from api.utils.api_utils import (
|
|
get_json_result,
|
|
server_error_response,
|
|
)
|
|
from common.constants import RetCode
|
|
|
|
|
|
@manager.route("/batch/get", methods=["POST"])  # noqa: F821
@login_required
async def batch_get_metadata():
    """
    Get metadata for multiple documents.

    Request body:
    {
        "doc_ids": ["doc1", "doc2", ...],
        "fields": ["field1", "field2", ...]  // optional
    }

    Returns:
    {
        "doc1": {"doc_id": "doc1", "doc_name": "...", "metadata": {...}},
        ...
    }

    A missing or non-object JSON body, or a missing/empty/non-list
    ``doc_ids``, is answered with ``RetCode.ARGUMENT_ERROR`` and empty data.
    Any exception raised while handling the request is converted into a
    response by ``server_error_response``.
    """
    try:
        # The parsed body is None when the request carries no JSON, and may be
        # a list/scalar for valid-but-unexpected JSON; treat both as an empty
        # object so the caller gets an argument error instead of a generic
        # server error from ``None.get``.
        req = await request.json
        if not isinstance(req, dict):
            req = {}

        doc_ids = req.get("doc_ids", [])
        fields = req.get("fields")

        if not doc_ids:
            return get_json_result(
                data={},
                message="No document IDs provided",
                code=RetCode.ARGUMENT_ERROR
            )

        # A bare string would otherwise be handed to the service as if it were
        # a collection of IDs.
        if not isinstance(doc_ids, list):
            return get_json_result(
                data={},
                message="doc_ids must be a list",
                code=RetCode.ARGUMENT_ERROR
            )

        # NOTE(review): this handler does no per-document permission check of
        # its own; confirm that MetadataService restricts access to documents
        # the current user may read.
        result = MetadataService.batch_get_metadata(doc_ids, fields)
        return get_json_result(data=result)

    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/batch/update", methods=["POST"])  # noqa: F821
@login_required
async def batch_update_metadata():
    """
    Update metadata for multiple documents.

    Request body:
    {
        "updates": [
            {"doc_id": "doc1", "metadata": {"field1": "value1", ...}},
            {"doc_id": "doc2", "metadata": {"field2": "value2", ...}},
            ...
        ],
        "merge": true  // optional, default true. If false, replaces all metadata
    }

    Returns:
    {
        "success_count": 5,
        "failed_ids": ["doc3"]
    }

    A missing or non-object JSON body, or a missing/empty/non-list
    ``updates``, is answered with ``RetCode.ARGUMENT_ERROR`` and a zeroed
    result. Any exception raised while handling the request is converted into
    a response by ``server_error_response``.
    """
    try:
        # The parsed body is None when the request carries no JSON, and may be
        # a list/scalar for valid-but-unexpected JSON; fall back to an empty
        # object so the validation below reports an argument error.
        req = await request.json
        if not isinstance(req, dict):
            req = {}

        updates = req.get("updates", [])
        merge = req.get("merge", True)

        if not updates:
            return get_json_result(
                data={"success_count": 0, "failed_ids": []},
                message="No updates provided",
                code=RetCode.ARGUMENT_ERROR
            )

        # Reject e.g. a single update object or a string sent in place of the
        # expected array before it reaches the service.
        if not isinstance(updates, list):
            return get_json_result(
                data={"success_count": 0, "failed_ids": []},
                message="updates must be a list",
                code=RetCode.ARGUMENT_ERROR
            )

        # NOTE(review): this handler does no per-document permission check of
        # its own; confirm that MetadataService restricts writes to documents
        # the current user may modify.
        success_count, failed_ids = MetadataService.batch_update_metadata(updates, merge)

        return get_json_result(data={
            "success_count": success_count,
            "failed_ids": failed_ids
        })

    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/batch/delete-fields", methods=["POST"])  # noqa: F821
@login_required
async def batch_delete_metadata_fields():
    """
    Delete specific metadata fields from multiple documents.

    Request body:
    {
        "doc_ids": ["doc1", "doc2", ...],
        "fields": ["field1", "field2", ...]
    }

    Returns:
    {
        "success_count": 5,
        "failed_ids": []
    }

    A missing or non-object JSON body, or ``doc_ids``/``fields`` that are
    missing, empty, or not lists, is answered with ``RetCode.ARGUMENT_ERROR``
    and a zeroed result. Any exception raised while handling the request is
    converted into a response by ``server_error_response``.
    """
    try:
        # The parsed body is None when the request carries no JSON, and may be
        # a list/scalar for valid-but-unexpected JSON; fall back to an empty
        # object so the validation below reports an argument error.
        req = await request.json
        if not isinstance(req, dict):
            req = {}

        doc_ids = req.get("doc_ids", [])
        fields = req.get("fields", [])

        if not doc_ids or not fields:
            return get_json_result(
                data={"success_count": 0, "failed_ids": []},
                message="doc_ids and fields are required",
                code=RetCode.ARGUMENT_ERROR
            )

        # A bare string for either argument would otherwise be handed to the
        # service as if it were a collection.
        if not isinstance(doc_ids, list) or not isinstance(fields, list):
            return get_json_result(
                data={"success_count": 0, "failed_ids": []},
                message="doc_ids and fields must be lists",
                code=RetCode.ARGUMENT_ERROR
            )

        # NOTE(review): this handler does no per-document permission check of
        # its own; confirm that MetadataService restricts writes to documents
        # the current user may modify.
        success_count, failed_ids = MetadataService.batch_delete_metadata_fields(doc_ids, fields)

        return get_json_result(data={
            "success_count": success_count,
            "failed_ids": failed_ids
        })

    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/batch/set-field", methods=["POST"])  # noqa: F821
@login_required
async def batch_set_metadata_field():
    """
    Set a specific field to the same value for multiple documents.

    Useful for bulk categorization or tagging.

    Request body:
    {
        "doc_ids": ["doc1", "doc2", ...],
        "field_name": "category",
        "field_value": "Technical"
    }

    Returns:
    {
        "success_count": 5,
        "failed_ids": []
    }

    A missing or non-object JSON body, a missing/empty/non-list ``doc_ids``,
    or a missing/empty/non-string ``field_name`` is answered with
    ``RetCode.ARGUMENT_ERROR`` and a zeroed result. ``field_value`` is
    forwarded as given (``None`` when absent). Any exception raised while
    handling the request is converted into a response by
    ``server_error_response``.
    """
    try:
        # The parsed body is None when the request carries no JSON, and may be
        # a list/scalar for valid-but-unexpected JSON; fall back to an empty
        # object so the validation below reports an argument error.
        req = await request.json
        if not isinstance(req, dict):
            req = {}

        doc_ids = req.get("doc_ids", [])
        field_name = req.get("field_name")
        field_value = req.get("field_value")

        if not doc_ids or not field_name:
            return get_json_result(
                data={"success_count": 0, "failed_ids": []},
                message="doc_ids and field_name are required",
                code=RetCode.ARGUMENT_ERROR
            )

        # Guard the shapes the service relies on: a collection of IDs and a
        # textual field name.
        if not isinstance(doc_ids, list) or not isinstance(field_name, str):
            return get_json_result(
                data={"success_count": 0, "failed_ids": []},
                message="doc_ids must be a list and field_name must be a string",
                code=RetCode.ARGUMENT_ERROR
            )

        # NOTE(review): this handler does no per-document permission check of
        # its own; confirm that MetadataService restricts writes to documents
        # the current user may modify.
        success_count, failed_ids = MetadataService.batch_set_metadata_field(
            doc_ids, field_name, field_value
        )

        return get_json_result(data={
            "success_count": success_count,
            "failed_ids": failed_ids
        })

    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/schema/<kb_id>", methods=["GET"])  # noqa: F821
@login_required
async def get_metadata_schema(kb_id):
    """
    Return the metadata schema of a knowledge base.

    The response data is whatever ``MetadataService.get_metadata_schema``
    produces for ``kb_id``; the expected shape is one entry per metadata
    field with its type, sample values and count:
    {
        "field1": {"type": "str", "sample_values": ["a", "b"], "count": 10},
        ...
    }

    Responds with ``RetCode.DATA_ERROR`` when the knowledge base lookup is
    falsy, with ``RetCode.PERMISSION_ERROR`` when the permission check fails,
    and routes any raised exception through ``server_error_response``.
    """
    try:
        # NOTE(review): this truthiness test only works if get_by_id returns
        # the record (or a falsy value); if it returns an (exists, record)
        # pair the "not found" branch can never fire — confirm.
        if not KnowledgebaseService.get_by_id(kb_id):
            return get_json_result(
                code=RetCode.DATA_ERROR,
                message="Knowledge base not found",
                data={},
            )

        # NOTE(review): confirm check_kb_team_permission expects
        # (user id, kb id) in this order and with these types.
        has_access = check_kb_team_permission(current_user.id, kb_id)
        if not has_access:
            return get_json_result(
                code=RetCode.PERMISSION_ERROR,
                message="No permission to access this knowledge base",
                data={},
            )

        return get_json_result(data=MetadataService.get_metadata_schema(kb_id))
    except Exception as exc:
        return server_error_response(exc)
|
|
|
|
|
|
@manager.route("/statistics/<kb_id>", methods=["GET"])  # noqa: F821
@login_required
async def get_metadata_statistics(kb_id):
    """
    Return metadata usage statistics for a knowledge base.

    The response data is whatever ``MetadataService.get_metadata_statistics``
    produces for ``kb_id``; the expected shape is:
    {
        "total_documents": 100,
        "documents_with_metadata": 80,
        "metadata_coverage": 0.8,
        "field_usage": {"category": 50, "author": 30},
        "unique_fields": 5
    }

    Responds with ``RetCode.DATA_ERROR`` when the knowledge base lookup is
    falsy, with ``RetCode.PERMISSION_ERROR`` when the permission check fails,
    and routes any raised exception through ``server_error_response``.
    """
    try:
        # NOTE(review): this truthiness test only works if get_by_id returns
        # the record (or a falsy value); if it returns an (exists, record)
        # pair the "not found" branch can never fire — confirm.
        lookup = KnowledgebaseService.get_by_id(kb_id)
        if not lookup:
            not_found = get_json_result(
                data={},
                message="Knowledge base not found",
                code=RetCode.DATA_ERROR,
            )
            return not_found

        # NOTE(review): confirm check_kb_team_permission expects
        # (user id, kb id) in this order and with these types.
        if check_kb_team_permission(current_user.id, kb_id):
            statistics = MetadataService.get_metadata_statistics(kb_id)
            return get_json_result(data=statistics)

        return get_json_result(
            data={},
            message="No permission to access this knowledge base",
            code=RetCode.PERMISSION_ERROR,
        )
    except Exception as exc:
        return server_error_response(exc)
|
|
|
|
|
|
@manager.route("/search", methods=["POST"])  # noqa: F821
@login_required
async def search_by_metadata():
    """
    Search documents by metadata filters.

    Request body:
    {
        "kb_id": "kb123",
        "filters": {
            "category": "Technical",
            "author": {"contains": "John"},
            "year": {"gt": 2020}
        },
        "limit": 100
    }

    Supported operators: equals, contains, starts_with, in, gt, lt

    Returns:
    [
        {"doc_id": "doc1", "doc_name": "...", "metadata": {...}},
        ...
    ]

    ``filters`` defaults to ``{}`` and ``limit`` to ``100`` when absent or
    JSON ``null``. A missing or non-object body, a missing ``kb_id``, a
    non-object ``filters`` or a ``limit`` that is not a non-negative integer
    is answered with ``RetCode.ARGUMENT_ERROR``; a failed permission check
    with ``RetCode.PERMISSION_ERROR``. Any exception raised while handling
    the request is converted into a response by ``server_error_response``.
    """
    try:
        # The parsed body is None when the request carries no JSON, and may be
        # a list/scalar for valid-but-unexpected JSON; fall back to an empty
        # object so the validation below reports an argument error.
        req = await request.json
        if not isinstance(req, dict):
            req = {}

        kb_id = req.get("kb_id")

        # An explicit JSON null is treated like an omitted key.
        filters = req.get("filters")
        if filters is None:
            filters = {}
        limit = req.get("limit")
        if limit is None:
            limit = 100

        if not kb_id:
            return get_json_result(
                data=[],
                message="kb_id is required",
                code=RetCode.ARGUMENT_ERROR
            )

        if not isinstance(filters, dict):
            return get_json_result(
                data=[],
                message="filters must be an object",
                code=RetCode.ARGUMENT_ERROR
            )

        # bool is a subclass of int, so exclude it explicitly; the limit comes
        # from the client and must not reach the service as a string or a
        # negative number.
        if isinstance(limit, bool) or not isinstance(limit, int) or limit < 0:
            return get_json_result(
                data=[],
                message="limit must be a non-negative integer",
                code=RetCode.ARGUMENT_ERROR
            )

        # Check KB access permission
        # NOTE(review): unlike the schema/statistics endpoints this handler
        # does not look the knowledge base up first; confirm
        # check_kb_team_permission copes with an unknown kb_id and expects
        # (user id, kb id) in this order.
        if not check_kb_team_permission(current_user.id, kb_id):
            return get_json_result(
                data=[],
                message="No permission to access this knowledge base",
                code=RetCode.PERMISSION_ERROR
            )

        results = MetadataService.search_by_metadata(kb_id, filters, limit)
        return get_json_result(data=results)

    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/copy", methods=["POST"])  # noqa: F821
@login_required
async def copy_metadata():
    """
    Copy metadata from one document to multiple target documents.

    Request body:
    {
        "source_doc_id": "doc1",
        "target_doc_ids": ["doc2", "doc3", ...],
        "fields": ["field1", "field2"]  // optional, copies all if not specified
    }

    Returns:
    {
        "success_count": 5,
        "failed_ids": []
    }

    A missing or non-object JSON body, a missing ``source_doc_id``, or a
    missing/empty/non-list ``target_doc_ids`` is answered with
    ``RetCode.ARGUMENT_ERROR`` and a zeroed result. Any exception raised
    while handling the request is converted into a response by
    ``server_error_response``.
    """
    try:
        # The parsed body is None when the request carries no JSON, and may be
        # a list/scalar for valid-but-unexpected JSON; fall back to an empty
        # object so the validation below reports an argument error.
        req = await request.json
        if not isinstance(req, dict):
            req = {}

        source_doc_id = req.get("source_doc_id")
        target_doc_ids = req.get("target_doc_ids", [])
        fields = req.get("fields")

        if not source_doc_id or not target_doc_ids:
            return get_json_result(
                data={"success_count": 0, "failed_ids": []},
                message="source_doc_id and target_doc_ids are required",
                code=RetCode.ARGUMENT_ERROR
            )

        # A bare string would otherwise be handed to the service as if it were
        # a collection of target IDs.
        if not isinstance(target_doc_ids, list):
            return get_json_result(
                data={"success_count": 0, "failed_ids": []},
                message="target_doc_ids must be a list",
                code=RetCode.ARGUMENT_ERROR
            )

        # NOTE(review): this handler does no permission check on the source or
        # target documents; confirm that MetadataService restricts the copy to
        # documents the current user may read and modify.
        success_count, failed_ids = MetadataService.copy_metadata(
            source_doc_id, target_doc_ids, fields
        )

        return get_json_result(data={
            "success_count": success_count,
            "failed_ids": failed_ids
        })

    except Exception as e:
        return server_error_response(e)
|