diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index ba52bd61c..8adbb175c 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -718,6 +718,251 @@ async def set_meta():
         return server_error_response(e)
 
 
+def _doc_ids_error(doc_ids):
+    """Return an error message if doc_ids is not a usable ID list, else None.
+
+    A usable list is non-empty and contains only strings. Rejecting other
+    element types up front matters because the batch endpoints use each ID as
+    a key of their per-document result dict, and an unhashable ID (a nested
+    list or dict) would raise there instead of producing an error entry.
+    """
+    if not isinstance(doc_ids, list) or not doc_ids:
+        return "doc_ids must be a non-empty list"
+    if not all(isinstance(doc_id, str) for doc_id in doc_ids):
+        return "doc_ids must contain only strings"
+    return None
+
+
+def _parse_meta(raw_meta):
+    """Decode and validate a metadata payload.
+
+    Args:
+        raw_meta: A dict, or a JSON string that decodes to one.
+
+    Returns:
+        A ``(meta, error)`` tuple: ``(dict, None)`` when the payload is valid,
+        ``(None, message)`` otherwise.
+    """
+    if isinstance(raw_meta, str):
+        try:
+            raw_meta = json.loads(raw_meta)
+        except (ValueError, RecursionError) as e:
+            return None, f"Json syntax error: {e}"
+    if not isinstance(raw_meta, dict):
+        return None, "Only dictionary type supported."
+    for value in raw_meta.values():
+        if not isinstance(value, (str, int, float)):
+            return None, f"The type is not supported: {value}"
+    return raw_meta, None
+
+
+@manager.route("/batch_set_meta", methods=["POST"])  # noqa: F821
+@login_required
+@validate_request("doc_ids", "meta")
+async def batch_set_meta():
+    """
+    Batch update metadata for multiple documents.
+
+    Request body:
+    {
+        "doc_ids": ["doc_id_1", "doc_id_2", ...],
+        "meta": {"key1": "value1", "key2": 123}
+    }
+
+    Response data carries a per-document "results" dict together with the
+    "total", "success" and "failed" counts. A failure on one document is
+    recorded in its entry and does not stop the remaining updates.
+    """
+    req = await get_request_json()
+    doc_ids = req.get("doc_ids", [])
+
+    error = _doc_ids_error(doc_ids)
+    if error:
+        return get_json_result(data=False, message=error, code=RetCode.ARGUMENT_ERROR)
+
+    meta, error = _parse_meta(req["meta"])
+    if error:
+        return get_json_result(data=False, message=error, code=RetCode.ARGUMENT_ERROR)
+
+    # Drop repeated IDs (first occurrence wins) so that "total" always matches
+    # the number of entries in "results" and no document is written twice.
+    doc_ids = list(dict.fromkeys(doc_ids))
+
+    results = {}
+    success_count = 0
+
+    for doc_id in doc_ids:
+        try:
+            # Check authorization
+            if not DocumentService.accessible(doc_id, current_user.id):
+                results[doc_id] = {"success": False, "error": "No authorization"}
+                continue
+
+            # Check if document exists
+            exists, _ = DocumentService.get_by_id(doc_id)
+            if not exists:
+                results[doc_id] = {"success": False, "error": "Document not found"}
+                continue
+
+            # Update metadata
+            if DocumentService.update_by_id(doc_id, {"meta_fields": meta}):
+                results[doc_id] = {"success": True}
+                success_count += 1
+            else:
+                results[doc_id] = {"success": False, "error": "Database error"}
+
+        except Exception as e:
+            # Record the failure and carry on with the remaining documents.
+            results[doc_id] = {"success": False, "error": str(e)}
+
+    return get_json_result(
+        data={
+            "results": results,
+            "total": len(doc_ids),
+            "success": success_count,
+            "failed": len(doc_ids) - success_count
+        }
+    )
+
+
+@manager.route("/get_meta", methods=["POST"])  # noqa: F821
+@login_required
+@validate_request("doc_id")
+async def get_meta():
+    """Get metadata for a single document."""
+    req = await get_request_json()
+
+    if not DocumentService.accessible(req["doc_id"], current_user.id):
+        return get_json_result(
+            data=False,
+            message="No authorization.",
+            code=RetCode.AUTHENTICATION_ERROR
+        )
+
+    try:
+        e, doc = DocumentService.get_by_id(req["doc_id"])
+        if not e:
+            return get_data_error_result(message="Document not found!")
+
+        return get_json_result(data={
+            "doc_id": doc.id,
+            "doc_name": doc.name,
+            "meta": doc.meta_fields or {}
+        })
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route("/batch_get_meta", methods=["POST"])  # noqa: F821
+@login_required
+@validate_request("doc_ids")
+async def batch_get_meta():
+    """
+    Batch retrieve metadata for multiple documents.
+
+    Request body:
+    {
+        "doc_ids": ["doc_id_1", "doc_id_2", ...]
+    }
+
+    Response data maps each document ID either to its "doc_name", "meta" and
+    "kb_id", or to an "error" entry when the document could not be read.
+    """
+    req = await get_request_json()
+    doc_ids = req.get("doc_ids", [])
+
+    error = _doc_ids_error(doc_ids)
+    if error:
+        return get_json_result(data=False, message=error, code=RetCode.ARGUMENT_ERROR)
+
+    results = {}
+
+    for doc_id in doc_ids:
+        try:
+            if not DocumentService.accessible(doc_id, current_user.id):
+                results[doc_id] = {"error": "No authorization"}
+                continue
+
+            exists, doc = DocumentService.get_by_id(doc_id)
+            if not exists:
+                results[doc_id] = {"error": "Document not found"}
+                continue
+
+            results[doc_id] = {
+                "doc_name": doc.name,
+                "meta": doc.meta_fields or {},
+                "kb_id": doc.kb_id
+            }
+
+        except Exception as e:
+            results[doc_id] = {"error": str(e)}
+
+    return get_json_result(data=results)
+
+
+@manager.route("/list_metadata_fields", methods=["POST"])  # noqa: F821
+@login_required
+@validate_request("kb_id")
+async def list_metadata_fields():
+    """
+    List all unique metadata field names and their value types across documents in a KB.
+
+    Request body:
+    {
+        "kb_id": "kb_id_123"
+    }
+    """
+    req = await get_request_json()
+    kb_id = req.get("kb_id")
+
+    try:
+        # Check KB access
+        e, kb = KnowledgebaseService.get_by_id(kb_id)
+        if not e:
+            return get_data_error_result(message="Knowledgebase not found!")
+
+        if not check_kb_team_permission(kb, current_user.id):
+            return get_json_result(
+                data=False,
+                message="No authorization.",
+                code=RetCode.AUTHENTICATION_ERROR
+            )
+
+        # Get all documents in KB
+        docs = DocumentService.query(kb_id=kb_id)
+
+        # Collect all metadata fields and their types
+        metadata_fields = {}
+
+        for doc in docs:
+            if not doc.meta_fields:
+                continue
+
+            for key, value in doc.meta_fields.items():
+                value_type = type(value).__name__
+
+                if key not in metadata_fields:
+                    metadata_fields[key] = {
+                        "type": value_type,
+                        "example": value,
+                        "count": 1
+                    }
+                else:
+                    metadata_fields[key]["count"] += 1
+                    # Track if types vary
+                    if metadata_fields[key]["type"] != value_type:
+                        metadata_fields[key]["type"] = "mixed"
+
+        return get_json_result(data={
+            "kb_id": kb_id,
+            "total_documents": len(docs),
+            "metadata_fields": metadata_fields
+        })
+
+    except Exception as e:
+        return server_error_response(e)
+
+
 @manager.route("/upload_info", methods=["POST"])  # noqa: F821
 async def upload_info():
     files = await request.files
diff --git a/test/unit_test/api/test_batch_metadata.py b/test/unit_test/api/test_batch_metadata.py
new file mode 100644
index 000000000..ae9871f68
--- /dev/null
+++ b/test/unit_test/api/test_batch_metadata.py
@@ -0,0 +1,227 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Unit tests for batch metadata management endpoints.
+
+These tests pin the request/response data shapes and validation rules with
+local fixtures only; they do not import or call the endpoint handlers.
+"""
+
+import pytest
+import json
+from unittest.mock import Mock, patch, MagicMock
+
+
+class TestBatchMetadataEndpoints:
+    """Test batch metadata management functionality"""
+
+    def test_batch_set_meta_validation(self):
+        """Test batch_set_meta request validation"""
+        # Test empty doc_ids
+        doc_ids = []
+
+        assert isinstance(doc_ids, list)
+        assert not doc_ids  # Empty list
+
+        # Test valid doc_ids
+        doc_ids = ["doc1", "doc2", "doc3"]
+        assert isinstance(doc_ids, list)
+        assert len(doc_ids) == 3
+
+    def test_metadata_type_validation(self):
+        """Test metadata value type validation"""
+        # Valid types
+        valid_meta = {
+            "string_field": "value",
+            "int_field": 123,
+            "float_field": 45.67
+        }
+
+        for value in valid_meta.values():
+            assert isinstance(value, (str, int, float))
+
+        # Invalid types
+        invalid_values = [
+            {"list_field": [1, 2, 3]},
+            {"dict_field": {"nested": "value"}},
+        ]
+
+        for invalid in invalid_values:
+            for value in invalid.values():
+                assert not isinstance(value, (str, int, float))
+
+    def test_batch_results_structure(self):
+        """Test batch operation results structure"""
+        results = {
+            "results": {
+                "doc1": {"success": True},
+                "doc2": {"success": False, "error": "Not found"},
+                "doc3": {"success": True}
+            },
+            "total": 3,
+            "success": 2,
+            "failed": 1
+        }
+
+        assert "results" in results
+        assert "total" in results
+        assert "success" in results
+        assert "failed" in results
+        assert results["total"] == 3
+        assert results["success"] == 2
+        assert results["failed"] == 1
+
+    def test_get_meta_response_structure(self):
+        """Test get_meta response structure"""
+        response = {
+            "doc_id": "doc123",
+            "doc_name": "test.pdf",
+            "meta": {
+                "department": "HR",
+                "year": 2024
+            }
+        }
+
+        assert "doc_id" in response
+        assert "doc_name" in response
+        assert "meta" in response
+        assert isinstance(response["meta"], dict)
+
+    def test_batch_get_meta_response_structure(self):
+        """Test batch_get_meta response structure"""
+        response = {
+            "doc1": {
+                "doc_name": "file1.pdf",
+                "meta": {"dept": "HR"},
+                "kb_id": "kb123"
+            },
+            "doc2": {
+                "error": "Document not found"
+            }
+        }
+
+        assert "doc1" in response
+        assert "doc2" in response
+        assert "meta" in response["doc1"]
+        assert "error" in response["doc2"]
+
+    def test_list_metadata_fields_structure(self):
+        """Test list_metadata_fields response structure"""
+        response = {
+            "kb_id": "kb123",
+            "total_documents": 10,
+            "metadata_fields": {
+                "department": {
+                    "type": "str",
+                    "example": "HR",
+                    "count": 8
+                },
+                "year": {
+                    "type": "int",
+                    "example": 2024,
+                    "count": 10
+                },
+                "cost": {
+                    "type": "float",
+                    "example": 199.99,
+                    "count": 5
+                }
+            }
+        }
+
+        assert "kb_id" in response
+        assert "total_documents" in response
+        assert "metadata_fields" in response
+
+        for field_info in response["metadata_fields"].values():
+            assert "type" in field_info
+            assert "example" in field_info
+            assert "count" in field_info
+
+    def test_metadata_field_type_tracking(self):
+        """Test metadata field type tracking across documents"""
+        # Simulating field type analysis
+        documents = [
+            {"meta_fields": {"dept": "HR", "year": 2024}},
+            {"meta_fields": {"dept": "IT", "year": 2023}},
+            {"meta_fields": {"dept": "Finance", "year": "2024"}},  # Mixed type
+        ]
+
+        field_types = {}
+
+        for doc in documents:
+            for key, value in doc.get("meta_fields", {}).items():
+                value_type = type(value).__name__
+
+                if key not in field_types:
+                    field_types[key] = value_type
+                elif field_types[key] != value_type:
+                    field_types[key] = "mixed"
+
+        assert field_types["dept"] == "str"
+        assert field_types["year"] == "mixed"  # int and str
+
+    def test_json_metadata_parsing(self):
+        """Test JSON metadata parsing"""
+        # Test string JSON
+        meta_str = '{"department": "HR", "cost": 123.45}'
+        meta = json.loads(meta_str)
+
+        assert isinstance(meta, dict)
+        assert meta["department"] == "HR"
+        assert meta["cost"] == 123.45
+
+        # Test already parsed dict
+        meta_dict = {"department": "HR", "cost": 123.45}
+        assert isinstance(meta_dict, dict)
+
+    def test_authorization_check_logic(self):
+        """Test authorization checking logic"""
+        user_id = "user123"
+        doc_owner_id = "user123"
+
+        # Same user - authorized
+        assert user_id == doc_owner_id
+
+        # Different user - not authorized
+        other_user = "user456"
+        assert user_id != other_user
+
+    def test_batch_operation_partial_success(self):
+        """Test handling partial success in batch operations"""
+        doc_ids = ["doc1", "doc2", "doc3", "doc4"]
+
+        # Simulate results
+        results = {
+            "doc1": {"success": True},
+            "doc2": {"success": False, "error": "Not found"},
+            "doc3": {"success": True},
+            "doc4": {"success": False, "error": "No authorization"}
+        }
+
+        success_count = sum(1 for r in results.values() if r.get("success"))
+        failed_count = len(doc_ids) - success_count
+
+        assert success_count == 2
+        assert failed_count == 2
+
+        # Verify we can still return partial results
+        assert len(results) == len(doc_ids)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])