ragflow/test/unit_test/api/test_batch_metadata.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

"""
Unit tests for batch metadata management endpoints
"""

import pytest
import json
from unittest.mock import Mock, patch, MagicMock


class TestBatchMetadataEndpoints:
    """Test batch metadata management functionality"""

    def test_batch_set_meta_validation(self):
        """Test batch_set_meta request validation"""
        # Test empty doc_ids
        doc_ids = []
        meta = {"department": "HR"}

        assert isinstance(doc_ids, list)
        assert not doc_ids  # Empty list

        # Test valid doc_ids
        doc_ids = ["doc1", "doc2", "doc3"]
        assert isinstance(doc_ids, list)
        assert len(doc_ids) == 3

    def test_metadata_type_validation(self):
        """Test metadata value type validation"""
        # Valid types
        valid_meta = {
            "string_field": "value",
            "int_field": 123,
            "float_field": 45.67
        }

        for k, v in valid_meta.items():
            assert isinstance(v, (str, int, float))

        # Invalid types
        invalid_values = [
            {"list_field": [1, 2, 3]},
            {"dict_field": {"nested": "value"}},
        ]

        for invalid in invalid_values:
            for k, v in invalid.items():
                assert not isinstance(v, (str, int, float))

    def test_batch_results_structure(self):
        """Test batch operation results structure"""
        results = {
            "results": {
                "doc1": {"success": True},
                "doc2": {"success": False, "error": "Not found"},
                "doc3": {"success": True}
            },
            "total": 3,
            "success": 2,
            "failed": 1
        }

        assert "results" in results
        assert "total" in results
        assert "success" in results
        assert "failed" in results
        assert results["total"] == 3
        assert results["success"] == 2
        assert results["failed"] == 1

    def test_get_meta_response_structure(self):
        """Test get_meta response structure"""
        response = {
            "doc_id": "doc123",
            "doc_name": "test.pdf",
            "meta": {
                "department": "HR",
                "year": 2024
            }
        }

        assert "doc_id" in response
        assert "doc_name" in response
        assert "meta" in response
        assert isinstance(response["meta"], dict)

    def test_batch_get_meta_response_structure(self):
        """Test batch_get_meta response structure"""
        response = {
            "doc1": {
                "doc_name": "file1.pdf",
                "meta": {"dept": "HR"},
                "kb_id": "kb123"
            },
            "doc2": {
                "error": "Document not found"
            }
        }

        assert "doc1" in response
        assert "doc2" in response
        assert "meta" in response["doc1"]
        assert "error" in response["doc2"]

    def test_list_metadata_fields_structure(self):
        """Test list_metadata_fields response structure"""
        response = {
            "kb_id": "kb123",
            "total_documents": 10,
            "metadata_fields": {
                "department": {
                    "type": "str",
                    "example": "HR",
                    "count": 8
                },
                "year": {
                    "type": "int",
                    "example": 2024,
                    "count": 10
                },
                "cost": {
                    "type": "float",
                    "example": 199.99,
                    "count": 5
                }
            }
        }

        assert "kb_id" in response
        assert "total_documents" in response
        assert "metadata_fields" in response

        for field_name, field_info in response["metadata_fields"].items():
            assert "type" in field_info
            assert "example" in field_info
            assert "count" in field_info

    def test_metadata_field_type_tracking(self):
        """Test metadata field type tracking across documents"""
        # Simulating field type analysis
        documents = [
            {"meta_fields": {"dept": "HR", "year": 2024}},
            {"meta_fields": {"dept": "IT", "year": 2023}},
            {"meta_fields": {"dept": "Finance", "year": "2024"}},  # Mixed type
        ]

        field_types = {}

        for doc in documents:
            for key, value in doc.get("meta_fields", {}).items():
                value_type = type(value).__name__

                if key not in field_types:
                    field_types[key] = value_type
                elif field_types[key] != value_type:
                    field_types[key] = "mixed"

        assert field_types["dept"] == "str"
        assert field_types["year"] == "mixed"  # int and str

    def test_json_metadata_parsing(self):
        """Test JSON metadata parsing"""
        # Test string JSON
        meta_str = '{"department": "HR", "cost": 123.45}'
        meta = json.loads(meta_str)

        assert isinstance(meta, dict)
        assert meta["department"] == "HR"
        assert meta["cost"] == 123.45

        # Test already parsed dict
        meta_dict = {"department": "HR", "cost": 123.45}
        assert isinstance(meta_dict, dict)

    def test_authorization_check_logic(self):
        """Test authorization checking logic"""
        user_id = "user123"
        doc_owner_id = "user123"

        # Same user - authorized
        assert user_id == doc_owner_id

        # Different user - not authorized
        other_user = "user456"
        assert user_id != other_user

    def test_batch_operation_partial_success(self):
        """Test handling partial success in batch operations"""
        doc_ids = ["doc1", "doc2", "doc3", "doc4"]

        # Simulate results
        results = {
            "doc1": {"success": True},
            "doc2": {"success": False, "error": "Not found"},
            "doc3": {"success": True},
            "doc4": {"success": False, "error": "No authorization"}
        }

        success_count = sum(1 for r in results.values() if r.get("success"))
        failed_count = len(doc_ids) - success_count

        assert success_count == 2
        assert failed_count == 2

        # Verify we can still return partial results
        assert len(results) == len(doc_ids)


if __name__ == "__main__":
    pytest.main([__file__, "-v"])