# ragflow/test/unit_test/services/test_document_service.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import pytest
from unittest.mock import Mock, patch
from common.misc_utils import get_uuid


class TestDocumentService:
    """Unit tests for DocumentService call patterns and document record shape.

    ``DocumentService`` is replaced by a mock in the ``mock_doc_service``
    fixture, so the service-level tests verify the call contract (which
    arguments are passed and that the stubbed value is returned) rather than
    real persistence logic. The remaining tests validate the sample record
    produced by the ``sample_document_data`` fixture.
    """

    # Test-local allow-lists shared by the validation and parametrized tests.
    # NOTE(review): confirm these match the values the real service accepts.
    VALID_FILE_TYPES = (
        "pdf", "docx", "doc", "txt", "md", "csv",
        "xlsx", "pptx", "html", "json", "eml",
    )
    VALID_PARSER_IDS = (
        "naive", "paper", "book", "laws", "presentation", "manual",
        "qa", "table", "resume", "picture", "one", "knowledge_graph",
    )
    # Status: 0=pending, 1=completed, 2=failed
    VALID_STATUSES = ("0", "1", "2")
    # run: 0=not running, 1=running, 2=cancel
    VALID_RUN_FLAGS = ("0", "1", "2")
    # Upper bounds used only by the assertions in this file.
    MAX_DOCUMENT_SIZE = 100 * 1024 * 1024  # 100MB
    MAX_AVG_TOKENS_PER_CHUNK = 2048  # Reasonable upper limit

    @staticmethod
    def _named_doc(doc_name):
        """Return a Mock whose ``name`` attribute is ``doc_name``.

        ``Mock(name=...)`` only configures the mock's repr, so the attribute
        has to be assigned after construction.
        """
        doc = Mock()
        doc.name = doc_name
        return doc

    @pytest.fixture
    def mock_doc_service(self):
        """Yield a mock that stands in for DocumentService during a test."""
        with patch('api.db.services.document_service.DocumentService') as mock:
            yield mock

    @pytest.fixture
    def sample_document_data(self):
        """Return a fresh dict describing a fully parsed PDF document."""
        return {
            "id": get_uuid(),
            "kb_id": get_uuid(),
            "name": "test_document.pdf",
            "location": "test_document.pdf",
            "size": 1024000,  # 1MB
            "type": "pdf",
            "parser_id": "paper",
            "parser_config": {
                "chunk_token_num": 128,
                "layout_recognize": True,
            },
            "status": "1",  # Parsing completed
            "progress": 1.0,
            "progress_msg": "Parsing completed",
            "chunk_num": 50,
            "token_num": 5000,
            "run": "0",
        }

    def test_document_creation_success(self, mock_doc_service, sample_document_data):
        """Saving a document forwards every field and returns the stubbed result."""
        mock_doc_service.save.return_value = True

        result = mock_doc_service.save(**sample_document_data)

        assert result is True
        mock_doc_service.save.assert_called_once_with(**sample_document_data)

    def test_document_get_by_id_success(self, mock_doc_service, sample_document_data):
        """Looking up an existing ID yields ``(True, doc)``."""
        doc_id = sample_document_data["id"]
        mock_doc = Mock()
        mock_doc.to_dict.return_value = sample_document_data
        mock_doc_service.get_by_id.return_value = (True, mock_doc)

        exists, doc = mock_doc_service.get_by_id(doc_id)

        assert exists is True
        assert doc.to_dict() == sample_document_data
        mock_doc_service.get_by_id.assert_called_once_with(doc_id)

    def test_document_get_by_id_not_found(self, mock_doc_service):
        """Looking up an unknown ID yields ``(False, None)``."""
        mock_doc_service.get_by_id.return_value = (False, None)

        exists, doc = mock_doc_service.get_by_id("nonexistent_id")

        assert exists is False
        assert doc is None
        mock_doc_service.get_by_id.assert_called_once_with("nonexistent_id")

    def test_document_update_success(self, mock_doc_service):
        """Updating by ID passes the ID and the changed fields."""
        doc_id = get_uuid()
        update_data = {"name": "updated_document.pdf"}
        mock_doc_service.update_by_id.return_value = True

        result = mock_doc_service.update_by_id(doc_id, update_data)

        assert result is True
        mock_doc_service.update_by_id.assert_called_once_with(doc_id, update_data)

    def test_document_delete_success(self, mock_doc_service):
        """Deleting by ID passes the ID through."""
        doc_id = get_uuid()
        mock_doc_service.delete_by_id.return_value = True

        result = mock_doc_service.delete_by_id(doc_id)

        assert result is True
        mock_doc_service.delete_by_id.assert_called_once_with(doc_id)

    def test_document_list_by_kb(self, mock_doc_service):
        """Querying by knowledge base returns every stubbed document."""
        kb_id = get_uuid()
        mock_docs = [Mock() for _ in range(10)]
        mock_doc_service.query.return_value = mock_docs

        result = mock_doc_service.query(kb_id=kb_id)

        assert len(result) == 10
        mock_doc_service.query.assert_called_once_with(kb_id=kb_id)

    def test_document_file_type_validation(self, sample_document_data):
        """The sample document's type is one of the allowed file types."""
        assert sample_document_data["type"] in self.VALID_FILE_TYPES

    def test_document_size_validation(self, sample_document_data):
        """The sample document's size is positive and below the size cap."""
        size = sample_document_data["size"]

        assert 0 < size < self.MAX_DOCUMENT_SIZE  # Less than 100MB

    def test_document_parser_id_validation(self, sample_document_data):
        """The sample document's parser ID is one of the allowed parsers."""
        assert sample_document_data["parser_id"] in self.VALID_PARSER_IDS

    def test_document_status_progression(self, sample_document_data):
        """The record starts in a valid status and accepts every valid status."""
        assert sample_document_data["status"] in self.VALID_STATUSES

        for status in self.VALID_STATUSES:
            sample_document_data["status"] = status
            assert sample_document_data["status"] in self.VALID_STATUSES

    def test_document_progress_validation(self, sample_document_data):
        """Parsing progress lies in the closed interval [0.0, 1.0]."""
        progress = sample_document_data["progress"]

        assert 0.0 <= progress <= 1.0

    def test_document_chunk_count(self, sample_document_data):
        """Chunk count is a non-negative integer."""
        chunk_num = sample_document_data["chunk_num"]

        assert isinstance(chunk_num, int)
        assert chunk_num >= 0

    def test_document_token_count(self, sample_document_data):
        """Token count is a non-negative integer."""
        token_num = sample_document_data["token_num"]

        assert isinstance(token_num, int)
        assert token_num >= 0

    def test_document_parsing_pending(self, sample_document_data):
        """A pending document has status "0" and zero progress."""
        sample_document_data["status"] = "0"
        sample_document_data["progress"] = 0.0
        sample_document_data["progress_msg"] = "Waiting for parsing"

        assert sample_document_data["status"] == "0"
        assert sample_document_data["progress"] == 0.0

    def test_document_parsing_in_progress(self, sample_document_data):
        """A document being parsed has progress strictly between 0 and 1."""
        sample_document_data["status"] = "0"
        sample_document_data["progress"] = 0.5
        sample_document_data["progress_msg"] = "Parsing in progress"

        assert 0.0 < sample_document_data["progress"] < 1.0

    def test_document_parsing_completed(self, sample_document_data):
        """A completed document has status "1" and full progress."""
        sample_document_data["status"] = "1"
        sample_document_data["progress"] = 1.0
        sample_document_data["progress_msg"] = "Parsing completed"

        assert sample_document_data["status"] == "1"
        assert sample_document_data["progress"] == 1.0

    def test_document_parsing_failed(self, sample_document_data):
        """A failed document has status "2" and a message mentioning failure."""
        sample_document_data["status"] = "2"
        sample_document_data["progress_msg"] = "Parsing failed: Invalid format"

        assert sample_document_data["status"] == "2"
        assert "failed" in sample_document_data["progress_msg"].lower()

    def test_document_run_flag(self, sample_document_data):
        """The run flag is one of the allowed run values."""
        assert sample_document_data["run"] in self.VALID_RUN_FLAGS

    def test_document_batch_upload(self, mock_doc_service):
        """Saving several documents calls ``save`` once per document."""
        kb_id = get_uuid()
        doc_count = 5
        # The stub is loop-invariant, so configure it once up front.
        mock_doc_service.save.return_value = True

        for i in range(doc_count):
            doc_data = {
                "id": get_uuid(),
                "kb_id": kb_id,
                "name": f"document_{i}.pdf",
                "size": 1024 * (i + 1),
            }
            assert mock_doc_service.save(**doc_data) is True
            mock_doc_service.save.assert_called_with(**doc_data)

        assert mock_doc_service.save.call_count == doc_count

    def test_document_batch_delete(self, mock_doc_service):
        """Deleting several documents calls ``delete_by_id`` once per ID."""
        doc_ids = [get_uuid() for _ in range(5)]
        mock_doc_service.delete_by_id.return_value = True

        for doc_id in doc_ids:
            assert mock_doc_service.delete_by_id(doc_id) is True
            mock_doc_service.delete_by_id.assert_called_with(doc_id)

        assert mock_doc_service.delete_by_id.call_count == len(doc_ids)

    def test_document_search_by_name(self, mock_doc_service):
        """Searching by keyword returns documents whose names contain it."""
        kb_id = get_uuid()
        keywords = "test"
        mock_docs = [
            self._named_doc("test_doc1.pdf"),
            self._named_doc("test_doc2.pdf"),
        ]
        mock_doc_service.get_list.return_value = (mock_docs, 2)

        result, count = mock_doc_service.get_list(kb_id, 0, 0, "create_time", True, keywords)

        assert count == 2
        assert all(keywords in doc.name for doc in result)
        mock_doc_service.get_list.assert_called_once_with(
            kb_id, 0, 0, "create_time", True, keywords
        )

    def test_document_pagination(self, mock_doc_service):
        """A paged listing returns one page of items plus the overall total."""
        kb_id = get_uuid()
        page = 1
        page_size = 10
        total = 25
        mock_docs = [Mock() for _ in range(page_size)]
        mock_doc_service.get_list.return_value = (mock_docs, total)

        result, count = mock_doc_service.get_list(kb_id, page, page_size, "create_time", True, "")

        assert len(result) == page_size
        assert count == total
        mock_doc_service.get_list.assert_called_once_with(
            kb_id, page, page_size, "create_time", True, ""
        )

    def test_document_ordering(self, mock_doc_service):
        """Listing passes the order-by field and direction flag through."""
        kb_id = get_uuid()
        mock_doc_service.get_list.return_value = ([], 0)

        mock_doc_service.get_list(kb_id, 0, 0, "create_time", True, "")

        mock_doc_service.get_list.assert_called_once_with(
            kb_id, 0, 0, "create_time", True, ""
        )

    def test_document_parser_config_validation(self, sample_document_data):
        """The parser config carries a positive ``chunk_token_num``."""
        parser_config = sample_document_data["parser_config"]

        assert "chunk_token_num" in parser_config
        assert parser_config["chunk_token_num"] > 0

    def test_document_layout_recognition(self, sample_document_data):
        """The layout recognition flag is a boolean."""
        layout_recognize = sample_document_data["parser_config"]["layout_recognize"]

        assert isinstance(layout_recognize, bool)

    @pytest.mark.parametrize("file_type", VALID_FILE_TYPES)
    def test_document_different_file_types(self, file_type, sample_document_data):
        """Each allowed file type can be stored on the record."""
        sample_document_data["type"] = file_type

        assert sample_document_data["type"] == file_type
        assert file_type in self.VALID_FILE_TYPES

    def test_document_name_with_extension(self, sample_document_data):
        """The document name ends with a non-empty file extension."""
        name = sample_document_data["name"]

        assert "." in name
        extension = name.split(".")[-1]
        assert len(extension) > 0

    def test_document_location_path(self, sample_document_data):
        """The document location is present and non-empty."""
        location = sample_document_data["location"]

        assert location is not None
        assert len(location) > 0

    def test_document_stop_parsing(self, mock_doc_service):
        """Stopping parsing updates the run flag to "2" (cancel)."""
        doc_id = get_uuid()
        mock_doc_service.update_by_id.return_value = True

        result = mock_doc_service.update_by_id(doc_id, {"run": "2"})  # Cancel

        assert result is True
        mock_doc_service.update_by_id.assert_called_once_with(doc_id, {"run": "2"})

    def test_document_restart_parsing(self, mock_doc_service):
        """Restarting parsing resets status and progress and sets run to "1"."""
        doc_id = get_uuid()
        restart_fields = {
            "status": "0",
            "progress": 0.0,
            "run": "1",
        }
        mock_doc_service.update_by_id.return_value = True

        result = mock_doc_service.update_by_id(doc_id, restart_fields)

        assert result is True
        mock_doc_service.update_by_id.assert_called_once_with(doc_id, restart_fields)

    def test_document_chunk_token_ratio(self, sample_document_data):
        """Average tokens per chunk is positive and below the upper limit."""
        chunk_num = sample_document_data["chunk_num"]
        token_num = sample_document_data["token_num"]

        # A document with no chunks has no meaningful ratio to check.
        if chunk_num > 0:
            avg_tokens_per_chunk = token_num / chunk_num
            assert 0 < avg_tokens_per_chunk < self.MAX_AVG_TOKENS_PER_CHUNK

    def test_document_empty_file_handling(self):
        """An empty file is represented with zero size, chunks and tokens."""
        empty_doc = {
            "size": 0,
            "chunk_num": 0,
            "token_num": 0,
        }

        assert empty_doc["size"] == 0
        assert empty_doc["chunk_num"] == 0
        assert empty_doc["token_num"] == 0