# LightRAG/tests/test_citation_utils.py

"""Tests for citation utility functions in lightrag/utils.py.

This module tests the helper functions used for generating citations
and reference lists from document chunks.
"""

import pytest

from lightrag.utils import (
    _extract_document_title,
    _generate_excerpt,
    generate_reference_list_from_chunks,
)

# ============================================================================
# Tests for _extract_document_title()
# ============================================================================


class TestExtractDocumentTitle:
    """Tests for _extract_document_title function.

    Each test passes a path-like string and checks the returned title,
    which these tests expect to be the final path component (the file
    name), or an empty string when there is no such component.
    """

    @pytest.mark.offline
    def test_regular_path(self):
        """Test extracting title from regular file path."""
        assert _extract_document_title('/path/to/document.pdf') == 'document.pdf'

    @pytest.mark.offline
    def test_nested_path(self):
        """Test extracting title from deeply nested path."""
        assert _extract_document_title('/a/b/c/d/e/report.docx') == 'report.docx'

    @pytest.mark.offline
    def test_s3_path(self):
        """Test extracting title from S3 URL."""
        assert _extract_document_title('s3://bucket/archive/default/doc123/report.pdf') == 'report.pdf'

    @pytest.mark.offline
    def test_s3_path_simple(self):
        """Test extracting title from simple S3 URL."""
        assert _extract_document_title('s3://mybucket/file.txt') == 'file.txt'

    @pytest.mark.offline
    def test_empty_string(self):
        """Test with empty string returns empty."""
        assert _extract_document_title('') == ''

    @pytest.mark.offline
    def test_trailing_slash(self):
        """Test path with trailing slash returns empty."""
        assert _extract_document_title('/path/to/dir/') == ''

    @pytest.mark.offline
    def test_filename_only(self):
        """Test with just a filename (no path)."""
        assert _extract_document_title('document.pdf') == 'document.pdf'

    @pytest.mark.offline
    def test_no_extension(self):
        """Test filename without extension."""
        assert _extract_document_title('/path/to/README') == 'README'

    @pytest.mark.offline
    def test_windows_style_path(self):
        """Test Windows-style path (backslashes).

        Whether a backslash is treated as a path separator depends on the
        platform's path rules, so exactly two outcomes are accepted: the
        bare file name, or the input returned unchanged.
        """
        windows_path = 'C:\\Users\\docs\\file.pdf'
        result = _extract_document_title(windows_path)
        # Exact membership rather than a substring check, so a partially
        # stripped value such as 'docs\\file.pdf' is rejected.
        assert result in ('file.pdf', windows_path)

    @pytest.mark.offline
    def test_special_characters(self):
        """Test filename with special characters."""
        assert _extract_document_title('/path/to/my file (1).pdf') == 'my file (1).pdf'

    @pytest.mark.offline
    def test_unicode_filename(self):
        """Test filename with unicode characters."""
        assert _extract_document_title('/path/to/文档.pdf') == '文档.pdf'


# ============================================================================
# Tests for _generate_excerpt()
# ============================================================================


@pytest.mark.offline
class TestGenerateExcerpt:
    """Behavioural checks for the _generate_excerpt helper.

    The ``offline`` marker is applied once on the class, which marks every
    test method it contains.
    """

    def test_short_content(self):
        """Content shorter than max_length is returned unchanged."""
        excerpt = _generate_excerpt('Hello world')
        assert excerpt == 'Hello world'

    def test_exact_length(self):
        """Content exactly max_length long is returned without an ellipsis."""
        limit = 150
        text = 'a' * limit
        excerpt = _generate_excerpt(text, max_length=limit)
        assert excerpt == text

    def test_long_content_truncated(self):
        """Content over max_length is cut and suffixed with an ellipsis."""
        limit = 150
        text = 'a' * 200
        excerpt = _generate_excerpt(text, max_length=limit)
        # The kept prefix is `limit` characters, followed by the 3-char '...'.
        assert len(excerpt) == limit + 3
        assert excerpt.endswith('...')

    def test_empty_string(self):
        """An empty input yields an empty excerpt."""
        excerpt = _generate_excerpt('')
        assert excerpt == ''

    def test_whitespace_stripped(self):
        """Surrounding whitespace is removed from the excerpt."""
        excerpt = _generate_excerpt('  hello world  ')
        assert excerpt == 'hello world'

    def test_whitespace_only(self):
        """Input made only of whitespace yields an empty excerpt."""
        excerpt = _generate_excerpt('   \n\t  ')
        assert excerpt == ''

    def test_custom_max_length(self):
        """A caller-supplied max_length controls the truncation point."""
        text = 'This is a test sentence for excerpts.'
        excerpt = _generate_excerpt(text, max_length=10)
        # The 10-char prefix ends in a space, which is dropped before the
        # ellipsis is appended.
        expected = 'This is a...'
        assert excerpt == expected

    def test_unicode_content(self):
        """Length is counted in characters for non-ASCII text."""
        text = '日本語テキスト' * 50  # 7 characters repeated 50 times = 350
        excerpt = _generate_excerpt(text, max_length=150)
        assert len(excerpt) == 150 + 3  # truncated prefix plus '...'

    def test_newlines_preserved(self):
        """Newlines inside the content survive excerpt generation."""
        text = 'Line 1\nLine 2'
        excerpt = _generate_excerpt(text)
        assert excerpt == 'Line 1\nLine 2'

    def test_very_short_max_length(self):
        """A very small max_length still truncates and appends an ellipsis."""
        excerpt = _generate_excerpt('Hello world', max_length=5)
        expected = 'Hello...'
        assert excerpt == expected


# ============================================================================
# Tests for generate_reference_list_from_chunks()
# ============================================================================


@pytest.mark.offline
class TestGenerateReferenceListFromChunks:
    """Behavioural checks for generate_reference_list_from_chunks.

    The function is called with a list of chunk dicts and returns a pair
    ``(reference_list, updated_chunks)``. The ``offline`` marker is applied
    once on the class, which marks every test method it contains.
    """

    def test_empty_chunks(self):
        """No chunks in means an empty reference list and no chunks out."""
        references, annotated = generate_reference_list_from_chunks([])
        assert references == []
        assert annotated == []

    def test_single_chunk(self):
        """One chunk produces one reference carrying all expected fields."""
        chunk = {
            'file_path': '/path/to/doc.pdf',
            'content': 'This is the content.',
            's3_key': 'archive/doc.pdf',
        }
        references, annotated = generate_reference_list_from_chunks([chunk])

        assert len(references) == 1
        only_ref = references[0]
        expected_fields = {
            'reference_id': '1',
            'file_path': '/path/to/doc.pdf',
            'document_title': 'doc.pdf',
            's3_key': 'archive/doc.pdf',
            'excerpt': 'This is the content.',
        }
        for field, expected in expected_fields.items():
            assert only_ref[field] == expected

        assert len(annotated) == 1
        assert annotated[0]['reference_id'] == '1'

    def test_multiple_chunks_same_file(self):
        """Chunks sharing a file_path collapse into a single reference."""
        shared_path = '/path/doc.pdf'
        chunks = [
            {'file_path': shared_path, 'content': 'Chunk 1'},
            {'file_path': shared_path, 'content': 'Chunk 2'},
            {'file_path': shared_path, 'content': 'Chunk 3'},
        ]
        references, annotated = generate_reference_list_from_chunks(chunks)

        assert len(references) == 1
        assert references[0]['reference_id'] == '1'
        # Every returned chunk must point at that single reference.
        assert all(item['reference_id'] == '1' for item in annotated)

    def test_multiple_files_deduplication(self):
        """Distinct files get distinct ids; repeated files are listed once."""
        chunks = [
            {'file_path': '/path/doc1.pdf', 'content': 'Content 1'},
            {'file_path': '/path/doc2.pdf', 'content': 'Content 2'},
            {'file_path': '/path/doc1.pdf', 'content': 'Content 1 more'},
        ]
        references, _ = generate_reference_list_from_chunks(chunks)

        assert len(references) == 2
        id_by_path = {}
        for ref in references:
            id_by_path[ref['file_path']] = ref['reference_id']
        # doc1 occurs twice and doc2 once, so doc1 takes the first id.
        assert id_by_path['/path/doc1.pdf'] == '1'
        assert id_by_path['/path/doc2.pdf'] == '2'

    def test_prioritization_by_frequency(self):
        """The file referenced by more chunks is listed first."""
        chunks = [
            {'file_path': '/rare.pdf', 'content': 'Rare'},
            {'file_path': '/common.pdf', 'content': 'Common 1'},
            {'file_path': '/common.pdf', 'content': 'Common 2'},
            {'file_path': '/common.pdf', 'content': 'Common 3'},
            {'file_path': '/rare.pdf', 'content': 'Rare 2'},
        ]
        references, _ = generate_reference_list_from_chunks(chunks)

        # /common.pdf has three chunks versus two for /rare.pdf.
        top, runner_up = references[0], references[1]
        assert top['file_path'] == '/common.pdf'
        assert top['reference_id'] == '1'
        assert runner_up['file_path'] == '/rare.pdf'
        assert runner_up['reference_id'] == '2'

    def test_unknown_source_filtered(self):
        """Chunks whose file_path is 'unknown_source' yield no reference."""
        chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Valid'},
            {'file_path': 'unknown_source', 'content': 'Unknown'},
            {'file_path': '/path/doc2.pdf', 'content': 'Valid 2'},
        ]
        references, annotated = generate_reference_list_from_chunks(chunks)

        # Only the two real files may appear in the reference list.
        assert len(references) == 2
        assert all(ref['file_path'] != 'unknown_source' for ref in references)

        # The unknown-source chunk is still returned, with a blank id.
        unknown_chunk = annotated[1]
        assert unknown_chunk['reference_id'] == ''

    def test_empty_file_path_filtered(self):
        """Chunks with an empty or absent file_path yield no reference."""
        chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Valid'},
            {'file_path': '', 'content': 'No path'},
            {'content': 'Missing path key'},
        ]
        references, _ = generate_reference_list_from_chunks(chunks)

        assert len(references) == 1
        sole_ref = references[0]
        assert sole_ref['file_path'] == '/path/doc.pdf'

    def test_s3_key_included(self):
        """The chunk's s3_key is carried into its reference entry."""
        chunk = {
            'file_path': 's3://bucket/archive/doc.pdf',
            'content': 'S3 content',
            's3_key': 'archive/doc.pdf',
        }
        references, _ = generate_reference_list_from_chunks([chunk])

        first_ref = references[0]
        assert first_ref['s3_key'] == 'archive/doc.pdf'
        assert first_ref['document_title'] == 'doc.pdf'

    def test_excerpt_generated_from_first_chunk(self):
        """A reference's excerpt comes from the file's first chunk."""
        chunks = [
            {'file_path': '/doc.pdf', 'content': 'First chunk content'},
            {'file_path': '/doc.pdf', 'content': 'Second chunk different'},
        ]
        references, _ = generate_reference_list_from_chunks(chunks)

        first_ref = references[0]
        assert first_ref['excerpt'] == 'First chunk content'

    def test_excerpt_added_to_each_chunk(self):
        """Every returned chunk carries an excerpt of its own content."""
        chunks = [
            {'file_path': '/doc.pdf', 'content': 'First chunk'},
            {'file_path': '/doc.pdf', 'content': 'Second chunk'},
        ]
        _, annotated = generate_reference_list_from_chunks(chunks)

        first_item, second_item = annotated[0], annotated[1]
        assert first_item['excerpt'] == 'First chunk'
        assert second_item['excerpt'] == 'Second chunk'

    def test_original_chunks_not_modified(self):
        """The input chunk gains no reference_id; the returned chunk has one."""
        source_chunks = [
            {'file_path': '/doc.pdf', 'content': 'Content'},
        ]
        _, annotated = generate_reference_list_from_chunks(source_chunks)

        # The caller's dict must stay free of the added key...
        assert 'reference_id' not in source_chunks[0]
        # ...while the returned chunk carries it.
        assert 'reference_id' in annotated[0]

    def test_missing_s3_key_is_none(self):
        """A chunk without s3_key produces a reference whose s3_key is None."""
        chunk = {'file_path': '/local/doc.pdf', 'content': 'Local file'}
        references, _ = generate_reference_list_from_chunks([chunk])

        first_ref = references[0]
        assert first_ref['s3_key'] is None

    def test_tie_breaking_by_first_appearance(self):
        """Files with equal chunk counts keep their first-seen order."""
        chunks = [
            {'file_path': '/doc_b.pdf', 'content': 'B first'},
            {'file_path': '/doc_a.pdf', 'content': 'A second'},
            {'file_path': '/doc_b.pdf', 'content': 'B again'},
            {'file_path': '/doc_a.pdf', 'content': 'A again'},
        ]
        references, _ = generate_reference_list_from_chunks(chunks)

        # Two chunks each; /doc_b.pdf was encountered before /doc_a.pdf.
        leading, trailing = references[0], references[1]
        assert leading['file_path'] == '/doc_b.pdf'
        assert leading['reference_id'] == '1'
        assert trailing['file_path'] == '/doc_a.pdf'
        assert trailing['reference_id'] == '2'