LightRAG/tests/test_citation_utils.py
clssck 082a5a8fad test(lightrag,api): add comprehensive test coverage and S3 support
Add extensive test suites for API routes and utilities:
- Implement test_search_routes.py (406 lines) for search endpoint validation
- Implement test_upload_routes.py (724 lines) for document upload workflows
- Implement test_s3_client.py (618 lines) for S3 storage operations
- Implement test_citation_utils.py (352 lines) for citation extraction
- Implement test_chunking.py (216 lines) for text chunking validation
Add S3 storage client implementation:
- Create lightrag/storage/s3_client.py with S3 operations
- Add storage module initialization with exports
- Integrate S3 client with document upload handling
Enhance API routes and core functionality:
- Add search_routes.py with full-text and graph search endpoints
- Add upload_routes.py with multipart document upload support
- Update operate.py with bulk operations and health checks
- Enhance postgres_impl.py with bulk upsert and parameterized queries
- Update lightrag_server.py to register new API routes
- Improve utils.py with citation and formatting utilities
Update dependencies and configuration:
- Add S3 and test dependencies to pyproject.toml
- Update docker-compose.test.yml for testing environment
- Sync uv.lock with new dependencies
Apply code quality improvements across all modified files:
- Add type hints to function signatures
- Update imports and router initialization
- Fix logging and error handling
2025-12-05 23:13:39 +01:00

352 lines
13 KiB
Python

"""Tests for citation utility functions in lightrag/utils.py.
This module tests the helper functions used for generating citations
and reference lists from document chunks.
"""
import pytest
from lightrag.utils import (
_extract_document_title,
_generate_excerpt,
generate_reference_list_from_chunks,
)
# ============================================================================
# Tests for _extract_document_title()
# ============================================================================
class TestExtractDocumentTitle:
    """Tests for _extract_document_title function.

    Each test passes one path-like string to the helper and checks the
    title it returns, which in every case below is the final path
    component (or an empty string when there is none).
    """

    @pytest.mark.offline
    def test_regular_path(self):
        """Test extracting title from regular file path."""
        assert _extract_document_title('/path/to/document.pdf') == 'document.pdf'

    @pytest.mark.offline
    def test_nested_path(self):
        """Test extracting title from deeply nested path."""
        assert _extract_document_title('/a/b/c/d/e/report.docx') == 'report.docx'

    @pytest.mark.offline
    def test_s3_path(self):
        """Test extracting title from S3 URL."""
        assert _extract_document_title('s3://bucket/archive/default/doc123/report.pdf') == 'report.pdf'

    @pytest.mark.offline
    def test_s3_path_simple(self):
        """Test extracting title from simple S3 URL."""
        assert _extract_document_title('s3://mybucket/file.txt') == 'file.txt'

    @pytest.mark.offline
    def test_empty_string(self):
        """Test with empty string returns empty."""
        assert _extract_document_title('') == ''

    @pytest.mark.offline
    def test_trailing_slash(self):
        """Test path with trailing slash returns empty."""
        assert _extract_document_title('/path/to/dir/') == ''

    @pytest.mark.offline
    def test_filename_only(self):
        """Test with just a filename (no path)."""
        assert _extract_document_title('document.pdf') == 'document.pdf'

    @pytest.mark.offline
    def test_no_extension(self):
        """Test filename without extension."""
        assert _extract_document_title('/path/to/README') == 'README'

    @pytest.mark.offline
    def test_windows_style_path(self):
        """Test Windows-style path (backslashes).

        The expected title is platform-dependent, so exactly two results
        are accepted: the bare filename, or the input returned unchanged.
        """
        windows_path = 'C:\\Users\\docs\\file.pdf'
        result = _extract_document_title(windows_path)
        # NOTE(review): presumably the helper relies on os.path.basename, which
        # splits on backslashes only on Windows; elsewhere the whole string comes
        # back unchanged -- confirm against lightrag/utils.py.
        # A plain substring check would also pass for unrelated strings that
        # merely contain 'file.pdf', so compare against the two valid outcomes.
        assert result in ('file.pdf', windows_path)

    @pytest.mark.offline
    def test_special_characters(self):
        """Test filename with special characters."""
        assert _extract_document_title('/path/to/my file (1).pdf') == 'my file (1).pdf'

    @pytest.mark.offline
    def test_unicode_filename(self):
        """Test filename with unicode characters."""
        assert _extract_document_title('/path/to/文档.pdf') == '文档.pdf'
# ============================================================================
# Tests for _generate_excerpt()
# ============================================================================
class TestGenerateExcerpt:
    """Behavioural checks for the _generate_excerpt helper.

    The cases cover pass-through of short text, truncation with a
    trailing '...' once max_length is exceeded, and whitespace handling.
    """

    @pytest.mark.offline
    def test_short_content(self):
        """Text below the length limit is returned as-is."""
        excerpt = _generate_excerpt('Hello world')
        assert excerpt == 'Hello world'

    @pytest.mark.offline
    def test_exact_length(self):
        """Text of exactly max_length characters is not truncated."""
        text = 'a' * 150
        excerpt = _generate_excerpt(text, max_length=150)
        # Hitting the limit exactly must not append an ellipsis.
        assert excerpt == text

    @pytest.mark.offline
    def test_long_content_truncated(self):
        """Text over the limit is cut and suffixed with an ellipsis."""
        text = 'a' * 200
        excerpt = _generate_excerpt(text, max_length=150)
        # 150 kept characters plus the three-character '...' suffix.
        assert len(excerpt) == 153
        assert excerpt.endswith('...')

    @pytest.mark.offline
    def test_empty_string(self):
        """An empty input yields an empty excerpt."""
        excerpt = _generate_excerpt('')
        assert excerpt == ''

    @pytest.mark.offline
    def test_whitespace_stripped(self):
        """Surrounding whitespace is removed from the excerpt."""
        excerpt = _generate_excerpt(' hello world ')
        assert excerpt == 'hello world'

    @pytest.mark.offline
    def test_whitespace_only(self):
        """Input made only of whitespace yields an empty excerpt."""
        excerpt = _generate_excerpt(' \n\t ')
        assert excerpt == ''

    @pytest.mark.offline
    def test_custom_max_length(self):
        """A caller-supplied max_length controls the cut point."""
        text = 'This is a test sentence for excerpts.'
        excerpt = _generate_excerpt(text, max_length=10)
        # The first ten characters end in a space; the expected value shows
        # that space being dropped before the ellipsis is appended.
        assert excerpt == 'This is a...'

    @pytest.mark.offline
    def test_unicode_content(self):
        """Non-ASCII text is truncated by character count."""
        text = '日本語テキスト' * 50  # 7 characters repeated 50 times = 350
        excerpt = _generate_excerpt(text, max_length=150)
        # 150 kept characters plus the three-character '...' suffix.
        assert len(excerpt) == 153

    @pytest.mark.offline
    def test_newlines_preserved(self):
        """Newlines inside the text are kept in the excerpt."""
        text = 'Line 1\nLine 2'
        excerpt = _generate_excerpt(text)
        assert excerpt == 'Line 1\nLine 2'

    @pytest.mark.offline
    def test_very_short_max_length(self):
        """A very small max_length still truncates and adds an ellipsis."""
        excerpt = _generate_excerpt('Hello world', max_length=5)
        assert excerpt == 'Hello...'
# ============================================================================
# Tests for generate_reference_list_from_chunks()
# ============================================================================
class TestGenerateReferenceListFromChunks:
    """Tests for generate_reference_list_from_chunks function.

    The function is called with a list of chunk dicts and returns a
    ``(reference_list, updated_chunks)`` pair. These tests pin the
    reference numbering, the filtering of unusable file paths, and the
    fields attached to both return values.
    """

    @pytest.mark.offline
    def test_empty_chunks(self):
        """Test with empty chunk list."""
        ref_list, updated_chunks = generate_reference_list_from_chunks([])
        assert ref_list == []
        assert updated_chunks == []

    @pytest.mark.offline
    def test_single_chunk(self):
        """Test with a single chunk."""
        chunks = [
            {
                'file_path': '/path/to/doc.pdf',
                'content': 'This is the content.',
                's3_key': 'archive/doc.pdf',
            }
        ]
        ref_list, updated_chunks = generate_reference_list_from_chunks(chunks)
        assert len(ref_list) == 1
        assert ref_list[0]['reference_id'] == '1'
        assert ref_list[0]['file_path'] == '/path/to/doc.pdf'
        assert ref_list[0]['document_title'] == 'doc.pdf'
        assert ref_list[0]['s3_key'] == 'archive/doc.pdf'
        assert ref_list[0]['excerpt'] == 'This is the content.'
        assert len(updated_chunks) == 1
        assert updated_chunks[0]['reference_id'] == '1'

    @pytest.mark.offline
    def test_multiple_chunks_same_file(self):
        """Test multiple chunks from same file get same reference_id."""
        chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 1'},
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 2'},
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 3'},
        ]
        ref_list, updated_chunks = generate_reference_list_from_chunks(chunks)
        assert len(ref_list) == 1
        assert ref_list[0]['reference_id'] == '1'
        # All chunks should have same reference_id
        for chunk in updated_chunks:
            assert chunk['reference_id'] == '1'

    @pytest.mark.offline
    def test_multiple_files_deduplication(self):
        """Test multiple files are deduplicated with unique reference_ids."""
        chunks = [
            {'file_path': '/path/doc1.pdf', 'content': 'Content 1'},
            {'file_path': '/path/doc2.pdf', 'content': 'Content 2'},
            {'file_path': '/path/doc1.pdf', 'content': 'Content 1 more'},
        ]
        # Only the reference list is inspected here.
        ref_list, _ = generate_reference_list_from_chunks(chunks)
        assert len(ref_list) == 2
        # doc1 appears twice, so should be reference_id '1' (higher frequency)
        # doc2 appears once, so should be reference_id '2'
        ref_ids = {r['file_path']: r['reference_id'] for r in ref_list}
        assert ref_ids['/path/doc1.pdf'] == '1'
        assert ref_ids['/path/doc2.pdf'] == '2'

    @pytest.mark.offline
    def test_prioritization_by_frequency(self):
        """Test that references are prioritized by frequency."""
        chunks = [
            {'file_path': '/rare.pdf', 'content': 'Rare'},
            {'file_path': '/common.pdf', 'content': 'Common 1'},
            {'file_path': '/common.pdf', 'content': 'Common 2'},
            {'file_path': '/common.pdf', 'content': 'Common 3'},
            {'file_path': '/rare.pdf', 'content': 'Rare 2'},
        ]
        ref_list, _ = generate_reference_list_from_chunks(chunks)
        # common.pdf appears 3 times, rare.pdf appears 2 times
        # common.pdf should get reference_id '1'
        assert ref_list[0]['file_path'] == '/common.pdf'
        assert ref_list[0]['reference_id'] == '1'
        assert ref_list[1]['file_path'] == '/rare.pdf'
        assert ref_list[1]['reference_id'] == '2'

    @pytest.mark.offline
    def test_unknown_source_filtered(self):
        """Test that 'unknown_source' file paths are filtered out."""
        chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Valid'},
            {'file_path': 'unknown_source', 'content': 'Unknown'},
            {'file_path': '/path/doc2.pdf', 'content': 'Valid 2'},
        ]
        ref_list, updated_chunks = generate_reference_list_from_chunks(chunks)
        # unknown_source should not be in reference list
        assert len(ref_list) == 2
        file_paths = [r['file_path'] for r in ref_list]
        assert 'unknown_source' not in file_paths
        # Chunk with unknown_source should have empty reference_id
        assert updated_chunks[1]['reference_id'] == ''

    @pytest.mark.offline
    def test_empty_file_path_filtered(self):
        """Test that empty file paths are filtered out."""
        chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Valid'},
            {'file_path': '', 'content': 'No path'},
            {'content': 'Missing path key'},
        ]
        # Only the reference list is inspected here.
        ref_list, _ = generate_reference_list_from_chunks(chunks)
        assert len(ref_list) == 1
        assert ref_list[0]['file_path'] == '/path/doc.pdf'

    @pytest.mark.offline
    def test_s3_key_included(self):
        """Test that s3_key is included in reference list."""
        chunks = [
            {
                'file_path': 's3://bucket/archive/doc.pdf',
                'content': 'S3 content',
                's3_key': 'archive/doc.pdf',
            }
        ]
        ref_list, _ = generate_reference_list_from_chunks(chunks)
        assert ref_list[0]['s3_key'] == 'archive/doc.pdf'
        assert ref_list[0]['document_title'] == 'doc.pdf'

    @pytest.mark.offline
    def test_excerpt_generated_from_first_chunk(self):
        """Test that excerpt is generated from first chunk of each file."""
        chunks = [
            {'file_path': '/doc.pdf', 'content': 'First chunk content'},
            {'file_path': '/doc.pdf', 'content': 'Second chunk different'},
        ]
        ref_list, _ = generate_reference_list_from_chunks(chunks)
        # Excerpt should be from first chunk
        assert ref_list[0]['excerpt'] == 'First chunk content'

    @pytest.mark.offline
    def test_excerpt_added_to_each_chunk(self):
        """Test that each updated chunk has its own excerpt."""
        chunks = [
            {'file_path': '/doc.pdf', 'content': 'First chunk'},
            {'file_path': '/doc.pdf', 'content': 'Second chunk'},
        ]
        _, updated_chunks = generate_reference_list_from_chunks(chunks)
        assert updated_chunks[0]['excerpt'] == 'First chunk'
        assert updated_chunks[1]['excerpt'] == 'Second chunk'

    @pytest.mark.offline
    def test_original_chunks_not_modified(self):
        """Test that original chunks are not modified (returns copies)."""
        original_chunks = [
            {'file_path': '/doc.pdf', 'content': 'Content'},
        ]
        _, updated_chunks = generate_reference_list_from_chunks(original_chunks)
        # Original should not have reference_id
        assert 'reference_id' not in original_chunks[0]
        # Updated should have reference_id
        assert 'reference_id' in updated_chunks[0]

    @pytest.mark.offline
    def test_missing_s3_key_is_none(self):
        """Test that missing s3_key results in None."""
        chunks = [
            {'file_path': '/local/doc.pdf', 'content': 'Local file'},
        ]
        ref_list, _ = generate_reference_list_from_chunks(chunks)
        assert ref_list[0]['s3_key'] is None

    @pytest.mark.offline
    def test_tie_breaking_by_first_appearance(self):
        """Test that same-frequency files are ordered by first appearance."""
        chunks = [
            {'file_path': '/doc_b.pdf', 'content': 'B first'},
            {'file_path': '/doc_a.pdf', 'content': 'A second'},
            {'file_path': '/doc_b.pdf', 'content': 'B again'},
            {'file_path': '/doc_a.pdf', 'content': 'A again'},
        ]
        ref_list, _ = generate_reference_list_from_chunks(chunks)
        # Both files appear twice, but doc_b appeared first
        assert ref_list[0]['file_path'] == '/doc_b.pdf'
        assert ref_list[0]['reference_id'] == '1'
        assert ref_list[1]['file_path'] == '/doc_a.pdf'
        assert ref_list[1]['reference_id'] == '2'