LightRAG/tests/test_citation_utils.py
clssck 59e89772de refactor: consolidate to PostgreSQL-only backend and modernize stack
Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs
Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes
Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples
Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios
Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
2025-12-12 16:28:49 +01:00

351 lines
13 KiB
Python

"""Tests for citation utility functions in lightrag/utils.py.
This module tests the helper functions used for generating citations
and reference lists from document chunks.
"""
import pytest
from lightrag.utils import (
_extract_document_title,
_generate_excerpt,
generate_reference_list_from_chunks,
)
# ============================================================================
# Tests for _extract_document_title()
# ============================================================================
class TestExtractDocumentTitle:
    """Tests for _extract_document_title function.

    Every case feeds one path-like string to the helper and checks the
    title string that comes back.
    """

    @pytest.mark.offline
    def test_regular_path(self):
        """Test extracting title from regular file path."""
        assert _extract_document_title('/path/to/document.pdf') == 'document.pdf'

    @pytest.mark.offline
    def test_nested_path(self):
        """Test extracting title from deeply nested path."""
        assert _extract_document_title('/a/b/c/d/e/report.docx') == 'report.docx'

    @pytest.mark.offline
    def test_s3_path(self):
        """Test extracting title from S3 URL."""
        assert _extract_document_title('s3://bucket/archive/default/doc123/report.pdf') == 'report.pdf'

    @pytest.mark.offline
    def test_s3_path_simple(self):
        """Test extracting title from simple S3 URL."""
        assert _extract_document_title('s3://mybucket/file.txt') == 'file.txt'

    @pytest.mark.offline
    def test_empty_string(self):
        """Test with empty string returns empty."""
        assert _extract_document_title('') == ''

    @pytest.mark.offline
    def test_trailing_slash(self):
        """Test path with trailing slash returns empty."""
        assert _extract_document_title('/path/to/dir/') == ''

    @pytest.mark.offline
    def test_filename_only(self):
        """Test with just a filename (no path)."""
        assert _extract_document_title('document.pdf') == 'document.pdf'

    @pytest.mark.offline
    def test_no_extension(self):
        """Test filename without extension."""
        assert _extract_document_title('/path/to/README') == 'README'

    @pytest.mark.offline
    def test_windows_style_path(self):
        """Test Windows-style path (backslashes).

        Exactly two results are accepted: the bare filename (backslash is
        treated as a separator) or the untouched input (backslash is not a
        separator, as with POSIX basename semantics).
        """
        windows_path = 'C:\\Users\\docs\\file.pdf'
        result = _extract_document_title(windows_path)
        # NOTE(review): the helper presumably relies on os.path.basename, which
        # does not split on backslashes on Unix -- confirm against lightrag/utils.py.
        # Pin the two legitimate outcomes exactly: a substring check would also
        # accept a wrong title such as 'docs\\file.pdf'.
        assert result in ('file.pdf', windows_path)

    @pytest.mark.offline
    def test_special_characters(self):
        """Test filename with special characters."""
        assert _extract_document_title('/path/to/my file (1).pdf') == 'my file (1).pdf'

    @pytest.mark.offline
    def test_unicode_filename(self):
        """Test filename with unicode characters."""
        assert _extract_document_title('/path/to/文档.pdf') == '文档.pdf'
# ============================================================================
# Tests for _generate_excerpt()
# ============================================================================
class TestGenerateExcerpt:
    """Behaviour checks for the _generate_excerpt helper."""

    @pytest.mark.offline
    def test_short_content(self):
        """Short text is returned unchanged when max_length is left at its default."""
        excerpt = _generate_excerpt('Hello world')
        assert excerpt == 'Hello world'

    @pytest.mark.offline
    def test_exact_length(self):
        """Text whose length equals max_length is returned as-is."""
        text = 'a' * 150
        excerpt = _generate_excerpt(text, max_length=150)
        # Hitting the limit exactly must not append an ellipsis.
        assert excerpt == text

    @pytest.mark.offline
    def test_long_content_truncated(self):
        """Text longer than max_length is cut and suffixed with an ellipsis."""
        text = 'a' * 200
        excerpt = _generate_excerpt(text, max_length=150)
        # 150 kept characters followed by the three-character '...'.
        assert len(excerpt) == 153
        assert excerpt.endswith('...')

    @pytest.mark.offline
    def test_empty_string(self):
        """An empty input yields an empty excerpt."""
        excerpt = _generate_excerpt('')
        assert excerpt == ''

    @pytest.mark.offline
    def test_whitespace_stripped(self):
        """Surrounding whitespace is removed from the excerpt."""
        excerpt = _generate_excerpt(' hello world ')
        assert excerpt == 'hello world'

    @pytest.mark.offline
    def test_whitespace_only(self):
        """Input made only of whitespace yields an empty excerpt."""
        excerpt = _generate_excerpt(' \n\t ')
        assert excerpt == ''

    @pytest.mark.offline
    def test_custom_max_length(self):
        """A caller-supplied max_length controls where the text is cut."""
        text = 'This is a test sentence for excerpts.'
        excerpt = _generate_excerpt(text, max_length=10)
        # The cut lands after a space; that trailing space is dropped before
        # the ellipsis is appended.
        assert excerpt == 'This is a...'

    @pytest.mark.offline
    def test_unicode_content(self):
        """Non-ASCII text is truncated by character count."""
        text = '日本語テキスト' * 50  # 7 characters repeated 50 times = 350
        excerpt = _generate_excerpt(text, max_length=150)
        # 150 kept characters plus '...'.
        assert len(excerpt) == 153

    @pytest.mark.offline
    def test_newlines_preserved(self):
        """Newlines inside the text survive excerpt generation."""
        text = 'Line 1\nLine 2'
        excerpt = _generate_excerpt(text)
        assert excerpt == 'Line 1\nLine 2'

    @pytest.mark.offline
    def test_very_short_max_length(self):
        """A very small max_length still produces prefix plus ellipsis."""
        excerpt = _generate_excerpt('Hello world', max_length=5)
        assert excerpt == 'Hello...'
# ============================================================================
# Tests for generate_reference_list_from_chunks()
# ============================================================================
class TestGenerateReferenceListFromChunks:
    """Behaviour checks for generate_reference_list_from_chunks.

    The function is called with a list of chunk dicts and returns a pair:
    the reference list and the list of updated chunks.
    """

    @pytest.mark.offline
    def test_empty_chunks(self):
        """No chunks in means empty reference list and empty chunk list out."""
        references, annotated = generate_reference_list_from_chunks([])
        assert references == []
        assert annotated == []

    @pytest.mark.offline
    def test_single_chunk(self):
        """One chunk yields one fully populated reference entry."""
        only_chunk = {
            'file_path': '/path/to/doc.pdf',
            'content': 'This is the content.',
            's3_key': 'archive/doc.pdf',
        }
        references, annotated = generate_reference_list_from_chunks([only_chunk])
        assert len(references) == 1
        entry = references[0]
        assert entry['reference_id'] == '1'
        assert entry['file_path'] == '/path/to/doc.pdf'
        assert entry['document_title'] == 'doc.pdf'
        assert entry['s3_key'] == 'archive/doc.pdf'
        assert entry['excerpt'] == 'This is the content.'
        assert len(annotated) == 1
        assert annotated[0]['reference_id'] == '1'

    @pytest.mark.offline
    def test_multiple_chunks_same_file(self):
        """Chunks sharing a file_path collapse into one reference and share its id."""
        same_file_chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 1'},
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 2'},
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 3'},
        ]
        references, annotated = generate_reference_list_from_chunks(same_file_chunks)
        assert len(references) == 1
        assert references[0]['reference_id'] == '1'
        # Every returned chunk must point at that single reference.
        assert all(item['reference_id'] == '1' for item in annotated)

    @pytest.mark.offline
    def test_multiple_files_deduplication(self):
        """Distinct files each get one entry with a distinct reference_id."""
        mixed_chunks = [
            {'file_path': '/path/doc1.pdf', 'content': 'Content 1'},
            {'file_path': '/path/doc2.pdf', 'content': 'Content 2'},
            {'file_path': '/path/doc1.pdf', 'content': 'Content 1 more'},
        ]
        references, _annotated = generate_reference_list_from_chunks(mixed_chunks)
        assert len(references) == 2
        # doc1 occurs twice and doc2 once, so doc1 is expected to rank first.
        id_by_path = {}
        for entry in references:
            id_by_path[entry['file_path']] = entry['reference_id']
        assert id_by_path['/path/doc1.pdf'] == '1'
        assert id_by_path['/path/doc2.pdf'] == '2'

    @pytest.mark.offline
    def test_prioritization_by_frequency(self):
        """The file with more chunks receives the lower reference_id."""
        weighted_chunks = [
            {'file_path': '/rare.pdf', 'content': 'Rare'},
            {'file_path': '/common.pdf', 'content': 'Common 1'},
            {'file_path': '/common.pdf', 'content': 'Common 2'},
            {'file_path': '/common.pdf', 'content': 'Common 3'},
            {'file_path': '/rare.pdf', 'content': 'Rare 2'},
        ]
        references, _ = generate_reference_list_from_chunks(weighted_chunks)
        # Three occurrences of common.pdf beat two of rare.pdf, even though
        # rare.pdf shows up first in the input.
        top, runner_up = references[0], references[1]
        assert top['file_path'] == '/common.pdf'
        assert top['reference_id'] == '1'
        assert runner_up['file_path'] == '/rare.pdf'
        assert runner_up['reference_id'] == '2'

    @pytest.mark.offline
    def test_unknown_source_filtered(self):
        """Chunks whose file_path is 'unknown_source' produce no reference."""
        mixed_chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Valid'},
            {'file_path': 'unknown_source', 'content': 'Unknown'},
            {'file_path': '/path/doc2.pdf', 'content': 'Valid 2'},
        ]
        references, annotated = generate_reference_list_from_chunks(mixed_chunks)
        # Only the two real files may appear in the reference list.
        assert len(references) == 2
        assert all(entry['file_path'] != 'unknown_source' for entry in references)
        # The filtered chunk is still returned, but with a blank reference_id.
        assert annotated[1]['reference_id'] == ''

    @pytest.mark.offline
    def test_empty_file_path_filtered(self):
        """Chunks with an empty or absent file_path produce no reference."""
        mixed_chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Valid'},
            {'file_path': '', 'content': 'No path'},
            {'content': 'Missing path key'},
        ]
        references, _annotated = generate_reference_list_from_chunks(mixed_chunks)
        assert len(references) == 1
        assert references[0]['file_path'] == '/path/doc.pdf'

    @pytest.mark.offline
    def test_s3_key_included(self):
        """A chunk's s3_key is carried into its reference entry."""
        s3_chunk = {
            'file_path': 's3://bucket/archive/doc.pdf',
            'content': 'S3 content',
            's3_key': 'archive/doc.pdf',
        }
        references, _ = generate_reference_list_from_chunks([s3_chunk])
        entry = references[0]
        assert entry['s3_key'] == 'archive/doc.pdf'
        assert entry['document_title'] == 'doc.pdf'

    @pytest.mark.offline
    def test_excerpt_generated_from_first_chunk(self):
        """A reference's excerpt comes from the first chunk seen for that file."""
        same_file_chunks = [
            {'file_path': '/doc.pdf', 'content': 'First chunk content'},
            {'file_path': '/doc.pdf', 'content': 'Second chunk different'},
        ]
        references, _ = generate_reference_list_from_chunks(same_file_chunks)
        # The later chunk's content must not replace the first one's.
        assert references[0]['excerpt'] == 'First chunk content'

    @pytest.mark.offline
    def test_excerpt_added_to_each_chunk(self):
        """Every returned chunk carries an excerpt of its own content."""
        same_file_chunks = [
            {'file_path': '/doc.pdf', 'content': 'First chunk'},
            {'file_path': '/doc.pdf', 'content': 'Second chunk'},
        ]
        _, annotated = generate_reference_list_from_chunks(same_file_chunks)
        assert annotated[0]['excerpt'] == 'First chunk'
        assert annotated[1]['excerpt'] == 'Second chunk'

    @pytest.mark.offline
    def test_original_chunks_not_modified(self):
        """The caller's chunk dicts are left without a reference_id."""
        caller_chunks = [
            {'file_path': '/doc.pdf', 'content': 'Content'},
        ]
        _, annotated = generate_reference_list_from_chunks(caller_chunks)
        # The input dict stays untouched ...
        assert 'reference_id' not in caller_chunks[0]
        # ... while the returned chunk has gained the key.
        assert 'reference_id' in annotated[0]

    @pytest.mark.offline
    def test_missing_s3_key_is_none(self):
        """A chunk without s3_key yields a reference whose s3_key is None."""
        local_chunk = {'file_path': '/local/doc.pdf', 'content': 'Local file'}
        references, _ = generate_reference_list_from_chunks([local_chunk])
        assert references[0]['s3_key'] is None

    @pytest.mark.offline
    def test_tie_breaking_by_first_appearance(self):
        """Files with equal chunk counts keep their order of first appearance."""
        tied_chunks = [
            {'file_path': '/doc_b.pdf', 'content': 'B first'},
            {'file_path': '/doc_a.pdf', 'content': 'A second'},
            {'file_path': '/doc_b.pdf', 'content': 'B again'},
            {'file_path': '/doc_a.pdf', 'content': 'A again'},
        ]
        references, _ = generate_reference_list_from_chunks(tied_chunks)
        # Two chunks each; doc_b was seen first, so it must not be reordered
        # behind doc_a (e.g. by an alphabetical sort).
        leader, follower = references[0], references[1]
        assert leader['file_path'] == '/doc_b.pdf'
        assert leader['reference_id'] == '1'
        assert follower['file_path'] == '/doc_a.pdf'
        assert follower['reference_id'] == '2'