Remove legacy storage implementations and deprecated examples: - Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends - Remove Kubernetes deployment manifests and installation scripts - Delete unofficial examples for deprecated backends and offline deployment docs Streamline core infrastructure: - Consolidate storage layer to PostgreSQL-only implementation - Add full-text search caching with FTS cache module - Implement metrics collection and monitoring pipeline - Add explain and metrics API routes Modernize frontend and tooling: - Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles - Update Dockerfile for PostgreSQL-only deployment - Add Makefile for common development tasks - Update environment and configuration examples Enhance evaluation and testing capabilities: - Add prompt optimization with DSPy and auto-tuning - Implement ground truth regeneration and variant testing - Add prompt debugging and response comparison utilities - Expand test coverage with new integration scenarios Simplify dependencies and configuration: - Remove offline-specific requirement files - Update pyproject.toml with streamlined dependencies - Add Python version pinning with .python-version - Create project guidelines in CLAUDE.md and AGENTS.md
351 lines
13 KiB
Python
351 lines
13 KiB
Python
"""Tests for citation utility functions in lightrag/utils.py.

This module tests the helper functions used for generating citations
and reference lists from document chunks.
"""
|
|
|
|
import pytest
|
|
|
|
from lightrag.utils import (
|
|
_extract_document_title,
|
|
_generate_excerpt,
|
|
generate_reference_list_from_chunks,
|
|
)
|
|
|
|
# ============================================================================
# Tests for _extract_document_title()
# ============================================================================
|
|
|
|
|
|
class TestExtractDocumentTitle:
    """Tests for the ``_extract_document_title`` helper.

    Each case feeds a path-like string to the helper and checks the title
    it returns; the expectations below treat the title as the final path
    component of the input.
    """

    @pytest.mark.offline
    def test_regular_path(self):
        """An absolute path yields its file name."""
        assert _extract_document_title('/path/to/document.pdf') == 'document.pdf'

    @pytest.mark.offline
    def test_nested_path(self):
        """A deeply nested path still yields only the file name."""
        assert _extract_document_title('/a/b/c/d/e/report.docx') == 'report.docx'

    @pytest.mark.offline
    def test_s3_path(self):
        """An S3 URL with several key segments yields the last segment."""
        assert _extract_document_title('s3://bucket/archive/default/doc123/report.pdf') == 'report.pdf'

    @pytest.mark.offline
    def test_s3_path_simple(self):
        """An S3 URL with a single-segment key yields that segment."""
        assert _extract_document_title('s3://mybucket/file.txt') == 'file.txt'

    @pytest.mark.offline
    def test_empty_string(self):
        """An empty input yields an empty title."""
        assert _extract_document_title('') == ''

    @pytest.mark.offline
    def test_trailing_slash(self):
        """A path ending in a slash yields an empty title."""
        assert _extract_document_title('/path/to/dir/') == ''

    @pytest.mark.offline
    def test_filename_only(self):
        """A bare file name (no directories) is returned unchanged."""
        assert _extract_document_title('document.pdf') == 'document.pdf'

    @pytest.mark.offline
    def test_no_extension(self):
        """A file name without an extension is returned as-is."""
        assert _extract_document_title('/path/to/README') == 'README'

    @pytest.mark.offline
    def test_windows_style_path(self):
        """A backslash-separated path yields the file name or the raw input.

        NOTE(review): presumably the helper delegates to ``os.path.basename``
        (the earlier comments on this test said so) -- confirm against
        ``lightrag/utils.py``. If it does, the result is platform dependent:
        POSIX path handling does not treat a backslash as a separator and
        returns the whole string, while Windows path handling splits on it
        and returns ``file.pdf``.
        """
        raw_path = 'C:\\Users\\docs\\file.pdf'
        result = _extract_document_title(raw_path)
        # The previous check, "'file.pdf' in result or result == raw_path",
        # had a dead second clause (raw_path itself contains 'file.pdf') and
        # accepted any string containing the file name. Accept exactly the
        # two documented outcomes instead.
        assert result in ('file.pdf', raw_path)

    @pytest.mark.offline
    def test_special_characters(self):
        """Spaces and parentheses in the file name are preserved."""
        assert _extract_document_title('/path/to/my file (1).pdf') == 'my file (1).pdf'

    @pytest.mark.offline
    def test_unicode_filename(self):
        """Non-ASCII characters in the file name are preserved."""
        assert _extract_document_title('/path/to/文档.pdf') == '文档.pdf'
|
|
|
|
|
|
# ============================================================================
# Tests for _generate_excerpt()
# ============================================================================
|
|
|
|
|
|
class TestGenerateExcerpt:
    """Exercise ``_generate_excerpt`` on short, long, blank and non-ASCII text."""

    @pytest.mark.offline
    def test_short_content(self):
        """Text that fits within the default limit comes back unchanged."""
        excerpt = _generate_excerpt('Hello world')
        assert excerpt == 'Hello world'

    @pytest.mark.offline
    def test_exact_length(self):
        """Text whose length equals ``max_length`` gets no ellipsis."""
        text = 'a' * 150
        assert _generate_excerpt(text, max_length=150) == text

    @pytest.mark.offline
    def test_long_content_truncated(self):
        """Text over the limit is cut to ``max_length`` and suffixed with '...'."""
        excerpt = _generate_excerpt('a' * 200, max_length=150)
        # 150 kept characters plus the three-character ellipsis.
        assert len(excerpt) == 153
        assert excerpt.endswith('...')

    @pytest.mark.offline
    def test_empty_string(self):
        """An empty input produces an empty excerpt."""
        excerpt = _generate_excerpt('')
        assert excerpt == ''

    @pytest.mark.offline
    def test_whitespace_stripped(self):
        """Surrounding whitespace is removed from the excerpt."""
        excerpt = _generate_excerpt(' hello world ')
        assert excerpt == 'hello world'

    @pytest.mark.offline
    def test_whitespace_only(self):
        """Input made only of whitespace produces an empty excerpt."""
        excerpt = _generate_excerpt(' \n\t ')
        assert excerpt == ''

    @pytest.mark.offline
    def test_custom_max_length(self):
        """A caller-supplied ``max_length`` controls where the text is cut."""
        sentence = 'This is a test sentence for excerpts.'
        excerpt = _generate_excerpt(sentence, max_length=10)
        # The first ten characters end in a space; the expected value shows
        # that space dropped before the ellipsis is appended.
        assert excerpt == 'This is a...'

    @pytest.mark.offline
    def test_unicode_content(self):
        """The limit is counted in characters for non-ASCII text."""
        text = '日本語テキスト' * 50  # 7 characters repeated 50 times = 350
        excerpt = _generate_excerpt(text, max_length=150)
        # 150 kept characters plus the three-character ellipsis.
        assert len(excerpt) == 153

    @pytest.mark.offline
    def test_newlines_preserved(self):
        """Newlines inside the text survive excerpt generation."""
        two_lines = 'Line 1\nLine 2'
        assert _generate_excerpt(two_lines) == 'Line 1\nLine 2'

    @pytest.mark.offline
    def test_very_short_max_length(self):
        """A very small ``max_length`` still truncates and adds the ellipsis."""
        excerpt = _generate_excerpt('Hello world', max_length=5)
        assert excerpt == 'Hello...'
|
|
|
|
|
|
# ============================================================================
# Tests for generate_reference_list_from_chunks()
# ============================================================================
|
|
|
|
|
|
class TestGenerateReferenceListFromChunks:
    """Exercise ``generate_reference_list_from_chunks`` on lists of chunk dicts.

    Every test unpacks the call's result as a pair: the reference list
    followed by the updated chunks.
    """

    @pytest.mark.offline
    def test_empty_chunks(self):
        """No input chunks gives an empty reference list and no updated chunks."""
        references, annotated = generate_reference_list_from_chunks([])
        assert references == []
        assert annotated == []

    @pytest.mark.offline
    def test_single_chunk(self):
        """One chunk produces one fully populated reference entry."""
        only_chunk = {
            'file_path': '/path/to/doc.pdf',
            'content': 'This is the content.',
            's3_key': 'archive/doc.pdf',
        }
        references, annotated = generate_reference_list_from_chunks([only_chunk])

        assert len(references) == 1
        entry = references[0]
        assert entry['reference_id'] == '1'
        assert entry['file_path'] == '/path/to/doc.pdf'
        assert entry['document_title'] == 'doc.pdf'
        assert entry['s3_key'] == 'archive/doc.pdf'
        assert entry['excerpt'] == 'This is the content.'

        assert len(annotated) == 1
        assert annotated[0]['reference_id'] == '1'

    @pytest.mark.offline
    def test_multiple_chunks_same_file(self):
        """Chunks that share a file path share a single reference_id."""
        same_file_chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 1'},
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 2'},
            {'file_path': '/path/doc.pdf', 'content': 'Chunk 3'},
        ]
        references, annotated = generate_reference_list_from_chunks(same_file_chunks)

        assert len(references) == 1
        assert references[0]['reference_id'] == '1'
        # Every updated chunk must point at that one reference.
        assert all(item['reference_id'] == '1' for item in annotated)

    @pytest.mark.offline
    def test_multiple_files_deduplication(self):
        """Distinct files each get one entry with its own reference_id."""
        mixed_chunks = [
            {'file_path': '/path/doc1.pdf', 'content': 'Content 1'},
            {'file_path': '/path/doc2.pdf', 'content': 'Content 2'},
            {'file_path': '/path/doc1.pdf', 'content': 'Content 1 more'},
        ]
        references, _ = generate_reference_list_from_chunks(mixed_chunks)

        assert len(references) == 2
        # doc1 occurs twice and doc2 once, so doc1 is expected to take id '1'
        # and doc2 id '2'.
        id_by_path = {entry['file_path']: entry['reference_id'] for entry in references}
        assert id_by_path['/path/doc1.pdf'] == '1'
        assert id_by_path['/path/doc2.pdf'] == '2'

    @pytest.mark.offline
    def test_prioritization_by_frequency(self):
        """The file with more chunks is listed first and numbered first."""
        mixed_chunks = [
            {'file_path': '/rare.pdf', 'content': 'Rare'},
            {'file_path': '/common.pdf', 'content': 'Common 1'},
            {'file_path': '/common.pdf', 'content': 'Common 2'},
            {'file_path': '/common.pdf', 'content': 'Common 3'},
            {'file_path': '/rare.pdf', 'content': 'Rare 2'},
        ]
        references, _ = generate_reference_list_from_chunks(mixed_chunks)

        # Three chunks come from common.pdf and two from rare.pdf, so
        # common.pdf is expected at position 0 with id '1'.
        top, runner_up = references[0], references[1]
        assert top['file_path'] == '/common.pdf'
        assert top['reference_id'] == '1'
        assert runner_up['file_path'] == '/rare.pdf'
        assert runner_up['reference_id'] == '2'

    @pytest.mark.offline
    def test_unknown_source_filtered(self):
        """Chunks whose file path is 'unknown_source' produce no reference."""
        mixed_chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Valid'},
            {'file_path': 'unknown_source', 'content': 'Unknown'},
            {'file_path': '/path/doc2.pdf', 'content': 'Valid 2'},
        ]
        references, annotated = generate_reference_list_from_chunks(mixed_chunks)

        # Only the two real documents may appear in the reference list.
        assert len(references) == 2
        assert all(entry['file_path'] != 'unknown_source' for entry in references)

        # The unknown-source chunk (index 1) keeps an empty reference_id.
        assert annotated[1]['reference_id'] == ''

    @pytest.mark.offline
    def test_empty_file_path_filtered(self):
        """Chunks with an empty or absent file_path produce no reference."""
        mixed_chunks = [
            {'file_path': '/path/doc.pdf', 'content': 'Valid'},
            {'file_path': '', 'content': 'No path'},
            {'content': 'Missing path key'},
        ]
        references, _ = generate_reference_list_from_chunks(mixed_chunks)

        assert len(references) == 1
        assert references[0]['file_path'] == '/path/doc.pdf'

    @pytest.mark.offline
    def test_s3_key_included(self):
        """A chunk's s3_key is carried over into its reference entry."""
        s3_chunk = {
            'file_path': 's3://bucket/archive/doc.pdf',
            'content': 'S3 content',
            's3_key': 'archive/doc.pdf',
        }
        references, _ = generate_reference_list_from_chunks([s3_chunk])

        entry = references[0]
        assert entry['s3_key'] == 'archive/doc.pdf'
        assert entry['document_title'] == 'doc.pdf'

    @pytest.mark.offline
    def test_excerpt_generated_from_first_chunk(self):
        """A reference's excerpt is taken from the file's first chunk."""
        same_file_chunks = [
            {'file_path': '/doc.pdf', 'content': 'First chunk content'},
            {'file_path': '/doc.pdf', 'content': 'Second chunk different'},
        ]
        references, _ = generate_reference_list_from_chunks(same_file_chunks)

        # The second chunk's text must not replace the first chunk's.
        assert references[0]['excerpt'] == 'First chunk content'

    @pytest.mark.offline
    def test_excerpt_added_to_each_chunk(self):
        """Every updated chunk carries an excerpt of its own content."""
        same_file_chunks = [
            {'file_path': '/doc.pdf', 'content': 'First chunk'},
            {'file_path': '/doc.pdf', 'content': 'Second chunk'},
        ]
        _, annotated = generate_reference_list_from_chunks(same_file_chunks)

        assert annotated[0]['excerpt'] == 'First chunk'
        assert annotated[1]['excerpt'] == 'Second chunk'

    @pytest.mark.offline
    def test_original_chunks_not_modified(self):
        """The input chunk dicts are left without a reference_id key."""
        source_chunks = [
            {'file_path': '/doc.pdf', 'content': 'Content'},
        ]
        _, annotated = generate_reference_list_from_chunks(source_chunks)

        # The caller's dict stays untouched ...
        assert 'reference_id' not in source_chunks[0]
        # ... while the returned chunk carries the new key.
        assert 'reference_id' in annotated[0]

    @pytest.mark.offline
    def test_missing_s3_key_is_none(self):
        """A chunk without an s3_key yields a reference whose s3_key is None."""
        local_chunk = {'file_path': '/local/doc.pdf', 'content': 'Local file'}
        references, _ = generate_reference_list_from_chunks([local_chunk])

        assert references[0]['s3_key'] is None

    @pytest.mark.offline
    def test_tie_breaking_by_first_appearance(self):
        """Files with equal chunk counts are ordered by first appearance."""
        tied_chunks = [
            {'file_path': '/doc_b.pdf', 'content': 'B first'},
            {'file_path': '/doc_a.pdf', 'content': 'A second'},
            {'file_path': '/doc_b.pdf', 'content': 'B again'},
            {'file_path': '/doc_a.pdf', 'content': 'A again'},
        ]
        references, _ = generate_reference_list_from_chunks(tied_chunks)

        # Each file has two chunks; doc_b is seen first, so it leads.
        leader, follower = references[0], references[1]
        assert leader['file_path'] == '/doc_b.pdf'
        assert leader['reference_id'] == '1'
        assert follower['file_path'] == '/doc_a.pdf'
        assert follower['reference_id'] == '2'
|