Format entire codebase with ruff and add type hints across all modules: - Apply ruff formatting to all Python files (121 files, 17K insertions) - Add type hints to function signatures throughout lightrag core and API - Update test suite with improved type annotations and docstrings - Add pyrightconfig.json for static type checking configuration - Create prompt_optimized.py and test_extraction_prompt_ab.py test files - Update ruff.toml and .gitignore for improved linting configuration - Standardize code style across examples, reproduce scripts, and utilities
103 lines
4.2 KiB
Python
103 lines
4.2 KiB
Python
"""
|
|
Test for overlap_tokens validation to prevent infinite loop.
|
|
|
|
This test validates the fix for the bug where overlap_tokens >= max_tokens
|
|
causes an infinite loop in the chunking function.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from lightrag.rerank import chunk_documents_for_rerank
|
|
|
|
|
|
@pytest.mark.offline
|
|
class TestOverlapValidation:
|
|
"""Test suite for overlap_tokens validation"""
|
|
|
|
def test_overlap_greater_than_max_tokens(self) -> None:
|
|
"""Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
|
|
documents = [' '.join([f'word{i}' for i in range(100)])]
|
|
|
|
# This should clamp overlap_tokens to 29 (max_tokens - 1)
|
|
chunked_docs, doc_indices = chunk_documents_for_rerank(documents, max_tokens=30, overlap_tokens=32)
|
|
|
|
# Should complete without hanging
|
|
assert len(chunked_docs) > 0
|
|
assert all(idx == 0 for idx in doc_indices)
|
|
|
|
def test_overlap_equal_to_max_tokens(self) -> None:
|
|
"""Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
|
|
documents = [' '.join([f'word{i}' for i in range(100)])]
|
|
|
|
# This should clamp overlap_tokens to 29 (max_tokens - 1)
|
|
chunked_docs, doc_indices = chunk_documents_for_rerank(documents, max_tokens=30, overlap_tokens=30)
|
|
|
|
# Should complete without hanging
|
|
assert len(chunked_docs) > 0
|
|
assert all(idx == 0 for idx in doc_indices)
|
|
|
|
def test_overlap_slightly_less_than_max_tokens(self) -> None:
|
|
"""Test that overlap_tokens < max_tokens works normally"""
|
|
documents = [' '.join([f'word{i}' for i in range(100)])]
|
|
|
|
# This should work without clamping
|
|
chunked_docs, doc_indices = chunk_documents_for_rerank(documents, max_tokens=30, overlap_tokens=29)
|
|
|
|
# Should complete successfully
|
|
assert len(chunked_docs) > 0
|
|
assert all(idx == 0 for idx in doc_indices)
|
|
|
|
def test_small_max_tokens_with_large_overlap(self) -> None:
|
|
"""Test edge case with very small max_tokens"""
|
|
documents = [' '.join([f'word{i}' for i in range(50)])]
|
|
|
|
# max_tokens=5, overlap_tokens=10 should clamp to 4
|
|
chunked_docs, doc_indices = chunk_documents_for_rerank(documents, max_tokens=5, overlap_tokens=10)
|
|
|
|
# Should complete without hanging
|
|
assert len(chunked_docs) > 0
|
|
assert all(idx == 0 for idx in doc_indices)
|
|
|
|
def test_multiple_documents_with_invalid_overlap(self) -> None:
|
|
"""Test multiple documents with overlap_tokens >= max_tokens"""
|
|
documents = [
|
|
' '.join([f'word{i}' for i in range(50)]),
|
|
'short document',
|
|
' '.join([f'word{i}' for i in range(75)]),
|
|
]
|
|
|
|
# overlap_tokens > max_tokens
|
|
chunked_docs, _ = chunk_documents_for_rerank(documents, max_tokens=25, overlap_tokens=30)
|
|
|
|
# Should complete successfully and chunk the long documents
|
|
assert len(chunked_docs) >= len(documents)
|
|
# Short document should not be chunked
|
|
assert 'short document' in chunked_docs
|
|
|
|
def test_normal_operation_unaffected(self) -> None:
|
|
"""Test that normal cases continue to work correctly"""
|
|
documents = [
|
|
' '.join([f'word{i}' for i in range(100)]),
|
|
'short doc',
|
|
]
|
|
|
|
# Normal case: overlap_tokens (10) < max_tokens (50)
|
|
chunked_docs, doc_indices = chunk_documents_for_rerank(documents, max_tokens=50, overlap_tokens=10)
|
|
|
|
# Long document should be chunked, short one should not
|
|
assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short)
|
|
assert 'short doc' in chunked_docs
|
|
# Verify doc_indices maps "short doc" to document index 1
|
|
short_doc_idx = chunked_docs.index('short doc')
|
|
assert doc_indices[short_doc_idx] == 1
|
|
|
|
def test_edge_case_max_tokens_one(self) -> None:
|
|
"""Test edge case where max_tokens=1"""
|
|
documents = [' '.join([f'word{i}' for i in range(20)])]
|
|
|
|
# max_tokens=1, overlap_tokens=5 should clamp to 0
|
|
chunked_docs, doc_indices = chunk_documents_for_rerank(documents, max_tokens=1, overlap_tokens=5)
|
|
|
|
# Should complete without hanging
|
|
assert len(chunked_docs) > 0
|
|
assert all(idx == 0 for idx in doc_indices)
|