LightRAG/tests/test_overlap_validation.py
clssck c5f230a30c test: fix env handling, add type hints, improve docs
Improve code quality and test robustness:
- Refactor environment variable parsing in rerank config using centralized get_env_value helper
- Add return type hints to all test methods for better type safety
- Fix patch path in test from lightrag.utils to lightrag.rerank for correct import location
- Clarify batch insert endpoint behavior regarding duplicate content rejection
- Expand .dockerignore to comprehensively exclude node_modules (200MB+), Python cache files, and venv directories
- Update dependency groups: align evaluation and test extras with pytest/pre-commit/ruff tools
2025-12-03 15:02:11 +01:00

117 lines
4.3 KiB
Python

"""
Test for overlap_tokens validation to prevent infinite loop.
This test validates the fix for the bug where overlap_tokens >= max_tokens
causes an infinite loop in the chunking function.
"""
import pytest
from lightrag.rerank import chunk_documents_for_rerank
@pytest.mark.offline
class TestOverlapValidation:
"""Test suite for overlap_tokens validation"""
def test_overlap_greater_than_max_tokens(self) -> None:
"""Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
documents = [" ".join([f"word{i}" for i in range(100)])]
# This should clamp overlap_tokens to 29 (max_tokens - 1)
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=30, overlap_tokens=32
)
# Should complete without hanging
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_overlap_equal_to_max_tokens(self) -> None:
"""Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
documents = [" ".join([f"word{i}" for i in range(100)])]
# This should clamp overlap_tokens to 29 (max_tokens - 1)
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=30, overlap_tokens=30
)
# Should complete without hanging
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_overlap_slightly_less_than_max_tokens(self) -> None:
"""Test that overlap_tokens < max_tokens works normally"""
documents = [" ".join([f"word{i}" for i in range(100)])]
# This should work without clamping
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=30, overlap_tokens=29
)
# Should complete successfully
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_small_max_tokens_with_large_overlap(self) -> None:
"""Test edge case with very small max_tokens"""
documents = [" ".join([f"word{i}" for i in range(50)])]
# max_tokens=5, overlap_tokens=10 should clamp to 4
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=5, overlap_tokens=10
)
# Should complete without hanging
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_multiple_documents_with_invalid_overlap(self) -> None:
"""Test multiple documents with overlap_tokens >= max_tokens"""
documents = [
" ".join([f"word{i}" for i in range(50)]),
"short document",
" ".join([f"word{i}" for i in range(75)]),
]
# overlap_tokens > max_tokens
chunked_docs, _ = chunk_documents_for_rerank(
documents, max_tokens=25, overlap_tokens=30
)
# Should complete successfully and chunk the long documents
assert len(chunked_docs) >= len(documents)
# Short document should not be chunked
assert "short document" in chunked_docs
def test_normal_operation_unaffected(self) -> None:
"""Test that normal cases continue to work correctly"""
documents = [
" ".join([f"word{i}" for i in range(100)]),
"short doc",
]
# Normal case: overlap_tokens (10) < max_tokens (50)
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=50, overlap_tokens=10
)
# Long document should be chunked, short one should not
assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short)
assert "short doc" in chunked_docs
# Verify doc_indices maps "short doc" to document index 1
short_doc_idx = chunked_docs.index("short doc")
assert doc_indices[short_doc_idx] == 1
def test_edge_case_max_tokens_one(self) -> None:
"""Test edge case where max_tokens=1"""
documents = [" ".join([f"word{i}" for i in range(20)])]
# max_tokens=1, overlap_tokens=5 should clamp to 0
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=1, overlap_tokens=5
)
# Should complete without hanging
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)