diff --git a/lightrag/rerank.py b/lightrag/rerank.py
index b3892d56..81632b71 100644
--- a/lightrag/rerank.py
+++ b/lightrag/rerank.py
@@ -39,6 +39,18 @@ def chunk_documents_for_rerank(
         - chunked_documents: List of document chunks (may be more than input)
         - original_doc_indices: Maps each chunk back to its original document index
     """
+    # Clamp overlap_tokens to ensure the loop always advances
+    # If overlap_tokens >= max_tokens, the chunking loop would hang
+    if overlap_tokens >= max_tokens:
+        original_overlap = overlap_tokens
+        # Ensure overlap is at least 1 token less than max to guarantee progress
+        # For very small max_tokens (e.g., 1), set overlap to 0
+        overlap_tokens = max(0, max_tokens - 1)
+        logger.warning(
+            f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
+            f"Clamping to {overlap_tokens} to prevent infinite loop."
+        )
+
     try:
         from .utils import TiktokenTokenizer
 
diff --git a/tests/test_overlap_validation.py b/tests/test_overlap_validation.py
new file mode 100644
index 00000000..7f84a3cf
--- /dev/null
+++ b/tests/test_overlap_validation.py
@@ -0,0 +1,113 @@
+"""
+Test for overlap_tokens validation to prevent infinite loop.
+
+This test validates the fix for the bug where overlap_tokens >= max_tokens
+causes an infinite loop in the chunking function.
+"""
+
+from lightrag.rerank import chunk_documents_for_rerank
+
+
+class TestOverlapValidation:
+    """Test suite for overlap_tokens validation"""
+
+    def test_overlap_greater_than_max_tokens(self):
+        """Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
+        documents = [" ".join([f"word{i}" for i in range(100)])]
+
+        # This should clamp overlap_tokens to 29 (max_tokens - 1)
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=30, overlap_tokens=32
+        )
+
+        # Should complete without hanging
+        assert len(chunked_docs) > 0
+        assert all(idx == 0 for idx in doc_indices)
+
+    def test_overlap_equal_to_max_tokens(self):
+        """Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
+        documents = [" ".join([f"word{i}" for i in range(100)])]
+
+        # This should clamp overlap_tokens to 29 (max_tokens - 1)
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=30, overlap_tokens=30
+        )
+
+        # Should complete without hanging
+        assert len(chunked_docs) > 0
+        assert all(idx == 0 for idx in doc_indices)
+
+    def test_overlap_slightly_less_than_max_tokens(self):
+        """Test that overlap_tokens < max_tokens works normally"""
+        documents = [" ".join([f"word{i}" for i in range(100)])]
+
+        # This should work without clamping
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=30, overlap_tokens=29
+        )
+
+        # Should complete successfully
+        assert len(chunked_docs) > 0
+        assert all(idx == 0 for idx in doc_indices)
+
+    def test_small_max_tokens_with_large_overlap(self):
+        """Test edge case with very small max_tokens"""
+        documents = [" ".join([f"word{i}" for i in range(50)])]
+
+        # max_tokens=5, overlap_tokens=10 should clamp to 4
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=5, overlap_tokens=10
+        )
+
+        # Should complete without hanging
+        assert len(chunked_docs) > 0
+        assert all(idx == 0 for idx in doc_indices)
+
+    def test_multiple_documents_with_invalid_overlap(self):
+        """Test multiple documents with overlap_tokens >= max_tokens"""
+        documents = [
+            " ".join([f"word{i}" for i in range(50)]),
+            "short document",
+            " ".join([f"word{i}" for i in range(75)]),
+        ]
+
+        # overlap_tokens > max_tokens
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=25, overlap_tokens=30
+        )
+
+        # Should complete successfully and chunk the long documents
+        assert len(chunked_docs) >= len(documents)
+        # Short document should not be chunked
+        assert "short document" in chunked_docs
+
+    def test_normal_operation_unaffected(self):
+        """Test that normal cases continue to work correctly"""
+        documents = [
+            " ".join([f"word{i}" for i in range(100)]),
+            "short doc",
+        ]
+
+        # Normal case: overlap_tokens (10) < max_tokens (50)
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=50, overlap_tokens=10
+        )
+
+        # Long document should be chunked, short one should not
+        assert len(chunked_docs) > 2  # At least 3 chunks (2 from long doc + 1 short)
+        assert "short doc" in chunked_docs
+        # Verify doc_indices maps correctly
+        assert doc_indices[-1] == 1  # Last chunk is from second document
+
+    def test_edge_case_max_tokens_one(self):
+        """Test edge case where max_tokens=1"""
+        documents = [" ".join([f"word{i}" for i in range(20)])]
+
+        # max_tokens=1, overlap_tokens=5 should clamp to 0
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=1, overlap_tokens=5
+        )
+
+        # Should complete without hanging
+        assert len(chunked_docs) > 0
+        assert all(idx == 0 for idx in doc_indices)
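
A minimal reproduction sketch (not part of the diff) showing the case this patch guards against; it assumes a local checkout of lightrag with the patch applied:

    from lightrag.rerank import chunk_documents_for_rerank

    # Before this fix, overlap_tokens (30) >= max_tokens (30) meant the chunking
    # window never advanced, so the call below would hang. With the fix, the
    # overlap is clamped to 29, a warning is logged, and chunking completes.
    docs = [" ".join(f"word{i}" for i in range(200))]
    chunks, indices = chunk_documents_for_rerank(docs, max_tokens=30, overlap_tokens=30)
    print(len(chunks), set(indices))  # several chunks, all mapped back to document 0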