Fix chunking infinite loop when overlap_tokens >= max_tokens

Co-authored-by: netbrah <162479981+netbrah@users.noreply.github.com> (cherry picked from commit 1d6ea0c5f7)
2025-11-24 03:40:58 +00:00 · 2025-11-24 03:40:58 +00:00 · 85f21aecd5
commit 85f21aecd5
parent b65ef37569
2 changed files with 22 additions and 25 deletions
--- a/lightrag/rerank.py
+++ b/lightrag/rerank.py
@@ -39,6 +39,16 @@ def chunk_documents_for_rerank(
        - chunked_documents: List of document chunks (may be more than input)
        - original_doc_indices: Maps each chunk back to its original document index
    """
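+    # Clamp overlap_tokens below max_tokens: if overlap_tokens >= max_tokens, the chunking loop would hang.
+    # NOTE(review): for max_tokens == 1 the clamp below still yields 1 (== max_tokens) — confirm max(0, ...) was intended.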
+    if overlap_tokens >= max_tokens:
+        original_overlap = overlap_tokens
+        overlap_tokens = max(1, max_tokens - 1)
+        logger.warning(
+            f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
+            f"Clamping to {overlap_tokens} to prevent infinite loop."
+        )
+    
    try:
        from .utils import TiktokenTokenizer

--- a/tests/test_overlap_validation.py
+++ b/tests/test_overlap_validation.py
@@ -14,12 +14,12 @@ class TestOverlapValidation:
    def test_overlap_greater_than_max_tokens(self):
        """Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
        documents = [" ".join([f"word{i}" for i in range(100)])]
-
+        
        # This should clamp overlap_tokens to 29 (max_tokens - 1)
        chunked_docs, doc_indices = chunk_documents_for_rerank(
            documents, max_tokens=30, overlap_tokens=32
        )
-
+        
        # Should complete without hanging
        assert len(chunked_docs) > 0
        assert all(idx == 0 for idx in doc_indices)
@@ -27,12 +27,12 @@ class TestOverlapValidation:
    def test_overlap_equal_to_max_tokens(self):
        """Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
        documents = [" ".join([f"word{i}" for i in range(100)])]
-
+        
        # This should clamp overlap_tokens to 29 (max_tokens - 1)
        chunked_docs, doc_indices = chunk_documents_for_rerank(
            documents, max_tokens=30, overlap_tokens=30
        )
-
+        
        # Should complete without hanging
        assert len(chunked_docs) > 0
        assert all(idx == 0 for idx in doc_indices)
@@ -40,12 +40,12 @@ class TestOverlapValidation:
    def test_overlap_slightly_less_than_max_tokens(self):
        """Test that overlap_tokens < max_tokens works normally"""
        documents = [" ".join([f"word{i}" for i in range(100)])]
-
+        
        # This should work without clamping
        chunked_docs, doc_indices = chunk_documents_for_rerank(
            documents, max_tokens=30, overlap_tokens=29
        )
-
+        
        # Should complete successfully
        assert len(chunked_docs) > 0
        assert all(idx == 0 for idx in doc_indices)
@@ -53,12 +53,12 @@ class TestOverlapValidation:
    def test_small_max_tokens_with_large_overlap(self):
        """Test edge case with very small max_tokens"""
        documents = [" ".join([f"word{i}" for i in range(50)])]
-
+        
        # max_tokens=5, overlap_tokens=10 should clamp to 4
        chunked_docs, doc_indices = chunk_documents_for_rerank(
            documents, max_tokens=5, overlap_tokens=10
        )
-
+        
        # Should complete without hanging
        assert len(chunked_docs) > 0
        assert all(idx == 0 for idx in doc_indices)
@@ -70,12 +70,12 @@ class TestOverlapValidation:
            "short document",
            " ".join([f"word{i}" for i in range(75)]),
        ]
-
+        
        # overlap_tokens > max_tokens
        chunked_docs, doc_indices = chunk_documents_for_rerank(
            documents, max_tokens=25, overlap_tokens=30
        )
-
+        
        # Should complete successfully and chunk the long documents
        assert len(chunked_docs) >= len(documents)
        # Short document should not be chunked
@@ -87,27 +87,14 @@ class TestOverlapValidation:
            " ".join([f"word{i}" for i in range(100)]),
            "short doc",
        ]
-
+        
        # Normal case: overlap_tokens (10) < max_tokens (50)
        chunked_docs, doc_indices = chunk_documents_for_rerank(
            documents, max_tokens=50, overlap_tokens=10
        )
-
+        
        # Long document should be chunked, short one should not
        assert len(chunked_docs) > 2  # At least 3 chunks (2 from long doc + 1 short)
        assert "short doc" in chunked_docs
        # Verify doc_indices maps correctly
        assert doc_indices[-1] == 1  # Last chunk is from second document
-
-    def test_edge_case_max_tokens_one(self):
-        """Test edge case where max_tokens=1"""
-        documents = [" ".join([f"word{i}" for i in range(20)])]
-
-        # max_tokens=1, overlap_tokens=5 should clamp to 0
-        chunked_docs, doc_indices = chunk_documents_for_rerank(
-            documents, max_tokens=1, overlap_tokens=5
-        )
-
-        # Should complete without hanging
-        assert len(chunked_docs) > 0
-        assert all(idx == 0 for idx in doc_indices)