From e136da968bded7c2cc0b772ce0a383d891a3c19c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 24 Nov 2025 03:33:26 +0000
Subject: [PATCH 1/3] Initial plan

From 1d6ea0c5f7dd48d6f2c1e9ea0cafeb54478c490d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 24 Nov 2025 03:40:58 +0000
Subject: [PATCH 2/3] Fix chunking infinite loop when overlap_tokens >= max_tokens

Co-authored-by: netbrah <162479981+netbrah@users.noreply.github.com>
---
 lightrag/rerank.py               |  10 ++++
 tests/test_overlap_validation.py | 100 +++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 tests/test_overlap_validation.py

diff --git a/lightrag/rerank.py b/lightrag/rerank.py
index b3892d56..1b5d7612 100644
--- a/lightrag/rerank.py
+++ b/lightrag/rerank.py
@@ -39,6 +39,16 @@ def chunk_documents_for_rerank(
         - chunked_documents: List of document chunks (may be more than input)
         - original_doc_indices: Maps each chunk back to its original document index
     """
+    # Clamp overlap_tokens to ensure the loop always advances
+    # If overlap_tokens >= max_tokens, the chunking loop would hang
+    if overlap_tokens >= max_tokens:
+        original_overlap = overlap_tokens
+        overlap_tokens = max(1, max_tokens - 1)
+        logger.warning(
+            f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
+            f"Clamping to {overlap_tokens} to prevent infinite loop."
+        )
+
     try:
         from .utils import TiktokenTokenizer

diff --git a/tests/test_overlap_validation.py b/tests/test_overlap_validation.py
new file mode 100644
index 00000000..da364719
--- /dev/null
+++ b/tests/test_overlap_validation.py
@@ -0,0 +1,100 @@
+"""
+Test for overlap_tokens validation to prevent infinite loop.
+
+This test validates the fix for the bug where overlap_tokens >= max_tokens
+causes an infinite loop in the chunking function.
+""" + +from lightrag.rerank import chunk_documents_for_rerank + + +class TestOverlapValidation: + """Test suite for overlap_tokens validation""" + + def test_overlap_greater_than_max_tokens(self): + """Test that overlap_tokens > max_tokens is clamped and doesn't hang""" + documents = [" ".join([f"word{i}" for i in range(100)])] + + # This should clamp overlap_tokens to 29 (max_tokens - 1) + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=30, overlap_tokens=32 + ) + + # Should complete without hanging + assert len(chunked_docs) > 0 + assert all(idx == 0 for idx in doc_indices) + + def test_overlap_equal_to_max_tokens(self): + """Test that overlap_tokens == max_tokens is clamped and doesn't hang""" + documents = [" ".join([f"word{i}" for i in range(100)])] + + # This should clamp overlap_tokens to 29 (max_tokens - 1) + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=30, overlap_tokens=30 + ) + + # Should complete without hanging + assert len(chunked_docs) > 0 + assert all(idx == 0 for idx in doc_indices) + + def test_overlap_slightly_less_than_max_tokens(self): + """Test that overlap_tokens < max_tokens works normally""" + documents = [" ".join([f"word{i}" for i in range(100)])] + + # This should work without clamping + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=30, overlap_tokens=29 + ) + + # Should complete successfully + assert len(chunked_docs) > 0 + assert all(idx == 0 for idx in doc_indices) + + def test_small_max_tokens_with_large_overlap(self): + """Test edge case with very small max_tokens""" + documents = [" ".join([f"word{i}" for i in range(50)])] + + # max_tokens=5, overlap_tokens=10 should clamp to 4 + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=5, overlap_tokens=10 + ) + + # Should complete without hanging + assert len(chunked_docs) > 0 + assert all(idx == 0 for idx in doc_indices) + + def test_multiple_documents_with_invalid_overlap(self): + """Test multiple documents with overlap_tokens >= max_tokens""" + documents = [ + " ".join([f"word{i}" for i in range(50)]), + "short document", + " ".join([f"word{i}" for i in range(75)]), + ] + + # overlap_tokens > max_tokens + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=25, overlap_tokens=30 + ) + + # Should complete successfully and chunk the long documents + assert len(chunked_docs) >= len(documents) + # Short document should not be chunked + assert "short document" in chunked_docs + + def test_normal_operation_unaffected(self): + """Test that normal cases continue to work correctly""" + documents = [ + " ".join([f"word{i}" for i in range(100)]), + "short doc", + ] + + # Normal case: overlap_tokens (10) < max_tokens (50) + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=50, overlap_tokens=10 + ) + + # Long document should be chunked, short one should not + assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short) + assert "short doc" in chunked_docs + # Verify doc_indices maps correctly + assert doc_indices[-1] == 1 # Last chunk is from second document From 8835fc244a90017b8fc98f60017ae4e78e975c0b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 24 Nov 2025 03:43:05 +0000 Subject: [PATCH 3/3] Improve edge case handling for max_tokens=1 Co-authored-by: netbrah <162479981+netbrah@users.noreply.github.com> --- lightrag/rerank.py | 4 +++- 
 tests/test_overlap_validation.py | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lightrag/rerank.py b/lightrag/rerank.py
index 1b5d7612..81632b71 100644
--- a/lightrag/rerank.py
+++ b/lightrag/rerank.py
@@ -43,7 +43,9 @@ def chunk_documents_for_rerank(
     # If overlap_tokens >= max_tokens, the chunking loop would hang
     if overlap_tokens >= max_tokens:
         original_overlap = overlap_tokens
-        overlap_tokens = max(1, max_tokens - 1)
+        # Ensure overlap is at least 1 token less than max to guarantee progress
+        # For very small max_tokens (e.g., 1), set overlap to 0
+        overlap_tokens = max(0, max_tokens - 1)
         logger.warning(
             f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
             f"Clamping to {overlap_tokens} to prevent infinite loop."

diff --git a/tests/test_overlap_validation.py b/tests/test_overlap_validation.py
index da364719..7f84a3cf 100644
--- a/tests/test_overlap_validation.py
+++ b/tests/test_overlap_validation.py
@@ -98,3 +98,16 @@ class TestOverlapValidation:
         assert "short doc" in chunked_docs
         # Verify doc_indices maps correctly
         assert doc_indices[-1] == 1  # Last chunk is from second document
+
+    def test_edge_case_max_tokens_one(self):
+        """Test edge case where max_tokens=1"""
+        documents = [" ".join([f"word{i}" for i in range(20)])]
+
+        # max_tokens=1, overlap_tokens=5 should clamp to 0
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=1, overlap_tokens=5
+        )
+
+        # Should complete without hanging
+        assert len(chunked_docs) > 0
+        assert all(idx == 0 for idx in doc_indices)
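
---

Note on the fix: the clamp guarantees termination because the chunking loop can
only finish if each iteration advances the window start. The actual loop body
inside chunk_documents_for_rerank is not visible in this diff, so the
sliding-window sketch below is a hypothetical stand-in that only illustrates
the invariant being enforced: the per-iteration step
max_tokens - overlap_tokens must stay positive (with the overlap clamped to 0
when max_tokens == 1, the step is still 1).

    def sliding_windows(tokens, max_tokens, overlap_tokens):
        # Hypothetical helper, not part of lightrag; it applies the same
        # clamp as the patch so the step computed below is always >= 1
        if overlap_tokens >= max_tokens:
            overlap_tokens = max(0, max_tokens - 1)
        step = max_tokens - overlap_tokens  # >= 1 after clamping
        start = 0
        while start < len(tokens):
            yield tokens[start : start + max_tokens]
            start += step  # with step <= 0 this loop would never terminate

    # Example: max_tokens=3, overlap_tokens=5 clamps the overlap to 2, so the
    # step is 1 and the generator stops after len(tokens) windows; without the
    # clamp the step would be -2 and `start < len(tokens)` would stay true.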