From e136da968bded7c2cc0b772ce0a383d891a3c19c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 24 Nov 2025 03:33:26 +0000
Subject: [PATCH 1/3] Initial plan

From 1d6ea0c5f7dd48d6f2c1e9ea0cafeb54478c490d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 24 Nov 2025 03:40:58 +0000
Subject: [PATCH 2/3] Fix chunking infinite loop when overlap_tokens >= max_tokens

Co-authored-by: netbrah <162479981+netbrah@users.noreply.github.com>
---
 lightrag/rerank.py               |  10 ++++
 tests/test_overlap_validation.py | 100 +++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 tests/test_overlap_validation.py

diff --git a/lightrag/rerank.py b/lightrag/rerank.py
index b3892d56..1b5d7612 100644
--- a/lightrag/rerank.py
+++ b/lightrag/rerank.py
@@ -39,6 +39,16 @@ def chunk_documents_for_rerank(
         - chunked_documents: List of document chunks (may be more than input)
         - original_doc_indices: Maps each chunk back to its original document index
     """
+    # Clamp overlap_tokens to ensure the loop always advances
+    # If overlap_tokens >= max_tokens, the chunking loop would hang
+    if overlap_tokens >= max_tokens:
+        original_overlap = overlap_tokens
+        overlap_tokens = max(1, max_tokens - 1)
+        logger.warning(
+            f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
+            f"Clamping to {overlap_tokens} to prevent infinite loop."
+        )
+
     try:
         from .utils import TiktokenTokenizer

diff --git a/tests/test_overlap_validation.py b/tests/test_overlap_validation.py
new file mode 100644
index 00000000..da364719
--- /dev/null
+++ b/tests/test_overlap_validation.py
@@ -0,0 +1,100 @@
+"""
+Test for overlap_tokens validation to prevent infinite loop.
+
+This test validates the fix for the bug where overlap_tokens >= max_tokens
+causes an infinite loop in the chunking function.
+""" + +from lightrag.rerank import chunk_documents_for_rerank + + +class TestOverlapValidation: + """Test suite for overlap_tokens validation""" + + def test_overlap_greater_than_max_tokens(self): + """Test that overlap_tokens > max_tokens is clamped and doesn't hang""" + documents = [" ".join([f"word{i}" for i in range(100)])] + + # This should clamp overlap_tokens to 29 (max_tokens - 1) + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=30, overlap_tokens=32 + ) + + # Should complete without hanging + assert len(chunked_docs) > 0 + assert all(idx == 0 for idx in doc_indices) + + def test_overlap_equal_to_max_tokens(self): + """Test that overlap_tokens == max_tokens is clamped and doesn't hang""" + documents = [" ".join([f"word{i}" for i in range(100)])] + + # This should clamp overlap_tokens to 29 (max_tokens - 1) + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=30, overlap_tokens=30 + ) + + # Should complete without hanging + assert len(chunked_docs) > 0 + assert all(idx == 0 for idx in doc_indices) + + def test_overlap_slightly_less_than_max_tokens(self): + """Test that overlap_tokens < max_tokens works normally""" + documents = [" ".join([f"word{i}" for i in range(100)])] + + # This should work without clamping + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=30, overlap_tokens=29 + ) + + # Should complete successfully + assert len(chunked_docs) > 0 + assert all(idx == 0 for idx in doc_indices) + + def test_small_max_tokens_with_large_overlap(self): + """Test edge case with very small max_tokens""" + documents = [" ".join([f"word{i}" for i in range(50)])] + + # max_tokens=5, overlap_tokens=10 should clamp to 4 + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=5, overlap_tokens=10 + ) + + # Should complete without hanging + assert len(chunked_docs) > 0 + assert all(idx == 0 for idx in doc_indices) + + def test_multiple_documents_with_invalid_overlap(self): + """Test multiple documents with overlap_tokens >= max_tokens""" + documents = [ + " ".join([f"word{i}" for i in range(50)]), + "short document", + " ".join([f"word{i}" for i in range(75)]), + ] + + # overlap_tokens > max_tokens + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=25, overlap_tokens=30 + ) + + # Should complete successfully and chunk the long documents + assert len(chunked_docs) >= len(documents) + # Short document should not be chunked + assert "short document" in chunked_docs + + def test_normal_operation_unaffected(self): + """Test that normal cases continue to work correctly""" + documents = [ + " ".join([f"word{i}" for i in range(100)]), + "short doc", + ] + + # Normal case: overlap_tokens (10) < max_tokens (50) + chunked_docs, doc_indices = chunk_documents_for_rerank( + documents, max_tokens=50, overlap_tokens=10 + ) + + # Long document should be chunked, short one should not + assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short) + assert "short doc" in chunked_docs + # Verify doc_indices maps correctly + assert doc_indices[-1] == 1 # Last chunk is from second document From 8835fc244a90017b8fc98f60017ae4e78e975c0b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 24 Nov 2025 03:43:05 +0000 Subject: [PATCH 3/3] Improve edge case handling for max_tokens=1 Co-authored-by: netbrah <162479981+netbrah@users.noreply.github.com> --- lightrag/rerank.py | 4 +++- 
 tests/test_overlap_validation.py | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lightrag/rerank.py b/lightrag/rerank.py
index 1b5d7612..81632b71 100644
--- a/lightrag/rerank.py
+++ b/lightrag/rerank.py
@@ -43,7 +43,9 @@ def chunk_documents_for_rerank(
     # If overlap_tokens >= max_tokens, the chunking loop would hang
     if overlap_tokens >= max_tokens:
         original_overlap = overlap_tokens
-        overlap_tokens = max(1, max_tokens - 1)
+        # Ensure overlap is at least 1 token less than max to guarantee progress
+        # For very small max_tokens (e.g., 1), set overlap to 0
+        overlap_tokens = max(0, max_tokens - 1)
         logger.warning(
             f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
             f"Clamping to {overlap_tokens} to prevent infinite loop."

diff --git a/tests/test_overlap_validation.py b/tests/test_overlap_validation.py
index da364719..7f84a3cf 100644
--- a/tests/test_overlap_validation.py
+++ b/tests/test_overlap_validation.py
@@ -98,3 +98,16 @@ class TestOverlapValidation:
         assert "short doc" in chunked_docs
         # Verify doc_indices maps correctly
         assert doc_indices[-1] == 1  # Last chunk is from second document
+
+    def test_edge_case_max_tokens_one(self):
+        """Test edge case where max_tokens=1"""
+        documents = [" ".join([f"word{i}" for i in range(20)])]
+
+        # max_tokens=1, overlap_tokens=5 should clamp to 0
+        chunked_docs, doc_indices = chunk_documents_for_rerank(
+            documents, max_tokens=1, overlap_tokens=5
+        )
+
+        # Should complete without hanging
+        assert len(chunked_docs) > 0
+        assert all(idx == 0 for idx in doc_indices)
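
---

Note on the fix: the clamp guarantees termination because the chunking loop can
only finish if each iteration advances the window start. The actual loop body
inside chunk_documents_for_rerank is not visible in this diff, so the
sliding-window sketch below is a hypothetical stand-in that only illustrates
the invariant being enforced: the per-iteration step
max_tokens - overlap_tokens must stay positive (with the overlap clamped to 0
when max_tokens == 1, the step is still 1).

    def sliding_windows(tokens, max_tokens, overlap_tokens):
        # Hypothetical helper, not part of lightrag; it applies the same
        # clamp as the patch so the step computed below is always >= 1
        if overlap_tokens >= max_tokens:
            overlap_tokens = max(0, max_tokens - 1)
        step = max_tokens - overlap_tokens  # >= 1 after clamping
        start = 0
        while start < len(tokens):
            yield tokens[start : start + max_tokens]
            start += step  # with step <= 0 this loop would never terminate

    # Example: max_tokens=3, overlap_tokens=5 clamps the overlap to 2, so the
    # step is 1 and the generator stops after len(tokens) windows; without the
    # clamp the step would be -2 and `start < len(tokens)` would stay true.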