Fix chunking infinite loop when overlap_tokens >= max_tokens
Co-authored-by: netbrah <162479981+netbrah@users.noreply.github.com>
(cherry picked from commit 1d6ea0c5f7)
This commit is contained in:
parent
b65ef37569
commit
85f21aecd5
2 changed files with 22 additions and 25 deletions
|
|
@@ -39,6 +39,16 @@ def chunk_documents_for_rerank(
|
||||||
- chunked_documents: List of document chunks (may be more than input)
|
- chunked_documents: List of document chunks (may be more than input)
|
||||||
- original_doc_indices: Maps each chunk back to its original document index
|
- original_doc_indices: Maps each chunk back to its original document index
|
||||||
"""
|
"""
|
||||||
|
# Clamp overlap_tokens to ensure the loop always advances
|
||||||
|
# If overlap_tokens >= max_tokens, the chunking loop would hang
|
||||||
|
if overlap_tokens >= max_tokens:
|
||||||
|
original_overlap = overlap_tokens
|
||||||
|
overlap_tokens = max(1, max_tokens - 1)
|
||||||
|
logger.warning(
|
||||||
|
f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
|
||||||
|
f"Clamping to {overlap_tokens} to prevent infinite loop."
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from .utils import TiktokenTokenizer
|
from .utils import TiktokenTokenizer
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -98,16 +98,3 @@ class TestOverlapValidation:
|
||||||
assert "short doc" in chunked_docs
|
assert "short doc" in chunked_docs
|
||||||
# Verify doc_indices maps correctly
|
# Verify doc_indices maps correctly
|
||||||
assert doc_indices[-1] == 1 # Last chunk is from second document
|
assert doc_indices[-1] == 1 # Last chunk is from second document
|
||||||
|
|
||||||
def test_edge_case_max_tokens_one(self):
|
|
||||||
"""Test edge case where max_tokens=1"""
|
|
||||||
documents = [" ".join([f"word{i}" for i in range(20)])]
|
|
||||||
|
|
||||||
# max_tokens=1, overlap_tokens=5 should clamp to 0
|
|
||||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
|
||||||
documents, max_tokens=1, overlap_tokens=5
|
|
||||||
)
|
|
||||||
|
|
||||||
# Should complete without hanging
|
|
||||||
assert len(chunked_docs) > 0
|
|
||||||
assert all(idx == 0 for idx in doc_indices)
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue