Improve edge case handling for max_tokens=1
Co-authored-by: netbrah <162479981+netbrah@users.noreply.github.com>
This commit is contained in:
parent
1d6ea0c5f7
commit
8835fc244a
2 changed files with 16 additions and 1 deletion
|
|
@@ -43,7 +43,9 @@ def chunk_documents_for_rerank(
|
|||
# If overlap_tokens >= max_tokens, the chunking loop would hang
|
||||
if overlap_tokens >= max_tokens:
|
||||
original_overlap = overlap_tokens
|
||||
overlap_tokens = max(1, max_tokens - 1)
|
||||
# Ensure overlap is at least 1 token less than max to guarantee progress
|
||||
# For very small max_tokens (e.g., 1), set overlap to 0
|
||||
overlap_tokens = max(0, max_tokens - 1)
|
||||
logger.warning(
|
||||
f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
|
||||
f"Clamping to {overlap_tokens} to prevent infinite loop."
|
||||
|
|
|
|||
|
|
@@ -98,3 +98,16 @@ class TestOverlapValidation:
|
|||
assert "short doc" in chunked_docs
|
||||
# Verify doc_indices maps correctly
|
||||
assert doc_indices[-1] == 1 # Last chunk is from second document
|
||||
|
||||
def test_edge_case_max_tokens_one(self):
|
||||
"""Test edge case where max_tokens=1"""
|
||||
documents = [" ".join([f"word{i}" for i in range(20)])]
|
||||
|
||||
# max_tokens=1, overlap_tokens=5 should clamp to 0
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=1, overlap_tokens=5
|
||||
)
|
||||
|
||||
# Should complete without hanging
|
||||
assert len(chunked_docs) > 0
|
||||
assert all(idx == 0 for idx in doc_indices)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue