Add token limit validation for character-only chunking
- Add ChunkTokenLimitExceededError exception
- Validate chunks against token limits
- Include chunk preview in error messages
- Add comprehensive test coverage
- Log warnings for oversized chunks
This commit is contained in:
parent
5cc916861f
commit
f988a22652
3 changed files with 149 additions and 1 deletions
|
|
@ -106,6 +106,27 @@ class PipelineCancelledException(Exception):
|
||||||
self.message = message
|
self.message = message
|
||||||
|
|
||||||
|
|
||||||
|
class ChunkTokenLimitExceededError(ValueError):
|
||||||
|
"""Raised when a chunk exceeds the configured token limit."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
chunk_tokens: int,
|
||||||
|
chunk_token_limit: int,
|
||||||
|
chunk_preview: str | None = None,
|
||||||
|
) -> None:
|
||||||
|
preview = chunk_preview.strip() if chunk_preview else None
|
||||||
|
truncated_preview = preview[:80] if preview else None
|
||||||
|
preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
|
||||||
|
super().__init__(
|
||||||
|
f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
|
||||||
|
f"{preview_note}"
|
||||||
|
)
|
||||||
|
self.chunk_tokens = chunk_tokens
|
||||||
|
self.chunk_token_limit = chunk_token_limit
|
||||||
|
self.chunk_preview = truncated_preview
|
||||||
|
|
||||||
|
|
||||||
class QdrantMigrationError(Exception):
    """Raised when Qdrant data migration from legacy collections fails."""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,10 @@ import json_repair
|
||||||
from typing import Any, AsyncIterator, overload, Literal
|
from typing import Any, AsyncIterator, overload, Literal
|
||||||
from collections import Counter, defaultdict
|
from collections import Counter, defaultdict
|
||||||
|
|
||||||
from lightrag.exceptions import PipelineCancelledException
|
from lightrag.exceptions import (
|
||||||
|
PipelineCancelledException,
|
||||||
|
ChunkTokenLimitExceededError,
|
||||||
|
)
|
||||||
from lightrag.utils import (
|
from lightrag.utils import (
|
||||||
logger,
|
logger,
|
||||||
compute_mdhash_id,
|
compute_mdhash_id,
|
||||||
|
|
@ -109,6 +112,17 @@ def chunking_by_token_size(
|
||||||
if split_by_character_only:
|
if split_by_character_only:
|
||||||
for chunk in raw_chunks:
|
for chunk in raw_chunks:
|
||||||
_tokens = tokenizer.encode(chunk)
|
_tokens = tokenizer.encode(chunk)
|
||||||
|
if len(_tokens) > chunk_token_size:
|
||||||
|
logger.warning(
|
||||||
|
"Chunk split_by_character exceeds token limit: len=%d limit=%d",
|
||||||
|
len(_tokens),
|
||||||
|
chunk_token_size,
|
||||||
|
)
|
||||||
|
raise ChunkTokenLimitExceededError(
|
||||||
|
chunk_tokens=len(_tokens),
|
||||||
|
chunk_token_limit=chunk_token_size,
|
||||||
|
chunk_preview=chunk[:120],
|
||||||
|
)
|
||||||
new_chunks.append((len(_tokens), chunk))
|
new_chunks.append((len(_tokens), chunk))
|
||||||
else:
|
else:
|
||||||
for chunk in raw_chunks:
|
for chunk in raw_chunks:
|
||||||
|
|
|
||||||
113
tests/test_chunking.py
Normal file
113
tests/test_chunking.py
Normal file
|
|
@ -0,0 +1,113 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from lightrag.exceptions import ChunkTokenLimitExceededError
|
||||||
|
from lightrag.operate import chunking_by_token_size
|
||||||
|
from lightrag.utils import Tokenizer, TokenizerInterface
|
||||||
|
|
||||||
|
|
||||||
|
class DummyTokenizer(TokenizerInterface):
    """Character-level tokenizer: one token per character (its code point)."""

    def encode(self, content: str):
        # Map each character to its Unicode code point.
        return list(map(ord, content))

    def decode(self, tokens):
        # Inverse of encode: turn code points back into characters.
        return "".join(map(chr, tokens))
||||||
|
|
||||||
|
|
||||||
|
def make_tokenizer() -> Tokenizer:
    """Build a Tokenizer backed by the character-level DummyTokenizer."""
    dummy = DummyTokenizer()
    return Tokenizer(model_name="dummy", tokenizer=dummy)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.offline
def test_split_by_character_only_within_limit():
    """All character-delimited chunks fit inside the token limit."""
    tokenizer = make_tokenizer()

    result = chunking_by_token_size(
        tokenizer,
        "alpha\n\nbeta",
        split_by_character="\n\n",
        split_by_character_only=True,
        chunk_token_size=10,
    )

    contents = [piece["content"] for piece in result]
    assert contents == ["alpha", "beta"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.offline
def test_split_by_character_only_exceeding_limit_raises():
    """An oversized chunk triggers ChunkTokenLimitExceededError."""
    tokenizer = make_tokenizer()
    too_long = "a" * 12

    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            too_long,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=5,
        )

    error = excinfo.value
    assert error.chunk_tokens == len(too_long)
    assert error.chunk_token_limit == 5
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.offline
def test_chunk_error_includes_preview():
    """The raised error carries a truncated preview of the chunk."""
    tokenizer = make_tokenizer()
    payload = "x" * 100

    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            payload,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=10,
        )

    error = excinfo.value
    # The preview should be capped at the first 80 of the 100 characters.
    assert error.chunk_preview == "x" * 80
    assert "Preview:" in str(error)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.offline
def test_split_by_character_only_at_exact_limit():
    """A chunk exactly at the token limit is accepted, not rejected."""
    tokenizer = make_tokenizer()
    boundary_text = "a" * 10

    result = chunking_by_token_size(
        tokenizer,
        boundary_text,
        split_by_character="\n\n",
        split_by_character_only=True,
        chunk_token_size=10,
    )

    assert len(result) == 1
    assert result[0]["content"] == boundary_text
    assert result[0]["tokens"] == 10
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.offline
def test_split_by_character_only_one_over_limit():
    """A single token past the limit is enough to raise."""
    tokenizer = make_tokenizer()
    barely_over = "a" * 11

    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            barely_over,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=10,
        )

    error = excinfo.value
    assert error.chunk_tokens == 11
    assert error.chunk_token_limit == 10
|
||||||
Loading…
Add table
Reference in a new issue