Add token limit validation for character-only chunking

- Add ChunkTokenLimitExceededError exception
- Validate chunks against token limits
- Include chunk preview in error messages
- Add comprehensive test coverage
- Log warnings for oversized chunks
yangdx 2025-11-19 18:32:43 +08:00
parent 5cc916861f
commit f988a22652
3 changed files with 149 additions and 1 deletions


@@ -106,6 +106,27 @@ class PipelineCancelledException(Exception):
        self.message = message


class ChunkTokenLimitExceededError(ValueError):
    """Raised when a chunk exceeds the configured token limit."""

    def __init__(
        self,
        chunk_tokens: int,
        chunk_token_limit: int,
        chunk_preview: str | None = None,
    ) -> None:
        preview = chunk_preview.strip() if chunk_preview else None
        truncated_preview = preview[:80] if preview else None
        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
        super().__init__(
            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
            f"{preview_note}"
        )
        self.chunk_tokens = chunk_tokens
        self.chunk_token_limit = chunk_token_limit
        self.chunk_preview = truncated_preview


class QdrantMigrationError(Exception):
    """Raised when Qdrant data migration from legacy collections fails."""


@@ -8,7 +8,10 @@ import json_repair
from typing import Any, AsyncIterator, overload, Literal
from collections import Counter, defaultdict
from lightrag.exceptions import (
    PipelineCancelledException,
    ChunkTokenLimitExceededError,
)
from lightrag.utils import (
    logger,
    compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
    if split_by_character_only:
        for chunk in raw_chunks:
            _tokens = tokenizer.encode(chunk)
            if len(_tokens) > chunk_token_size:
                logger.warning(
                    "Chunk split_by_character exceeds token limit: len=%d limit=%d",
                    len(_tokens),
                    chunk_token_size,
                )
                raise ChunkTokenLimitExceededError(
                    chunk_tokens=len(_tokens),
                    chunk_token_limit=chunk_token_size,
                    chunk_preview=chunk[:120],
                )
            new_chunks.append((len(_tokens), chunk))
    else:
        for chunk in raw_chunks:
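
A hedged sketch of how a caller might use the strict character-only mode, falling back to token-based splitting when a paragraph is too long. chunk_or_fallback is a hypothetical helper; the chunking_by_token_size call signature follows the tests below:

from lightrag.exceptions import ChunkTokenLimitExceededError
from lightrag.operate import chunking_by_token_size

def chunk_or_fallback(tokenizer, content, limit):
    # Hypothetical helper: strict character-only chunking with a token-size fallback.
    try:
        return chunking_by_token_size(
            tokenizer,
            content,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=limit,
        )
    except ChunkTokenLimitExceededError as err:
        # A single paragraph exceeded the limit; err.chunk_preview shows where.
        # Re-chunk by token size instead of failing the whole document.
        return chunking_by_token_size(
            tokenizer,
            content,
            split_by_character="\n\n",
            split_by_character_only=False,
            chunk_token_size=limit,
        )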

tests/test_chunking.py (new file, 113 lines)

@@ -0,0 +1,113 @@
import pytest

from lightrag.exceptions import ChunkTokenLimitExceededError
from lightrag.operate import chunking_by_token_size
from lightrag.utils import Tokenizer, TokenizerInterface


class DummyTokenizer(TokenizerInterface):
    def encode(self, content: str):
        return [ord(ch) for ch in content]

    def decode(self, tokens):
        return "".join(chr(token) for token in tokens)


def make_tokenizer() -> Tokenizer:
    return Tokenizer(model_name="dummy", tokenizer=DummyTokenizer())


@pytest.mark.offline
def test_split_by_character_only_within_limit():
    """Test chunking when all chunks are within token limit."""
    tokenizer = make_tokenizer()
    chunks = chunking_by_token_size(
        tokenizer,
        "alpha\n\nbeta",
        split_by_character="\n\n",
        split_by_character_only=True,
        chunk_token_size=10,
    )
    assert [chunk["content"] for chunk in chunks] == ["alpha", "beta"]


@pytest.mark.offline
def test_split_by_character_only_exceeding_limit_raises():
    """Test that oversized chunks raise ChunkTokenLimitExceededError."""
    tokenizer = make_tokenizer()
    oversized = "a" * 12
    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            oversized,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=5,
        )
    err = excinfo.value
    assert err.chunk_tokens == len(oversized)
    assert err.chunk_token_limit == 5


@pytest.mark.offline
def test_chunk_error_includes_preview():
    """Test that error message includes chunk preview."""
    tokenizer = make_tokenizer()
    oversized = "x" * 100
    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            oversized,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=10,
        )
    err = excinfo.value
    # Preview should be first 80 chars of a 100-char string
    assert err.chunk_preview == "x" * 80
    assert "Preview:" in str(err)


@pytest.mark.offline
def test_split_by_character_only_at_exact_limit():
    """Test chunking when chunk is exactly at token limit."""
    tokenizer = make_tokenizer()
    exact_size = "a" * 10
    chunks = chunking_by_token_size(
        tokenizer,
        exact_size,
        split_by_character="\n\n",
        split_by_character_only=True,
        chunk_token_size=10,
    )
    assert len(chunks) == 1
    assert chunks[0]["content"] == exact_size
    assert chunks[0]["tokens"] == 10


@pytest.mark.offline
def test_split_by_character_only_one_over_limit():
    """Test that chunk with one token over limit raises error."""
    tokenizer = make_tokenizer()
    one_over = "a" * 11
    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            one_over,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=10,
        )
    err = excinfo.value
    assert err.chunk_tokens == 11
    assert err.chunk_token_limit == 10
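
The DummyTokenizer maps one character to one token, so token counts equal string lengths and the limit arithmetic in these tests can be checked by eye. Assuming a standard pytest setup that registers the offline marker, the suite should run with something like:

pytest tests/test_chunking.py -m offline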