diff --git a/lightrag/exceptions.py b/lightrag/exceptions.py
index e6a616cd..64c7ea3e 100644
--- a/lightrag/exceptions.py
+++ b/lightrag/exceptions.py
@@ -106,6 +106,27 @@ class PipelineCancelledException(Exception):
         self.message = message
 
 
+class ChunkTokenLimitExceededError(ValueError):
+    """Raised when a chunk exceeds the configured token limit."""
+
+    def __init__(
+        self,
+        chunk_tokens: int,
+        chunk_token_limit: int,
+        chunk_preview: str | None = None,
+    ) -> None:
+        preview = chunk_preview.strip() if chunk_preview else None
+        truncated_preview = preview[:80] if preview else None
+        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
+        super().__init__(
+            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
+            f"{preview_note}"
+        )
+        self.chunk_tokens = chunk_tokens
+        self.chunk_token_limit = chunk_token_limit
+        self.chunk_preview = truncated_preview
+
+
 class QdrantMigrationError(Exception):
     """Raised when Qdrant data migration from legacy collections fails."""
 
diff --git a/lightrag/operate.py b/lightrag/operate.py
index f226566b..5f824af0 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -8,7 +8,10 @@ import json_repair
 from typing import Any, AsyncIterator, overload, Literal
 from collections import Counter, defaultdict
 
-from lightrag.exceptions import PipelineCancelledException
+from lightrag.exceptions import (
+    PipelineCancelledException,
+    ChunkTokenLimitExceededError,
+)
 from lightrag.utils import (
     logger,
     compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
         if split_by_character_only:
             for chunk in raw_chunks:
                 _tokens = tokenizer.encode(chunk)
+                if len(_tokens) > chunk_token_size:
+                    logger.warning(
+                        "Chunk split_by_character exceeds token limit: len=%d limit=%d",
+                        len(_tokens),
+                        chunk_token_size,
+                    )
+                    raise ChunkTokenLimitExceededError(
+                        chunk_tokens=len(_tokens),
+                        chunk_token_limit=chunk_token_size,
+                        chunk_preview=chunk[:120],
+                    )
                 new_chunks.append((len(_tokens), chunk))
         else:
             for chunk in raw_chunks:
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
new file mode 100644
index 00000000..0650adc2
--- /dev/null
+++ b/tests/test_chunking.py
@@ -0,0 +1,113 @@
+import pytest
+
+from lightrag.exceptions import ChunkTokenLimitExceededError
+from lightrag.operate import chunking_by_token_size
+from lightrag.utils import Tokenizer, TokenizerInterface
+
+
+class DummyTokenizer(TokenizerInterface):
+    def encode(self, content: str):
+        return [ord(ch) for ch in content]
+
+    def decode(self, tokens):
+        return "".join(chr(token) for token in tokens)
+
+
+def make_tokenizer() -> Tokenizer:
+    return Tokenizer(model_name="dummy", tokenizer=DummyTokenizer())
+
+
+@pytest.mark.offline
+def test_split_by_character_only_within_limit():
+    """Test chunking when all chunks are within token limit."""
+    tokenizer = make_tokenizer()
+
+    chunks = chunking_by_token_size(
+        tokenizer,
+        "alpha\n\nbeta",
+        split_by_character="\n\n",
+        split_by_character_only=True,
+        chunk_token_size=10,
+    )
+
+    assert [chunk["content"] for chunk in chunks] == ["alpha", "beta"]
+
+
+@pytest.mark.offline
+def test_split_by_character_only_exceeding_limit_raises():
+    """Test that oversized chunks raise ChunkTokenLimitExceededError."""
+    tokenizer = make_tokenizer()
+    oversized = "a" * 12
+
+    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
+        chunking_by_token_size(
+            tokenizer,
+            oversized,
+            split_by_character="\n\n",
+            split_by_character_only=True,
+            chunk_token_size=5,
+        )
+
+    err = excinfo.value
+    assert err.chunk_tokens == len(oversized)
+    assert err.chunk_token_limit == 5
+
+
+@pytest.mark.offline
+def test_chunk_error_includes_preview():
+    """Test that error message includes chunk preview."""
+    tokenizer = make_tokenizer()
+    oversized = "x" * 100
+
+    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
+        chunking_by_token_size(
+            tokenizer,
+            oversized,
+            split_by_character="\n\n",
+            split_by_character_only=True,
+            chunk_token_size=10,
+        )
+
+    err = excinfo.value
+    # Preview should be the first 80 chars of a 100-char string
+    assert err.chunk_preview == "x" * 80
+    assert "Preview:" in str(err)
+
+
+@pytest.mark.offline
+def test_split_by_character_only_at_exact_limit():
+    """Test chunking when chunk is exactly at token limit."""
+    tokenizer = make_tokenizer()
+    exact_size = "a" * 10
+
+    chunks = chunking_by_token_size(
+        tokenizer,
+        exact_size,
+        split_by_character="\n\n",
+        split_by_character_only=True,
+        chunk_token_size=10,
+    )
+
+    assert len(chunks) == 1
+    assert chunks[0]["content"] == exact_size
+    assert chunks[0]["tokens"] == 10
+
+
+@pytest.mark.offline
+def test_split_by_character_only_one_over_limit():
+    """Test that a chunk one token over the limit raises an error."""
+    tokenizer = make_tokenizer()
+    one_over = "a" * 11
+
+    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
+        chunking_by_token_size(
+            tokenizer,
+            one_over,
+            split_by_character="\n\n",
+            split_by_character_only=True,
+            chunk_token_size=10,
+        )
+
+    err = excinfo.value
+    assert err.chunk_tokens == 11
+    assert err.chunk_token_limit == 10
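
Reviewer note: a minimal usage sketch, not part of the patch, of how a caller might handle the new error when invoking `chunking_by_token_size` directly. The per-character tokenizer mirrors the helper in tests/test_chunking.py; the input text and the 5-token limit are artificial, and a real caller would pass the `Tokenizer` configured on its LightRAG instance.

```python
from lightrag.exceptions import ChunkTokenLimitExceededError
from lightrag.operate import chunking_by_token_size
from lightrag.utils import Tokenizer, TokenizerInterface


class DummyTokenizer(TokenizerInterface):
    """One token per character, matching the helper in tests/test_chunking.py."""

    def encode(self, content: str):
        return [ord(ch) for ch in content]

    def decode(self, tokens):
        return "".join(chr(token) for token in tokens)


tokenizer = Tokenizer(model_name="dummy", tokenizer=DummyTokenizer())

try:
    chunking_by_token_size(
        tokenizer,
        "a paragraph far longer than five tokens",  # artificial oversized input
        split_by_character="\n\n",
        split_by_character_only=True,  # the only code path that raises in this patch
        chunk_token_size=5,            # artificially small limit for illustration
    )
except ChunkTokenLimitExceededError as exc:
    # Structured fields set by the new exception class
    print(f"chunk has {exc.chunk_tokens} tokens, limit is {exc.chunk_token_limit}")
    print(f"preview: {exc.chunk_preview!r}")
```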