Add token limit validation for character-only chunking

- Add ChunkTokenLimitExceededError exception
- Validate chunks against token limits
- Include chunk preview in error messages
- Add comprehensive test coverage
- Log warnings for oversized chunks
yangdx 2025-11-19 18:32:43 +08:00
parent 5cc916861f
commit f988a22652
3 changed files with 149 additions and 1 deletion
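As a rough, illustrative sketch (not part of this commit), a caller that uses character-only chunking can now catch the new error and report the offending chunk. The call mirrors the signature exercised in tests/test_chunking.py below; the tokenizer and document_text names are placeholders:

    from lightrag.exceptions import ChunkTokenLimitExceededError
    from lightrag.operate import chunking_by_token_size

    try:
        chunks = chunking_by_token_size(
            tokenizer,        # placeholder: any configured Tokenizer instance
            document_text,    # placeholder: raw document content
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=1024,
        )
    except ChunkTokenLimitExceededError as err:
        # The exception exposes the measured size, the configured limit,
        # and a short preview of the offending chunk.
        print(err.chunk_tokens, err.chunk_token_limit, err.chunk_preview)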

lightrag/exceptions.py

@@ -106,6 +106,27 @@ class PipelineCancelledException(Exception):
         self.message = message
 
 
+class ChunkTokenLimitExceededError(ValueError):
+    """Raised when a chunk exceeds the configured token limit."""
+
+    def __init__(
+        self,
+        chunk_tokens: int,
+        chunk_token_limit: int,
+        chunk_preview: str | None = None,
+    ) -> None:
+        preview = chunk_preview.strip() if chunk_preview else None
+        truncated_preview = preview[:80] if preview else None
+        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
+        super().__init__(
+            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
+            f"{preview_note}"
+        )
+        self.chunk_tokens = chunk_tokens
+        self.chunk_token_limit = chunk_token_limit
+        self.chunk_preview = truncated_preview
+
+
 class QdrantMigrationError(Exception):
     """Raised when Qdrant data migration from legacy collections fails."""

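For reference, a minimal sketch of how the message renders when the exception is constructed directly; note that the stored preview is truncated to 80 characters even when a longer string is passed in (illustrative only, not part of the diff):

    from lightrag.exceptions import ChunkTokenLimitExceededError

    err = ChunkTokenLimitExceededError(
        chunk_tokens=150,
        chunk_token_limit=100,
        chunk_preview="x" * 120,  # caller may pass up to 120 chars; only 80 are kept
    )
    print(str(err))
    # Chunk token length 150 exceeds chunk_token_size 100. Preview: 'xxx...' (80 x's)
    print(len(err.chunk_preview))  # 80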
lightrag/operate.py

@@ -8,7 +8,10 @@ import json_repair
 from typing import Any, AsyncIterator, overload, Literal
 from collections import Counter, defaultdict
-from lightrag.exceptions import PipelineCancelledException
+from lightrag.exceptions import (
+    PipelineCancelledException,
+    ChunkTokenLimitExceededError,
+)
 from lightrag.utils import (
     logger,
     compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
         if split_by_character_only:
             for chunk in raw_chunks:
                 _tokens = tokenizer.encode(chunk)
+                if len(_tokens) > chunk_token_size:
+                    logger.warning(
+                        "Chunk split_by_character exceeds token limit: len=%d limit=%d",
+                        len(_tokens),
+                        chunk_token_size,
+                    )
+                    raise ChunkTokenLimitExceededError(
+                        chunk_tokens=len(_tokens),
+                        chunk_token_limit=chunk_token_size,
+                        chunk_preview=chunk[:120],
+                    )
                 new_chunks.append((len(_tokens), chunk))
         else:
             for chunk in raw_chunks:

tests/test_chunking.py (new file, 113 lines)

@@ -0,0 +1,113 @@
import pytest

from lightrag.exceptions import ChunkTokenLimitExceededError
from lightrag.operate import chunking_by_token_size
from lightrag.utils import Tokenizer, TokenizerInterface


class DummyTokenizer(TokenizerInterface):
    def encode(self, content: str):
        return [ord(ch) for ch in content]

    def decode(self, tokens):
        return "".join(chr(token) for token in tokens)


def make_tokenizer() -> Tokenizer:
    return Tokenizer(model_name="dummy", tokenizer=DummyTokenizer())


@pytest.mark.offline
def test_split_by_character_only_within_limit():
    """Test chunking when all chunks are within token limit."""
    tokenizer = make_tokenizer()
    chunks = chunking_by_token_size(
        tokenizer,
        "alpha\n\nbeta",
        split_by_character="\n\n",
        split_by_character_only=True,
        chunk_token_size=10,
    )
    assert [chunk["content"] for chunk in chunks] == ["alpha", "beta"]


@pytest.mark.offline
def test_split_by_character_only_exceeding_limit_raises():
    """Test that oversized chunks raise ChunkTokenLimitExceededError."""
    tokenizer = make_tokenizer()
    oversized = "a" * 12
    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            oversized,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=5,
        )
    err = excinfo.value
    assert err.chunk_tokens == len(oversized)
    assert err.chunk_token_limit == 5


@pytest.mark.offline
def test_chunk_error_includes_preview():
    """Test that error message includes chunk preview."""
    tokenizer = make_tokenizer()
    oversized = "x" * 100
    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            oversized,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=10,
        )
    err = excinfo.value
    # Preview should be first 80 chars of a 100-char string
    assert err.chunk_preview == "x" * 80
    assert "Preview:" in str(err)


@pytest.mark.offline
def test_split_by_character_only_at_exact_limit():
    """Test chunking when chunk is exactly at token limit."""
    tokenizer = make_tokenizer()
    exact_size = "a" * 10
    chunks = chunking_by_token_size(
        tokenizer,
        exact_size,
        split_by_character="\n\n",
        split_by_character_only=True,
        chunk_token_size=10,
    )
    assert len(chunks) == 1
    assert chunks[0]["content"] == exact_size
    assert chunks[0]["tokens"] == 10


@pytest.mark.offline
def test_split_by_character_only_one_over_limit():
    """Test that chunk with one token over limit raises error."""
    tokenizer = make_tokenizer()
    one_over = "a" * 11
    with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
        chunking_by_token_size(
            tokenizer,
            one_over,
            split_by_character="\n\n",
            split_by_character_only=True,
            chunk_token_size=10,
        )
    err = excinfo.value
    assert err.chunk_tokens == 11
    assert err.chunk_token_limit == 10
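These tests are tagged with the offline pytest marker, so they can be run on their own with a marker filter, e.g. pytest tests/test_chunking.py -m offline (assuming the offline marker is registered in the project's pytest configuration).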