Merge pull request #2389 from danielaskdd/fix-chunk-size

Fix: Add chunk token limit validation with detailed error reporting
Daniel.y 2025-11-19 20:34:11 +08:00 committed by GitHub
commit f72f435cef
3 changed files with 1103 additions and 1 deletion


@@ -106,6 +106,28 @@ class PipelineCancelledException(Exception):
        self.message = message


class ChunkTokenLimitExceededError(ValueError):
    """Raised when a chunk exceeds the configured token limit."""

    def __init__(
        self,
        chunk_tokens: int,
        chunk_token_limit: int,
        chunk_preview: str | None = None,
    ) -> None:
        preview = chunk_preview.strip() if chunk_preview else None
        truncated_preview = preview[:80] if preview else None
        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
        message = (
            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
            f"{preview_note}"
        )
        super().__init__(message)
        self.chunk_tokens = chunk_tokens
        self.chunk_token_limit = chunk_token_limit
        self.chunk_preview = truncated_preview


class QdrantMigrationError(Exception):
    """Raised when Qdrant data migration from legacy collections fails."""


@@ -8,7 +8,10 @@ import json_repair
from typing import Any, AsyncIterator, overload, Literal
from collections import Counter, defaultdict
from lightrag.exceptions import (
    PipelineCancelledException,
    ChunkTokenLimitExceededError,
)
from lightrag.utils import (
    logger,
    compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
    if split_by_character_only:
        for chunk in raw_chunks:
            _tokens = tokenizer.encode(chunk)
            if len(_tokens) > chunk_token_size:
                logger.warning(
                    "Chunk split_by_character exceeds token limit: len=%d limit=%d",
                    len(_tokens),
                    chunk_token_size,
                )
                raise ChunkTokenLimitExceededError(
                    chunk_tokens=len(_tokens),
                    chunk_token_limit=chunk_token_size,
                    chunk_preview=chunk[:120],
                )
            new_chunks.append((len(_tokens), chunk))
    else:
        for chunk in raw_chunks:
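The check only fires on the split_by_character_only branch, where pre-split chunks are kept as-is rather than re-split to fit the token budget. Below is a minimal, self-contained sketch of that guard pattern; validate_presplit_chunks and the whitespace "tokenizer" are illustrative stand-ins, not part of the library:

from lightrag.exceptions import ChunkTokenLimitExceededError


def validate_presplit_chunks(raw_chunks, encode, chunk_token_size):
    """Mirror of the added check: reject any pre-split chunk over the token budget."""
    new_chunks = []
    for chunk in raw_chunks:
        tokens = encode(chunk)
        if len(tokens) > chunk_token_size:
            raise ChunkTokenLimitExceededError(
                chunk_tokens=len(tokens),
                chunk_token_limit=chunk_token_size,
                chunk_preview=chunk[:120],
            )
        new_chunks.append((len(tokens), chunk))
    return new_chunks


try:
    # The second "chunk" splits into about 2000 whitespace tokens, over the 1024 budget.
    validate_presplit_chunks(["short chunk", "word " * 2000], str.split, 1024)
except ChunkTokenLimitExceededError as exc:
    print(f"rejected: {exc.chunk_tokens} tokens > limit {exc.chunk_token_limit}")

Callers that drive the character-only path should expect this exception instead of silently oversized chunks.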

tests/test_chunking.py (new file, 1066 additions)

File diff suppressed because it is too large.
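Since the test diff is suppressed, the pytest-style sketch below only illustrates the kind of assertions such a test could make about the new exception; it is not taken from tests/test_chunking.py:

import pytest

from lightrag.exceptions import ChunkTokenLimitExceededError


def test_chunk_token_limit_error_message_and_attributes():
    err = ChunkTokenLimitExceededError(
        chunk_tokens=300,
        chunk_token_limit=256,
        chunk_preview="  leading and trailing whitespace is stripped  ",
    )
    # The message embeds both counts for quick diagnosis.
    assert "Chunk token length 300" in str(err)
    assert "chunk_token_size 256" in str(err)
    # Attributes mirror the constructor arguments.
    assert err.chunk_tokens == 300
    assert err.chunk_token_limit == 256
    # The stored preview is stripped and capped at 80 characters.
    assert err.chunk_preview == "leading and trailing whitespace is stripped"
    assert len(err.chunk_preview) <= 80
    # ValueError subclassing keeps broad handlers working.
    with pytest.raises(ValueError):
        raise err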