Merge pull request #2389 from danielaskdd/fix-chunk-size

Fix: Add chunk token limit validation with detailed error reporting
Daniel.y 2025-11-19 20:34:11 +08:00 committed by GitHub
commit f72f435cef
3 changed files with 1103 additions and 1 deletion


@@ -106,6 +106,28 @@ class PipelineCancelledException(Exception):
        self.message = message


class ChunkTokenLimitExceededError(ValueError):
    """Raised when a chunk exceeds the configured token limit."""

    def __init__(
        self,
        chunk_tokens: int,
        chunk_token_limit: int,
        chunk_preview: str | None = None,
    ) -> None:
        preview = chunk_preview.strip() if chunk_preview else None
        truncated_preview = preview[:80] if preview else None
        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
        message = (
            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
            f"{preview_note}"
        )
        super().__init__(message)
        self.chunk_tokens = chunk_tokens
        self.chunk_token_limit = chunk_token_limit
        self.chunk_preview = truncated_preview


class QdrantMigrationError(Exception):
    """Raised when Qdrant data migration from legacy collections fails."""


@@ -8,7 +8,10 @@ import json_repair
from typing import Any, AsyncIterator, overload, Literal
from collections import Counter, defaultdict
from lightrag.exceptions import (
    PipelineCancelledException,
    ChunkTokenLimitExceededError,
)
from lightrag.utils import (
    logger,
    compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
    if split_by_character_only:
        for chunk in raw_chunks:
            _tokens = tokenizer.encode(chunk)
            if len(_tokens) > chunk_token_size:
                logger.warning(
                    "Chunk split_by_character exceeds token limit: len=%d limit=%d",
                    len(_tokens),
                    chunk_token_size,
                )
                raise ChunkTokenLimitExceededError(
                    chunk_tokens=len(_tokens),
                    chunk_token_limit=chunk_token_size,
                    chunk_preview=chunk[:120],
                )
            new_chunks.append((len(_tokens), chunk))
    else:
        for chunk in raw_chunks:
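The check only fires on the split_by_character_only branch, where pre-split chunks are kept as-is rather than re-split to fit the token budget. Below is a minimal, self-contained sketch of that guard pattern; validate_presplit_chunks and the whitespace "tokenizer" are illustrative stand-ins, not part of the library:

from lightrag.exceptions import ChunkTokenLimitExceededError


def validate_presplit_chunks(raw_chunks, encode, chunk_token_size):
    """Mirror of the added check: reject any pre-split chunk over the token budget."""
    new_chunks = []
    for chunk in raw_chunks:
        tokens = encode(chunk)
        if len(tokens) > chunk_token_size:
            raise ChunkTokenLimitExceededError(
                chunk_tokens=len(tokens),
                chunk_token_limit=chunk_token_size,
                chunk_preview=chunk[:120],
            )
        new_chunks.append((len(tokens), chunk))
    return new_chunks


try:
    # The second "chunk" splits into about 2000 whitespace tokens, over the 1024 budget.
    validate_presplit_chunks(["short chunk", "word " * 2000], str.split, 1024)
except ChunkTokenLimitExceededError as exc:
    print(f"rejected: {exc.chunk_tokens} tokens > limit {exc.chunk_token_limit}")

Callers that drive the character-only path should expect this exception instead of silently oversized chunks.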

tests/test_chunking.py (new file, 1066 additions)

File diff suppressed because it is too large.
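Since the test diff is suppressed, the pytest-style sketch below only illustrates the kind of assertions such a test could make about the new exception; it is not taken from tests/test_chunking.py:

import pytest

from lightrag.exceptions import ChunkTokenLimitExceededError


def test_chunk_token_limit_error_message_and_attributes():
    err = ChunkTokenLimitExceededError(
        chunk_tokens=300,
        chunk_token_limit=256,
        chunk_preview="  leading and trailing whitespace is stripped  ",
    )
    # The message embeds both counts for quick diagnosis.
    assert "Chunk token length 300" in str(err)
    assert "chunk_token_size 256" in str(err)
    # Attributes mirror the constructor arguments.
    assert err.chunk_tokens == 300
    assert err.chunk_token_limit == 256
    # The stored preview is stripped and capped at 80 characters.
    assert err.chunk_preview == "leading and trailing whitespace is stripped"
    assert len(err.chunk_preview) <= 80
    # ValueError subclassing keeps broad handlers working.
    with pytest.raises(ValueError):
        raise err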