Merge pull request #2389 from danielaskdd/fix-chunk-size

Fix: Add chunk token limit validation with detailed error reporting
Daniel.y 2025-11-19 20:34:11 +08:00 committed by GitHub
commit f72f435cef
3 changed files with 1103 additions and 1 deletion

@@ -106,6 +106,28 @@ class PipelineCancelledException(Exception):
         self.message = message
+
+
+class ChunkTokenLimitExceededError(ValueError):
+    """Raised when a chunk exceeds the configured token limit."""
+
+    def __init__(
+        self,
+        chunk_tokens: int,
+        chunk_token_limit: int,
+        chunk_preview: str | None = None,
+    ) -> None:
+        preview = chunk_preview.strip() if chunk_preview else None
+        truncated_preview = preview[:80] if preview else None
+        preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
+        message = (
+            f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
+            f"{preview_note}"
+        )
+        super().__init__(message)
+        self.chunk_tokens = chunk_tokens
+        self.chunk_token_limit = chunk_token_limit
+        self.chunk_preview = truncated_preview
+
+
 class QdrantMigrationError(Exception):
     """Raised when Qdrant data migration from legacy collections fails."""

@@ -8,7 +8,10 @@ import json_repair
 from typing import Any, AsyncIterator, overload, Literal
 from collections import Counter, defaultdict
-from lightrag.exceptions import PipelineCancelledException
+from lightrag.exceptions import (
+    PipelineCancelledException,
+    ChunkTokenLimitExceededError,
+)
 from lightrag.utils import (
     logger,
     compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
     if split_by_character_only:
         for chunk in raw_chunks:
             _tokens = tokenizer.encode(chunk)
+            if len(_tokens) > chunk_token_size:
+                logger.warning(
+                    "Chunk split_by_character exceeds token limit: len=%d limit=%d",
+                    len(_tokens),
+                    chunk_token_size,
+                )
+                raise ChunkTokenLimitExceededError(
+                    chunk_tokens=len(_tokens),
+                    chunk_token_limit=chunk_token_size,
+                    chunk_preview=chunk[:120],
+                )
             new_chunks.append((len(_tokens), chunk))
     else:
         for chunk in raw_chunks:

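The guard rejects any character-split chunk that cannot fit the token budget instead of silently passing it downstream. A self-contained sketch of that behavior, using a trivial whitespace tokenizer in place of the real tokenizer.encode (the helper name validate_chunks and the toy tokenizer are assumptions of this sketch, not part of the PR; the full signature of chunking_by_token_size is not shown in this diff):

```python
from lightrag.exceptions import ChunkTokenLimitExceededError

def validate_chunks(raw_chunks, encode, chunk_token_size):
    """Mirror of the character-only validation path added above:
    every chunk is tokenized up front and rejected outright if it
    exceeds the limit. `encode` stands in for tokenizer.encode."""
    new_chunks = []
    for chunk in raw_chunks:
        tokens = encode(chunk)
        if len(tokens) > chunk_token_size:
            raise ChunkTokenLimitExceededError(
                chunk_tokens=len(tokens),
                chunk_token_limit=chunk_token_size,
                chunk_preview=chunk[:120],
            )
        new_chunks.append((len(tokens), chunk))
    return new_chunks

# A toy whitespace "tokenizer" keeps the example runnable without a model.
validate_chunks(["short chunk"], str.split, chunk_token_size=8)  # passes

try:
    validate_chunks(["word " * 20], str.split, chunk_token_size=8)
except ChunkTokenLimitExceededError as exc:
    print(exc.chunk_tokens, exc.chunk_token_limit)  # 20 8
```

Note that failing fast only makes sense on the split_by_character_only path: there is no fallback re-splitting there, so an oversized chunk would otherwise flow through unchecked.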
tests/test_chunking.py (new file, 1066 additions)

File diff suppressed because it is too large.
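
The suppressed test file is not reproduced here. As a rough sketch only, a pytest case for the new error could look like the following; the test name and values are hypothetical, and only the class and its fields come from the diff above:

```python
from lightrag.exceptions import ChunkTokenLimitExceededError

def test_preview_is_stripped_and_truncated():
    # Hypothetical case: the constructor strips the preview and keeps
    # at most 80 characters, per the implementation shown above.
    err = ChunkTokenLimitExceededError(
        chunk_tokens=2000,
        chunk_token_limit=1024,
        chunk_preview="  " + "x" * 200,
    )
    assert err.chunk_tokens == 2000
    assert err.chunk_token_limit == 1024
    assert len(err.chunk_preview) == 80
    assert "exceeds chunk_token_size 1024" in str(err)
```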