Merge branch 'tongda/main'

yangdx 2025-11-13 12:56:28 +08:00
commit 8765974467

lightrag/lightrag.py

@@ -3,6 +3,7 @@ from __future__ import annotations
 import traceback
 import asyncio
 import configparser
+import inspect
 import os
 import time
 import warnings
@@ -12,6 +13,7 @@ from functools import partial
 from typing import (
     Any,
     AsyncIterator,
+    Awaitable,
     Callable,
     Iterator,
     cast,
@@ -20,6 +22,7 @@ from typing import (
     Optional,
     List,
     Dict,
+    Union,
 )
 from lightrag.prompt import PROMPTS
 from lightrag.exceptions import PipelineCancelledException
@@ -243,11 +246,13 @@ class LightRAG:
             int,
             int,
         ],
-        List[Dict[str, Any]],
+        Union[List[Dict[str, Any]], Awaitable[List[Dict[str, Any]]]],
     ] = field(default_factory=lambda: chunking_by_token_size)
     """
     Custom chunking function for splitting text into chunks before processing.
+    The function can be either synchronous or asynchronous.
+
     The function should take the following parameters:
     - `tokenizer`: A Tokenizer instance to use for tokenization.
@@ -257,7 +262,8 @@ class LightRAG:
     - `chunk_token_size`: The maximum number of tokens per chunk.
     - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.

-    The function should return a list of dictionaries, where each dictionary contains the following keys:
+    The function should return a list of dictionaries (or an awaitable that resolves to a list),
+    where each dictionary contains the following keys:
     - `tokens`: The number of tokens in the chunk.
     - `content`: The text content of the chunk.
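As a reference for this contract, here is a minimal sketch of an asynchronous implementation. It assumes the Tokenizer exposes `encode`/`decode` methods and ignores `split_by_character` for brevity; `naive_async_chunking` is a hypothetical name, not part of the library.

```python
from typing import Any, Dict, List, Optional

async def naive_async_chunking(
    tokenizer,                       # Tokenizer instance (encode/decode assumed)
    content: str,
    split_by_character: Optional[str],
    split_by_character_only: bool,
    chunk_overlap_token_size: int,
    chunk_token_size: int,
) -> List[Dict[str, Any]]:
    # Encode once, then slice the token stream into overlapping windows.
    tokens = tokenizer.encode(content)
    step = max(1, chunk_token_size - chunk_overlap_token_size)
    results: List[Dict[str, Any]] = []
    for start in range(0, len(tokens), step):
        window = tokens[start : start + chunk_token_size]
        results.append(
            {
                "tokens": len(window),                # required key
                "content": tokenizer.decode(window),  # required key
            }
        )
    return results
```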
@@ -1756,7 +1762,28 @@ class LightRAG:
                 )
                 content = content_data["content"]

-                # Generate chunks from document
+                # Call chunking function, supporting both sync and async implementations
+                chunking_result = self.chunking_func(
+                    self.tokenizer,
+                    content,
+                    split_by_character,
+                    split_by_character_only,
+                    self.chunk_overlap_token_size,
+                    self.chunk_token_size,
+                )
+
+                # If result is awaitable, await to get actual result
+                if inspect.isawaitable(chunking_result):
+                    chunking_result = await chunking_result
+
+                # Validate return type
+                if not isinstance(chunking_result, (list, tuple)):
+                    raise TypeError(
+                        f"chunking_func must return a list or tuple of dicts, "
+                        f"got {type(chunking_result)}"
+                    )
+
+                # Build chunks dictionary
                 chunks: dict[str, Any] = {
                     compute_mdhash_id(dp["content"], prefix="chunk-"): {
                         **dp,
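The `inspect.isawaitable` check is the standard way to drive sync and async callables through one call site: call first, then await only if the call returned an awaitable. A self-contained sketch of the same dispatch pattern:

```python
import asyncio
import inspect
from typing import Awaitable, Callable, Union

MaybeAsync = Callable[[str], Union[int, Awaitable[int]]]

async def call_maybe_async(func: MaybeAsync, arg: str) -> int:
    result = func(arg)             # plain function: the value itself
    if inspect.isawaitable(result):
        result = await result      # coroutine function: await the coroutine
    return result

def sync_len(s: str) -> int:
    return len(s)

async def async_len(s: str) -> int:
    await asyncio.sleep(0)  # stand-in for real async work
    return len(s)

async def main() -> None:
    assert await call_maybe_async(sync_len, "abc") == 3
    assert await call_maybe_async(async_len, "abcd") == 4

asyncio.run(main())
```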
@@ -1764,14 +1791,7 @@ class LightRAG:
                         "file_path": file_path,  # Add file path to each chunk
                         "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
                     }
-                    for dp in self.chunking_func(
-                        self.tokenizer,
-                        content,
-                        split_by_character,
-                        split_by_character_only,
-                        self.chunk_overlap_token_size,
-                        self.chunk_token_size,
-                    )
+                    for dp in chunking_result
                 }

                 if not chunks:
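With this change an async chunker can be passed straight into the constructor. A hedged usage sketch, reusing the hypothetical `naive_async_chunking` above and omitting the LLM/embedding configuration that a real LightRAG instance requires:

```python
from lightrag import LightRAG

rag = LightRAG(
    working_dir="./rag_storage",
    chunking_func=naive_async_chunking,  # async; awaited internally
    # llm_model_func=..., embedding_func=...  (required in real use)
)
# During ingestion, chunking_func is invoked and its result awaited if needed:
# await rag.ainsert("some long document text")
```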