Merge branch 'tongda/main'
commit 8765974467
1 changed file with 31 additions and 11 deletions
@@ -3,6 +3,7 @@ from __future__ import annotations
 import traceback
 import asyncio
 import configparser
+import inspect
 import os
 import time
 import warnings
@@ -12,6 +13,7 @@ from functools import partial
 from typing import (
     Any,
     AsyncIterator,
+    Awaitable,
     Callable,
     Iterator,
     cast,
@@ -20,6 +22,7 @@ from typing import (
     Optional,
     List,
     Dict,
+    Union,
 )
 from lightrag.prompt import PROMPTS
 from lightrag.exceptions import PipelineCancelledException
@@ -243,11 +246,13 @@ class LightRAG:
             int,
             int,
         ],
-        List[Dict[str, Any]],
+        Union[List[Dict[str, Any]], Awaitable[List[Dict[str, Any]]]],
     ] = field(default_factory=lambda: chunking_by_token_size)
     """
     Custom chunking function for splitting text into chunks before processing.

+    The function can be either synchronous or asynchronous.
+
     The function should take the following parameters:

     - `tokenizer`: A Tokenizer instance to use for tokenization.
@@ -257,7 +262,8 @@ class LightRAG:
     - `chunk_token_size`: The maximum number of tokens per chunk.
     - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.

-    The function should return a list of dictionaries, where each dictionary contains the following keys:
+    The function should return a list of dictionaries (or an awaitable that resolves to a list),
+    where each dictionary contains the following keys:
     - `tokens`: The number of tokens in the chunk.
     - `content`: The text content of the chunk.

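For reference, here is a minimal sketch of a custom asynchronous chunking function that satisfies the contract described in this docstring. The splitting logic is deliberately naive (a plain character-budget split, not LightRAG's built-in chunking_by_token_size), the parameter order follows the docstring above, and the tokenizer.encode() call is an assumption about the Tokenizer API:

    import asyncio
    from typing import Any, Dict, List

    async def naive_async_chunking(
        tokenizer,                       # Tokenizer instance (assumed to expose .encode())
        content: str,                    # full document text to split
        split_by_character: str | None,  # ignored in this sketch
        split_by_character_only: bool,   # ignored in this sketch
        chunk_overlap_token_size: int,   # ignored in this sketch
        chunk_token_size: int,           # used here as a rough character budget
    ) -> List[Dict[str, Any]]:
        await asyncio.sleep(0)  # stand-in for real async work (e.g., a remote splitter)
        return [
            {
                "tokens": len(tokenizer.encode(piece)),  # assumed Tokenizer API
                "content": piece,
            }
            for piece in (
                content[i : i + chunk_token_size]
                for i in range(0, len(content), chunk_token_size)
            )
        ]

Because the field type is now Union[List[...], Awaitable[List[...]]], a plain synchronous function with the same signature remains equally valid.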
@@ -1756,7 +1762,28 @@ class LightRAG:
                 )
                 content = content_data["content"]

-                # Generate chunks from document
+                # Call chunking function, supporting both sync and async implementations
+                chunking_result = self.chunking_func(
+                    self.tokenizer,
+                    content,
+                    split_by_character,
+                    split_by_character_only,
+                    self.chunk_overlap_token_size,
+                    self.chunk_token_size,
+                )
+
+                # If result is awaitable, await to get actual result
+                if inspect.isawaitable(chunking_result):
+                    chunking_result = await chunking_result
+
+                # Validate return type
+                if not isinstance(chunking_result, (list, tuple)):
+                    raise TypeError(
+                        f"chunking_func must return a list or tuple of dicts, "
+                        f"got {type(chunking_result)}"
+                    )
+
+                # Build chunks dictionary
                 chunks: dict[str, Any] = {
                     compute_mdhash_id(dp["content"], prefix="chunk-"): {
                         **dp,
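The dispatch in this hunk relies on inspect.isawaitable: calling a coroutine function returns an awaitable rather than the value, so one call site can serve both plain and async callables by awaiting only when needed. A self-contained sketch of the same pattern, runnable outside LightRAG:

    import asyncio
    import inspect

    def sync_chunker(text: str) -> list[str]:
        # Plain function: returns the value directly.
        return [text[i : i + 4] for i in range(0, len(text), 4)]

    async def async_chunker(text: str) -> list[str]:
        # Coroutine function: calling it yields an awaitable.
        await asyncio.sleep(0)  # stand-in for real async work
        return [text[i : i + 4] for i in range(0, len(text), 4)]

    async def run_chunker(chunker, text: str) -> list[str]:
        result = chunker(text)
        # Same dispatch as the patch: await only when the result is awaitable.
        if inspect.isawaitable(result):
            result = await result
        return result

    print(asyncio.run(run_chunker(sync_chunker, "abcdefgh")))   # ['abcd', 'efgh']
    print(asyncio.run(run_chunker(async_chunker, "abcdefgh")))  # ['abcd', 'efgh']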
@@ -1764,14 +1791,7 @@ class LightRAG:
                         "file_path": file_path,  # Add file path to each chunk
                         "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
                     }
-                    for dp in self.chunking_func(
-                        self.tokenizer,
-                        content,
-                        split_by_character,
-                        split_by_character_only,
-                        self.chunk_overlap_token_size,
-                        self.chunk_token_size,
-                    )
+                    for dp in chunking_result
                 }

                 if not chunks:
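With this change merged, either a synchronous or an asynchronous chunker can be wired straight into the constructor via the chunking_func dataclass field shown in the diff above. A hypothetical wiring sketch: naive_async_chunking is the illustrative function from the docstring section, and the model/embedding arguments a real setup would also require are omitted for brevity:

    from lightrag import LightRAG

    rag = LightRAG(
        working_dir="./rag_storage",         # assumed storage directory
        chunking_func=naive_async_chunking,  # sketch defined earlier
    )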