Support async chunking functions to improve processing performance when a heavy chunking_func is passed in by the user
This commit is contained in:
parent
18a4870229
commit
7740500693
1 changed file with 20 additions and 8 deletions
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from inspect import iscoroutinefunction
|
||||
import traceback
|
||||
import asyncio
|
||||
import configparser
|
||||
|
|
@ -1779,14 +1780,8 @@ class LightRAG:
|
|||
content = content_data["content"]
|
||||
|
||||
# Generate chunks from document
|
||||
chunks: dict[str, Any] = {
|
||||
compute_mdhash_id(dp["content"], prefix="chunk-"): {
|
||||
**dp,
|
||||
"full_doc_id": doc_id,
|
||||
"file_path": file_path, # Add file path to each chunk
|
||||
"llm_cache_list": [], # Initialize empty LLM cache list for each chunk
|
||||
}
|
||||
for dp in self.chunking_func(
|
||||
if iscoroutinefunction(self.chunking_func):
|
||||
chunks = await self.chunking_func(
|
||||
self.tokenizer,
|
||||
content,
|
||||
split_by_character,
|
||||
|
|
@ -1794,6 +1789,23 @@ class LightRAG:
|
|||
self.chunk_overlap_token_size,
|
||||
self.chunk_token_size,
|
||||
)
|
||||
else:
|
||||
chunks = self.chunking_func(
|
||||
self.tokenizer,
|
||||
content,
|
||||
split_by_character,
|
||||
split_by_character_only,
|
||||
self.chunk_overlap_token_size,
|
||||
self.chunk_token_size,
|
||||
)
|
||||
chunks: dict[str, Any] = {
|
||||
compute_mdhash_id(dp["content"], prefix="chunk-"): {
|
||||
**dp,
|
||||
"full_doc_id": doc_id,
|
||||
"file_path": file_path, # Add file path to each chunk
|
||||
"llm_cache_list": [], # Initialize empty LLM cache list for each chunk
|
||||
}
|
||||
for dp in chunks
|
||||
}
|
||||
|
||||
if not chunks:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue