Support an async chunking function to improve processing performance when the user passes in a heavy chunking_func

(cherry picked from commit 7740500693)
2025-11-09 14:52:42 +08:00 · 2025-11-09 14:52:42 +08:00 · 8a43e16f6e
commit 8a43e16f6e
parent 70ba7cd787
1 changed files with 20 additions and 8 deletions
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@ -1,5 +1,6 @@
 from __future__ import annotations
 from inspect import iscoroutinefunction
 import traceback
 import asyncio
 import configparser
@ -1757,14 +1758,8 @@ class LightRAG:
                            content = content_data["content"]
                            # Generate chunks from document
-                            chunks: dict[str, Any] = {
+                            if iscoroutinefunction(self.chunking_func):
-                                compute_mdhash_id(dp["content"], prefix="chunk-"): {
+                                chunks = await self.chunking_func(
                                    **dp,
                                    "full_doc_id": doc_id,
                                    "file_path": file_path,  # Add file path to each chunk
                                    "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
                                }
                                for dp in self.chunking_func(
                                    self.tokenizer,
                                    content,
                                    split_by_character,
@ -1772,6 +1767,23 @@ class LightRAG:
                                    self.chunk_overlap_token_size,
                                    self.chunk_token_size,
                                )
                            else:
                                chunks = self.chunking_func(
                                    self.tokenizer,
                                    content,
                                    split_by_character,
                                    split_by_character_only,
                                    self.chunk_overlap_token_size,
                                    self.chunk_token_size,
                                )
                            chunks: dict[str, Any] = {
                                compute_mdhash_id(dp["content"], prefix="chunk-"): {
                                    **dp,
                                    "full_doc_id": doc_id,
                                    "file_path": file_path,  # Add file path to each chunk
                                    "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
                                }
                                for dp in chunks
                            }
                            if not chunks: