Support async chunking functions to improve processing performance when the user passes in a heavy chunking_func

2025-11-09 14:52:42 +08:00 · 2025-11-09 14:52:42 +08:00 · 7740500693
commit 7740500693
parent 18a4870229
1 changed files with 20 additions and 8 deletions
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+from inspect import iscoroutinefunction
 import traceback
 import asyncio
 import configparser
@@ -1779,14 +1780,8 @@ class LightRAG:
                            content = content_data["content"]

                            # Generate chunks from document
-                            chunks: dict[str, Any] = {
-                                compute_mdhash_id(dp["content"], prefix="chunk-"): {
-                                    **dp,
-                                    "full_doc_id": doc_id,
-                                    "file_path": file_path,  # Add file path to each chunk
-                                    "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
-                                }
-                                for dp in self.chunking_func(
+                            if iscoroutinefunction(self.chunking_func):
+                                chunks = await self.chunking_func(
                                    self.tokenizer,
                                    content,
                                    split_by_character,
@@ -1794,6 +1789,23 @@ class LightRAG:
                                    self.chunk_overlap_token_size,
                                    self.chunk_token_size,
                                )
+                            else:
+                                chunks = self.chunking_func(
+                                    self.tokenizer,
+                                    content,
+                                    split_by_character,
+                                    split_by_character_only,
+                                    self.chunk_overlap_token_size,
+                                    self.chunk_token_size,
+                                )
+                            chunks: dict[str, Any] = {
+                                compute_mdhash_id(dp["content"], prefix="chunk-"): {
+                                    **dp,
+                                    "full_doc_id": doc_id,
+                                    "file_path": file_path,  # Add file path to each chunk
+                                    "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
+                                }
+                                for dp in chunks
                            }

                            if not chunks: