From 7740500693e3ae9459554e6e88cfb7067c730dea Mon Sep 17 00:00:00 2001
From: Tong Da
Date: Sun, 9 Nov 2025 14:52:42 +0800
Subject: [PATCH] support async chunking func to improve processing
 performance when a heavy `chunking_func` is passed in by user

---
 lightrag/lightrag.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index ae8411d2..7bd38fec 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from inspect import iscoroutinefunction
 import traceback
 import asyncio
 import configparser
@@ -1779,14 +1780,8 @@ class LightRAG:
                 content = content_data["content"]
 
                 # Generate chunks from document
-                chunks: dict[str, Any] = {
-                    compute_mdhash_id(dp["content"], prefix="chunk-"): {
-                        **dp,
-                        "full_doc_id": doc_id,
-                        "file_path": file_path,  # Add file path to each chunk
-                        "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
-                    }
-                    for dp in self.chunking_func(
+                if iscoroutinefunction(self.chunking_func):
+                    chunks = await self.chunking_func(
                         self.tokenizer,
                         content,
                         split_by_character,
@@ -1794,6 +1789,23 @@ class LightRAG:
                         self.chunk_overlap_token_size,
                         self.chunk_token_size,
                     )
+                else:
+                    chunks = self.chunking_func(
+                        self.tokenizer,
+                        content,
+                        split_by_character,
+                        split_by_character_only,
+                        self.chunk_overlap_token_size,
+                        self.chunk_token_size,
+                    )
+                chunks: dict[str, Any] = {
+                    compute_mdhash_id(dp["content"], prefix="chunk-"): {
+                        **dp,
+                        "full_doc_id": doc_id,
+                        "file_path": file_path,  # Add file path to each chunk
+                        "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
+                    }
+                    for dp in chunks
+                }
 
                 if not chunks: