From 7740500693e3ae9459554e6e88cfb7067c730dea Mon Sep 17 00:00:00 2001
From: Tong Da
Date: Sun, 9 Nov 2025 14:52:42 +0800
Subject: [PATCH] support async chunking func to improve processing
 performance when a heavy `chunking_func` is passed in by user

---
 lightrag/lightrag.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index ae8411d2..7bd38fec 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from inspect import iscoroutinefunction
 import traceback
 import asyncio
 import configparser
@@ -1779,14 +1780,8 @@ class LightRAG:
                 content = content_data["content"]
 
                 # Generate chunks from document
-                chunks: dict[str, Any] = {
-                    compute_mdhash_id(dp["content"], prefix="chunk-"): {
-                        **dp,
-                        "full_doc_id": doc_id,
-                        "file_path": file_path,  # Add file path to each chunk
-                        "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
-                    }
-                    for dp in self.chunking_func(
+                if iscoroutinefunction(self.chunking_func):
+                    chunks = await self.chunking_func(
                         self.tokenizer,
                         content,
                         split_by_character,
@@ -1794,6 +1789,23 @@ class LightRAG:
                         self.chunk_overlap_token_size,
                         self.chunk_token_size,
                     )
+                else:
+                    chunks = self.chunking_func(
+                        self.tokenizer,
+                        content,
+                        split_by_character,
+                        split_by_character_only,
+                        self.chunk_overlap_token_size,
+                        self.chunk_token_size,
+                    )
+                chunks: dict[str, Any] = {
+                    compute_mdhash_id(dp["content"], prefix="chunk-"): {
+                        **dp,
+                        "full_doc_id": doc_id,
+                        "file_path": file_path,  # Add file path to each chunk
+                        "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
+                    }
+                    for dp in chunks
+                }
 
                 if not chunks: