Merge branch 'tongda/main'

yangdx 2025-11-13 12:56:28 +08:00
commit 8765974467

lightrag/lightrag.py

@@ -3,6 +3,7 @@ from __future__ import annotations
 import traceback
 import asyncio
 import configparser
+import inspect
 import os
 import time
 import warnings
@@ -12,6 +13,7 @@ from functools import partial
 from typing import (
     Any,
     AsyncIterator,
+    Awaitable,
     Callable,
     Iterator,
     cast,
@@ -20,6 +22,7 @@ from typing import (
     Optional,
     List,
     Dict,
+    Union,
 )
 from lightrag.prompt import PROMPTS
 from lightrag.exceptions import PipelineCancelledException
@@ -243,11 +246,13 @@ class LightRAG:
             int,
             int,
         ],
-        List[Dict[str, Any]],
+        Union[List[Dict[str, Any]], Awaitable[List[Dict[str, Any]]]],
     ] = field(default_factory=lambda: chunking_by_token_size)
     """
     Custom chunking function for splitting text into chunks before processing.
+    The function can be either synchronous or asynchronous.
+
     The function should take the following parameters:
     - `tokenizer`: A Tokenizer instance to use for tokenization.
@@ -257,7 +262,8 @@ class LightRAG:
     - `chunk_token_size`: The maximum number of tokens per chunk.
     - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.

-    The function should return a list of dictionaries, where each dictionary contains the following keys:
+    The function should return a list of dictionaries (or an awaitable that resolves to a list),
+    where each dictionary contains the following keys:
     - `tokens`: The number of tokens in the chunk.
     - `content`: The text content of the chunk.
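As a reference for this contract, here is a minimal sketch of an asynchronous implementation. It assumes the Tokenizer exposes `encode`/`decode` methods and ignores `split_by_character` for brevity; `naive_async_chunking` is a hypothetical name, not part of the library.

```python
from typing import Any, Dict, List, Optional

async def naive_async_chunking(
    tokenizer,                       # Tokenizer instance (encode/decode assumed)
    content: str,
    split_by_character: Optional[str],
    split_by_character_only: bool,
    chunk_overlap_token_size: int,
    chunk_token_size: int,
) -> List[Dict[str, Any]]:
    # Encode once, then slice the token stream into overlapping windows.
    tokens = tokenizer.encode(content)
    step = max(1, chunk_token_size - chunk_overlap_token_size)
    results: List[Dict[str, Any]] = []
    for start in range(0, len(tokens), step):
        window = tokens[start : start + chunk_token_size]
        results.append(
            {
                "tokens": len(window),                # required key
                "content": tokenizer.decode(window),  # required key
            }
        )
    return results
```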
@@ -1756,7 +1762,28 @@ class LightRAG:
                 )
                 content = content_data["content"]

-                # Generate chunks from document
+                # Call chunking function, supporting both sync and async implementations
+                chunking_result = self.chunking_func(
+                    self.tokenizer,
+                    content,
+                    split_by_character,
+                    split_by_character_only,
+                    self.chunk_overlap_token_size,
+                    self.chunk_token_size,
+                )
+
+                # If result is awaitable, await to get actual result
+                if inspect.isawaitable(chunking_result):
+                    chunking_result = await chunking_result
+
+                # Validate return type
+                if not isinstance(chunking_result, (list, tuple)):
+                    raise TypeError(
+                        f"chunking_func must return a list or tuple of dicts, "
+                        f"got {type(chunking_result)}"
+                    )
+
+                # Build chunks dictionary
                 chunks: dict[str, Any] = {
                     compute_mdhash_id(dp["content"], prefix="chunk-"): {
                         **dp,
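The `inspect.isawaitable` check is the standard way to drive sync and async callables through one call site: call first, then await only if the call returned an awaitable. A self-contained sketch of the same dispatch pattern:

```python
import asyncio
import inspect
from typing import Awaitable, Callable, Union

MaybeAsync = Callable[[str], Union[int, Awaitable[int]]]

async def call_maybe_async(func: MaybeAsync, arg: str) -> int:
    result = func(arg)             # plain function: the value itself
    if inspect.isawaitable(result):
        result = await result      # coroutine function: await the coroutine
    return result

def sync_len(s: str) -> int:
    return len(s)

async def async_len(s: str) -> int:
    await asyncio.sleep(0)  # stand-in for real async work
    return len(s)

async def main() -> None:
    assert await call_maybe_async(sync_len, "abc") == 3
    assert await call_maybe_async(async_len, "abcd") == 4

asyncio.run(main())
```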
@@ -1764,14 +1791,7 @@ class LightRAG:
                         "file_path": file_path,  # Add file path to each chunk
                         "llm_cache_list": [],  # Initialize empty LLM cache list for each chunk
                     }
-                    for dp in self.chunking_func(
-                        self.tokenizer,
-                        content,
-                        split_by_character,
-                        split_by_character_only,
-                        self.chunk_overlap_token_size,
-                        self.chunk_token_size,
-                    )
+                    for dp in chunking_result
                 }

                 if not chunks:
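With this change an async chunker can be passed straight into the constructor. A hedged usage sketch, reusing the hypothetical `naive_async_chunking` above and omitting the LLM/embedding configuration that a real LightRAG instance requires:

```python
from lightrag import LightRAG

rag = LightRAG(
    working_dir="./rag_storage",
    chunking_func=naive_async_chunking,  # async; awaited internally
    # llm_model_func=..., embedding_func=...  (required in real use)
)
# During ingestion, chunking_func is invoked and its result awaited if needed:
# await rag.ainsert("some long document text")
```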