From dacca334e07105966a24ecaffd41227e07a6f143 Mon Sep 17 00:00:00 2001
From: EightyOliveira
Date: Tue, 18 Nov 2025 15:46:28 +0800
Subject: [PATCH] refactor(chunking): rename params and improve docstring for chunking_by_token_size

---
 lightrag/lightrag.py | 10 ++++++----
 lightrag/operate.py  | 18 +++++++++---------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index c0fa8627..eb5a07ba 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -260,14 +260,16 @@ class LightRAG:
         - `content`: The text to be split into chunks.
         - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
         - `split_by_character_only`: If True, the text is split only on the specified character.
-        - `chunk_token_size`: The maximum number of tokens per chunk.
         - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
+        - `chunk_token_size`: The maximum number of tokens per chunk.
+
     The function should return a list of dictionaries (or an awaitable that resolves to a list), where each dictionary contains the following keys:
-        - `tokens`: The number of tokens in the chunk.
-        - `content`: The text content of the chunk.
-
+        - `tokens` (int): The number of tokens in the chunk.
+        - `content` (str): The text content of the chunk.
+        - `chunk_order_index` (int): Zero-based index indicating the chunk's order in the document.
+
     Defaults to `chunking_by_token_size` if not specified.
     """
diff --git a/lightrag/operate.py b/lightrag/operate.py
index 858553b1..512b04a2 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -98,8 +98,8 @@ def chunking_by_token_size(
     content: str,
     split_by_character: str | None = None,
     split_by_character_only: bool = False,
-    overlap_token_size: int = 128,
-    max_token_size: int = 1024,
+    chunk_overlap_token_size: int = 128,
+    chunk_token_size: int = 1024,
 ) -> list[dict[str, Any]]:
     tokens = tokenizer.encode(content)
     results: list[dict[str, Any]] = []
@@ -113,15 +113,15 @@ def chunking_by_token_size(
         else:
             for chunk in raw_chunks:
                 _tokens = tokenizer.encode(chunk)
-                if len(_tokens) > max_token_size:
+                if len(_tokens) > chunk_token_size:
                     for start in range(
-                        0, len(_tokens), max_token_size - overlap_token_size
+                        0, len(_tokens), chunk_token_size - chunk_overlap_token_size
                     ):
                         chunk_content = tokenizer.decode(
-                            _tokens[start : start + max_token_size]
+                            _tokens[start : start + chunk_token_size]
                         )
                         new_chunks.append(
-                            (min(max_token_size, len(_tokens) - start), chunk_content)
+                            (min(chunk_token_size, len(_tokens) - start), chunk_content)
                         )
                 else:
                     new_chunks.append((len(_tokens), chunk))
@@ -135,12 +135,12 @@ def chunking_by_token_size(
             )
     else:
         for index, start in enumerate(
-            range(0, len(tokens), max_token_size - overlap_token_size)
+            range(0, len(tokens), chunk_token_size - chunk_overlap_token_size)
         ):
-            chunk_content = tokenizer.decode(tokens[start : start + max_token_size])
+            chunk_content = tokenizer.decode(tokens[start : start + chunk_token_size])
             results.append(
                 {
-                    "tokens": min(max_token_size, len(tokens) - start),
+                    "tokens": min(chunk_token_size, len(tokens) - start),
                     "content": chunk_content.strip(),
                     "chunk_order_index": index,
                 }
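
For context, a minimal sketch of a custom chunking function that follows the contract documented above: it accepts the renamed keyword parameters (`chunk_overlap_token_size`, `chunk_token_size`) and returns dictionaries with the three documented keys. The name `paragraph_chunking`, the blank-line splitting strategy, and the bare `tokenizer` parameter (assumed to expose `encode`/`decode`, as `chunking_by_token_size` already relies on) are illustrative assumptions, not part of this patch:

from typing import Any


def paragraph_chunking(
    tokenizer,  # assumed to provide encode()/decode(), like the tokenizer used above
    content: str,
    split_by_character: str | None = None,   # accepted for signature compatibility,
    split_by_character_only: bool = False,   # ignored in this sketch
    chunk_overlap_token_size: int = 128,
    chunk_token_size: int = 1024,
) -> list[dict[str, Any]]:
    """Hypothetical chunker: split on blank lines, then enforce the token budget."""
    results: list[dict[str, Any]] = []
    index = 0
    for block in content.split("\n\n"):
        tokens = tokenizer.encode(block)
        # Slide a window so no chunk exceeds chunk_token_size tokens, overlapping
        # consecutive chunks by chunk_overlap_token_size tokens.
        for start in range(0, len(tokens), chunk_token_size - chunk_overlap_token_size):
            results.append(
                {
                    "tokens": min(chunk_token_size, len(tokens) - start),
                    "content": tokenizer.decode(tokens[start : start + chunk_token_size]).strip(),
                    "chunk_order_index": index,
                }
            )
            index += 1
    return results

Any callable with this signature and return shape could then be supplied in place of the default `chunking_by_token_size`, as the final line of the docstring describes.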