refactor(chunking): rename params and improve docstring for chunking_by_token_size
parent dfbc97363c
commit dacca334e0
2 changed files with 15 additions and 13 deletions
@@ -260,14 +260,16 @@ class LightRAG:
         - `content`: The text to be split into chunks.
         - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
         - `split_by_character_only`: If True, the text is split only on the specified character.
-        - `chunk_token_size`: The maximum number of tokens per chunk.
         - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
+        - `chunk_token_size`: The maximum number of tokens per chunk.
 
     The function should return a list of dictionaries (or an awaitable that resolves to a list),
     where each dictionary contains the following keys:
-        - `tokens`: The number of tokens in the chunk.
-        - `content`: The text content of the chunk.
+        - `tokens` (int): The number of tokens in the chunk.
+        - `content` (str): The text content of the chunk.
+        - `chunk_order_index` (int): Zero-based index indicating the chunk's order in the document.
 
     Defaults to `chunking_by_token_size` if not specified.
     """
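For anyone wiring in their own `chunking_func`, the sketch below shows one shape a replacement could take under the documented contract. It is illustrative only and not part of this commit: `paragraph_chunking` is a hypothetical name, and the leading `tokenizer` parameter with an `encode` method is assumed from the function body shown further down.

```python
from typing import Any


def paragraph_chunking(  # hypothetical custom chunking_func, not part of this commit
    tokenizer,  # assumed: exposes encode(str) -> list of token ids
    content: str,
    split_by_character: str | None = None,
    split_by_character_only: bool = False,  # ignored in this sketch
    chunk_overlap_token_size: int = 128,    # ignored in this sketch
    chunk_token_size: int = 1024,
) -> list[dict[str, Any]]:
    """Split on blank lines (or `split_by_character`), then greedily pack
    paragraphs up to roughly `chunk_token_size` tokens per chunk."""
    separator = split_by_character or "\n\n"
    paragraphs = [p for p in content.split(separator) if p.strip()]

    results: list[dict[str, Any]] = []
    buffer: list[str] = []
    buffer_tokens = 0
    for para in paragraphs:
        n_tokens = len(tokenizer.encode(para))
        if buffer and buffer_tokens + n_tokens > chunk_token_size:
            # Emit the current buffer; token count is an approximation
            # (separators between paragraphs are not re-counted).
            results.append(
                {
                    "tokens": buffer_tokens,
                    "content": separator.join(buffer).strip(),
                    "chunk_order_index": len(results),
                }
            )
            buffer, buffer_tokens = [], 0
        buffer.append(para)
        buffer_tokens += n_tokens
    if buffer:
        results.append(
            {
                "tokens": buffer_tokens,
                "content": separator.join(buffer).strip(),
                "chunk_order_index": len(results),
            }
        )
    # Unlike the default chunking_by_token_size below, this sketch does not
    # re-split a single paragraph that exceeds chunk_token_size.
    return results
```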
@@ -98,8 +98,8 @@ def chunking_by_token_size(
     content: str,
     split_by_character: str | None = None,
     split_by_character_only: bool = False,
-    overlap_token_size: int = 128,
-    max_token_size: int = 1024,
+    chunk_overlap_token_size: int = 128,
+    chunk_token_size: int = 1024,
) -> list[dict[str, Any]]:
     tokens = tokenizer.encode(content)
     results: list[dict[str, Any]] = []
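A practical consequence of the rename: callers that pass these arguments by keyword must update the names. A hedged example follows; `tokenizer` and `text` are placeholder variables, and the leading positional arguments are assumed from the function body in the hunks below, not shown in this signature excerpt.

```python
# Before this commit (keyword names no longer accepted):
# chunks = chunking_by_token_size(
#     tokenizer, text, overlap_token_size=64, max_token_size=512
# )

# After this commit:
chunks = chunking_by_token_size(
    tokenizer, text, chunk_overlap_token_size=64, chunk_token_size=512
)
```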
@@ -113,15 +113,15 @@ def chunking_by_token_size(
         else:
             for chunk in raw_chunks:
                 _tokens = tokenizer.encode(chunk)
-                if len(_tokens) > max_token_size:
+                if len(_tokens) > chunk_token_size:
                     for start in range(
-                        0, len(_tokens), max_token_size - overlap_token_size
+                        0, len(_tokens), chunk_token_size - chunk_overlap_token_size
                     ):
                         chunk_content = tokenizer.decode(
-                            _tokens[start : start + max_token_size]
+                            _tokens[start : start + chunk_token_size]
                         )
                         new_chunks.append(
-                            (min(max_token_size, len(_tokens) - start), chunk_content)
+                            (min(chunk_token_size, len(_tokens) - start), chunk_content)
                         )
                 else:
                     new_chunks.append((len(_tokens), chunk))
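The rename does not change the windowing arithmetic: the stride between windows is still `chunk_token_size - chunk_overlap_token_size`, so consecutive windows share `chunk_overlap_token_size` tokens. A small worked example of the start/length computation used in this hunk (the numbers are illustrative, not from the commit):

```python
# A 2000-token piece with chunk_token_size=1024 and chunk_overlap_token_size=128
# steps by 1024 - 128 = 896 tokens per window.
starts = list(range(0, 2000, 1024 - 128))        # [0, 896, 1792]
lengths = [min(1024, 2000 - s) for s in starts]  # [1024, 1024, 208]
```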
@@ -135,12 +135,12 @@ def chunking_by_token_size(
             )
     else:
         for index, start in enumerate(
-            range(0, len(tokens), max_token_size - overlap_token_size)
+            range(0, len(tokens), chunk_token_size - chunk_overlap_token_size)
         ):
-            chunk_content = tokenizer.decode(tokens[start : start + max_token_size])
+            chunk_content = tokenizer.decode(tokens[start : start + chunk_token_size])
             results.append(
                 {
-                    "tokens": min(max_token_size, len(tokens) - start),
+                    "tokens": min(chunk_token_size, len(tokens) - start),
                     "content": chunk_content.strip(),
                     "chunk_order_index": index,
                 }
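To see the output shape this branch produces, here is a self-contained toy run of the same loop. `WhitespaceTokenizer` is a stand-in written for this example (one token per word), not the tokenizer LightRAG actually injects, and the tiny sizes are chosen only to make the overlap visible.

```python
class WhitespaceTokenizer:
    """Toy stand-in tokenizer: one token per whitespace-separated word."""

    def encode(self, text: str) -> list[str]:
        return text.split()

    def decode(self, tokens: list[str]) -> str:
        return " ".join(tokens)


tokenizer = WhitespaceTokenizer()
tokens = tokenizer.encode("one two three four five six seven eight nine ten")

chunk_token_size, chunk_overlap_token_size = 4, 1
results = []
for index, start in enumerate(
    range(0, len(tokens), chunk_token_size - chunk_overlap_token_size)
):
    results.append(
        {
            "tokens": min(chunk_token_size, len(tokens) - start),
            "content": tokenizer.decode(tokens[start : start + chunk_token_size]).strip(),
            "chunk_order_index": index,
        }
    )

# results[0] == {"tokens": 4, "content": "one two three four", "chunk_order_index": 0}
# results[1] == {"tokens": 4, "content": "four five six seven", "chunk_order_index": 1}
# ... each chunk starts on the last token of the previous one (overlap of 1).
```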