cognee/cognee/tasks/chunks/chunk_by_row.py
EricXiao 742866b4c9 feat: csv ingestion loader & chunk
Signed-off-by: EricXiao <taoiaox@gmail.com>
2025-10-22 16:56:46 +08:00

94 lines
3.1 KiB
Python

from typing import Any, Dict, Iterator
from uuid import NAMESPACE_OID, uuid5
from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
def _get_pair_size(pair_text: str) -> int:
    """
    Return the size of a single key:value pair, measured in tokens.

    The embedding engine's tokenizer does the counting whenever the engine exposes one.
    When no tokenizer is available, every pair is given a fixed size of 3.

    Parameters:
    -----------

        - pair_text (str): The key:value pair text to measure.

    Returns:
    --------

        - int: The token count reported by the tokenizer, or 3 when the engine has no
          tokenizer.
    """
    tokenizer = get_embedding_engine().tokenizer

    # No tokenizer configured: fall back to the fixed per-pair size.
    if not tokenizer:
        return 3

    return tokenizer.count_tokens(pair_text)
def _build_row_chunk(text: str, size: int, index: int, cut_type: str) -> Dict[str, Any]:
    """Assemble the chunk dictionary for already-joined chunk text."""
    return {
        "text": text,
        "chunk_size": size,
        # Deterministic id: identical chunk text always maps to the same UUID.
        "chunk_id": uuid5(NAMESPACE_OID, text),
        "chunk_index": index,
        "cut_type": cut_type,
    }


def chunk_by_row(
    data: str,
    max_chunk_size: int,
) -> Iterator[Dict[str, Any]]:
    """
    Chunk the input text row by row, cutting long rows at pair boundaries.

    Rows are separated by a blank line (two consecutive newline characters) and each row
    is a list of pairs separated by a comma followed by a space. Pairs of a row are
    accumulated until adding the next pair would push the chunk above `max_chunk_size`;
    the accumulated pairs are then emitted with cut type "row_cut". Whatever remains when
    the row ends is emitted with cut type "row_end". Chunks never span more than one row,
    and joining the chunks of a single row with ", " reproduces that row.

    A single pair whose size exceeds `max_chunk_size` is still emitted as its own chunk,
    so callers that need a hard limit must check "chunk_size" themselves. Rows that
    produce no text (for example an empty row) yield nothing.

    Parameters:
    -----------

        - data (str): The input text to be chunked.
        - max_chunk_size (int): The maximum allowed size for each chunk, in the unit
          returned by `_get_pair_size` (tokens).

    Returns:
    --------

        - Iterator[Dict[str, Any]]: Dictionaries with the keys "text", "chunk_size",
          "chunk_id", "chunk_index" and "cut_type". "chunk_index" increases by one for
          every emitted chunk.
    """
    pending_pairs = []
    pending_size = 0
    chunk_index = 0

    for row in data.split("\n\n"):
        for pair_text in row.split(", "):
            pair_size = _get_pair_size(pair_text)

            # Flush before adding a pair that would overflow the chunk. An empty chunk
            # is never flushed, so an oversized pair still ends up in a chunk of its own.
            if pending_size > 0 and pending_size + pair_size > max_chunk_size:
                yield _build_row_chunk(
                    ", ".join(pending_pairs), pending_size, chunk_index, "row_cut"
                )
                pending_pairs = []
                pending_size = 0
                chunk_index += 1

            pending_pairs.append(pair_text)
            pending_size += pair_size

        # End of the row: emit whatever is left of it.
        row_tail = ", ".join(pending_pairs)
        if row_tail:
            yield _build_row_chunk(row_tail, pending_size, chunk_index, "row_end")
            chunk_index += 1

        # Start the next row from a clean slate so pairs never leak across rows and
        # no two chunks share an index.
        pending_pairs = []
        pending_size = 0