94 lines
3.1 KiB
Python
94 lines
3.1 KiB
Python
from typing import Any, Dict, Iterator
|
|
from uuid import NAMESPACE_OID, uuid5
|
|
|
|
from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
|
|
|
|
|
|
def _get_pair_size(pair_text: str) -> int:
    """
    Calculate the size of a given text in terms of tokens.

    If the embedding engine's tokenizer is available, count the tokens for the
    provided pair text. If no tokenizer is available, fall back to a fixed
    estimate of 3 tokens per pair.

    Parameters:
    -----------

    - pair_text (str): The key:value pair text for which the token size is to be calculated.

    Returns:
    --------

    - int: The number of tokens representing the text, as reported by the
      tokenizer, or the fixed fallback estimate of 3 when no tokenizer exists.
    """
    embedding_engine = get_embedding_engine()
    if embedding_engine.tokenizer:
        return embedding_engine.tokenizer.count_tokens(pair_text)
    # No tokenizer available: use a fixed per-pair estimate so chunking can
    # still enforce max_chunk_size approximately.
    return 3
|
|
|
|
|
|
def chunk_by_row(
    data: str,
    max_chunk_size,
) -> Iterator[Dict[str, Any]]:
    """
    Chunk the input text row by row, keeping each chunk within a size budget.

    The input is split into rows on "\\n\\n" and each row into key:value pairs
    on ", ". Pairs are accumulated into a chunk until adding the next pair
    would exceed `max_chunk_size` (as measured by `_get_pair_size`), at which
    point the accumulated chunk is yielded and a new one is started.

    NOTE(review): concatenating the yielded chunk texts does NOT reproduce the
    original input exactly — the "\\n\\n" row separators are dropped (pairs from
    consecutive rows may share a chunk), and the ", " separator between the
    last pair of one chunk and the first pair of the next is not re-emitted.

    Parameters:
    -----------

    - data (str): The input text to be chunked.
    - max_chunk_size: The maximum allowed size for each chunk, in the same
      units as `_get_pair_size` (tokens when a tokenizer is available). A
      single pair larger than this limit is still yielded as its own chunk.

    Yields:
    -------

    - dict: With keys "text" (the ", "-joined pairs), "chunk_size" (sum of
      pair sizes), "chunk_id" (deterministic uuid5 of the text),
      "chunk_index" (0-based position), and "cut_type" ("row_cut" for chunks
      flushed on overflow, "row_end" for the final chunk).
    """
    current_chunk_list = []
    chunk_index = 0
    current_chunk_size = 0

    lines = data.split("\n\n")
    for line in lines:
        pairs_text = line.split(", ")

        for pair_text in pairs_text:
            pair_size = _get_pair_size(pair_text)
            # Flush only when the chunk is non-empty, so an oversized single
            # pair still ends up in a chunk of its own rather than looping.
            if current_chunk_size > 0 and (current_chunk_size + pair_size > max_chunk_size):
                # Yield current cut chunk
                current_chunk = ", ".join(current_chunk_list)
                chunk_dict = {
                    "text": current_chunk,
                    "chunk_size": current_chunk_size,
                    # uuid5 over the text makes the id deterministic for
                    # identical chunk content.
                    "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                    "chunk_index": chunk_index,
                    "cut_type": "row_cut",
                }

                yield chunk_dict

                # Start new chunk with current pair text
                current_chunk_list = []
                current_chunk_size = 0
                chunk_index += 1

            current_chunk_list.append(pair_text)
            current_chunk_size += pair_size

    # Yield row chunk
    current_chunk = ", ".join(current_chunk_list)
    # Skip the trailing yield when nothing accumulated (e.g. empty input).
    if current_chunk:
        chunk_dict = {
            "text": current_chunk,
            "chunk_size": current_chunk_size,
            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
            "chunk_index": chunk_index,
            "cut_type": "row_end",
        }

        yield chunk_dict