94 lines
3.1 KiB
Python
94 lines
3.1 KiB
Python
from typing import Any, Dict, Iterator
|
|
from uuid import NAMESPACE_OID, uuid5
|
|
|
|
from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
|
|
|
|
|
|
def _get_pair_size(pair_text: str) -> int:
    """
    Calculate the size of a given text in terms of tokens.

    If the embedding engine's tokenizer is available, count the tokens for the
    provided pair text. If no tokenizer is available, fall back to a fixed
    estimate of 3 tokens per pair.

    Parameters:
    -----------

    - pair_text (str): The key:value pair text for which the token size is to be calculated.

    Returns:
    --------

    - int: The number of tokens representing the text, as reported by the
      tokenizer, or the fixed fallback estimate of 3 when no tokenizer exists.
    """
    embedding_engine = get_embedding_engine()
    if embedding_engine.tokenizer:
        return embedding_engine.tokenizer.count_tokens(pair_text)
    # No tokenizer available: use a fixed per-pair estimate so chunking can
    # still enforce max_chunk_size approximately.
    return 3
|
|
|
|
|
|
def chunk_by_row(
    data: str,
    max_chunk_size,
) -> Iterator[Dict[str, Any]]:
    """
    Chunk the input text row by row, keeping each chunk within a size budget.

    The input is split into rows on "\\n\\n" and each row into key:value pairs
    on ", ". Pairs are accumulated into a chunk until adding the next pair
    would exceed `max_chunk_size` (as measured by `_get_pair_size`), at which
    point the accumulated chunk is yielded and a new one is started.

    NOTE(review): concatenating the yielded chunk texts does NOT reproduce the
    original input exactly — the "\\n\\n" row separators are dropped (pairs from
    consecutive rows may share a chunk), and the ", " separator between the
    last pair of one chunk and the first pair of the next is not re-emitted.

    Parameters:
    -----------

    - data (str): The input text to be chunked.
    - max_chunk_size: The maximum allowed size for each chunk, in the same
      units as `_get_pair_size` (tokens when a tokenizer is available). A
      single pair larger than this limit is still yielded as its own chunk.

    Yields:
    -------

    - dict: With keys "text" (the ", "-joined pairs), "chunk_size" (sum of
      pair sizes), "chunk_id" (deterministic uuid5 of the text),
      "chunk_index" (0-based position), and "cut_type" ("row_cut" for chunks
      flushed on overflow, "row_end" for the final chunk).
    """
    current_chunk_list = []
    chunk_index = 0
    current_chunk_size = 0

    lines = data.split("\n\n")
    for line in lines:
        pairs_text = line.split(", ")

        for pair_text in pairs_text:
            pair_size = _get_pair_size(pair_text)
            # Flush only when the chunk is non-empty, so an oversized single
            # pair still ends up in a chunk of its own rather than looping.
            if current_chunk_size > 0 and (current_chunk_size + pair_size > max_chunk_size):
                # Yield current cut chunk
                current_chunk = ", ".join(current_chunk_list)
                chunk_dict = {
                    "text": current_chunk,
                    "chunk_size": current_chunk_size,
                    # uuid5 over the text makes the id deterministic for
                    # identical chunk content.
                    "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                    "chunk_index": chunk_index,
                    "cut_type": "row_cut",
                }

                yield chunk_dict

                # Start new chunk with current pair text
                current_chunk_list = []
                current_chunk_size = 0
                chunk_index += 1

            current_chunk_list.append(pair_text)
            current_chunk_size += pair_size

    # Yield row chunk
    current_chunk = ", ".join(current_chunk_list)
    # Skip the trailing yield when nothing accumulated (e.g. empty input).
    if current_chunk:
        chunk_dict = {
            "text": current_chunk,
            "chunk_size": current_chunk_size,
            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
            "chunk_index": chunk_index,
            "cut_type": "row_end",
        }

        yield chunk_dict