cognee/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
2025-01-10 15:11:00 +01:00

41 lines
1.3 KiB
Python

from itertools import product
import numpy as np
import pytest
from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
# Candidate values passed as the second argument to chunk_by_sentence by the
# parametrized tests in this module.
# NOTE(review): None presumably means "no length limit" -- confirm against
# chunk_by_sentence; the chunk-length test filters None out before comparing.
maximum_length_vals = [None, 8, 64]
@pytest.mark.parametrize(
    "input_text,maximum_length",
    [
        (text, limit)
        for text in INPUT_TEXTS.values()
        for limit in maximum_length_vals
    ],
)
def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
    """Check that chunking loses no text.

    Joining the second element of every chunk yielded by
    ``chunk_by_sentence`` must give back ``input_text`` exactly, for every
    combination of sample text and ``maximum_length`` value.
    """
    pieces = []
    for chunk in chunk_by_sentence(input_text, maximum_length):
        # Index 1 of each chunk is the text fragment that gets re-joined.
        pieces.append(chunk[1])
    reconstructed_text = "".join(pieces)

    assert reconstructed_text == input_text, (
        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
    )
@pytest.mark.parametrize(
    "input_text,maximum_length",
    [
        (text, limit)
        for text in INPUT_TEXTS.values()
        for limit in maximum_length_vals
        if limit is not None
    ],
)
def test_paragraph_chunk_length(input_text, maximum_length):
    """Check that no chunk exceeds the requested length.

    Each chunk's text (index 1) is split with ``chunk_by_word`` and the
    number of items it yields is compared against ``maximum_length``.
    Only non-``None`` limits are exercised.
    """
    # Materialize all chunks first, then measure each one.
    sentence_chunks = list(chunk_by_sentence(input_text, maximum_length))

    counts = []
    for chunk in sentence_chunks:
        counts.append(len(list(chunk_by_word(chunk[1]))))
    chunk_lengths = np.array(counts)

    # Collected up front so the failure message can list the offenders.
    oversized = chunk_lengths[chunk_lengths > maximum_length]
    assert np.all(chunk_lengths <= maximum_length), (
        f"{maximum_length = }: {oversized} are too large"
    )