cognee/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
2024-11-13 15:35:02 +01:00

17 lines
616 B
Python

import pytest
import numpy as np
from cognee.tasks.chunks import chunk_by_sentence
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
@pytest.mark.parametrize("input_text", [
INPUT_TEXTS["english_text"],
INPUT_TEXTS["english_lists"],
INPUT_TEXTS["python_code"],
INPUT_TEXTS["chinese_text"]
])
def test_chunk_by_sentence_isomorphism(input_text):
chunks = chunk_by_sentence(input_text)
reconstructed_text = "".join([chunk[2] for chunk in chunks])
assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"