Autoformat chunking tests

This commit is contained in:
Leon Luithlen 2024-11-13 12:13:12 +01:00
parent ce498d97dd
commit 92a66dddb9
4 changed files with 58 additions and 39 deletions

View file

@@ -1,17 +1,22 @@
import pytest
import numpy as np import numpy as np
import pytest
from cognee.tasks.chunks import chunk_by_paragraph from cognee.tasks.chunks import chunk_by_paragraph
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS["english_text"],
        INPUT_TEXTS["english_lists"],
        INPUT_TEXTS["python_code"],
        INPUT_TEXTS["chinese_text"],
    ],
)
def test_chunk_by_paragraph_isomorphism(input_text):
    """Chunking must be lossless: concatenating every chunk's "text" field
    reproduces the original input exactly."""
    reconstructed_text = "".join(
        chunk["text"] for chunk in chunk_by_paragraph(input_text)
    )
    assert (
        reconstructed_text == input_text
    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"

View file

@@ -1,17 +1,22 @@
import pytest
import numpy as np import numpy as np
import pytest
from cognee.tasks.chunks import chunk_by_sentence from cognee.tasks.chunks import chunk_by_sentence
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS["english_text"],
        INPUT_TEXTS["english_lists"],
        INPUT_TEXTS["python_code"],
        INPUT_TEXTS["chinese_text"],
    ],
)
def test_chunk_by_sentence_isomorphism(input_text):
    """Chunking must be lossless: joining the sentence text (third element of
    each chunk tuple) reproduces the original input exactly."""
    reconstructed_text = "".join(
        chunk[2] for chunk in chunk_by_sentence(input_text)
    )
    assert (
        reconstructed_text == input_text
    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"

View file

@@ -1,31 +1,40 @@
import pytest
import numpy as np import numpy as np
import pytest
from cognee.tasks.chunks import chunk_by_word from cognee.tasks.chunks import chunk_by_word
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS["english_text"],
        INPUT_TEXTS["english_lists"],
        INPUT_TEXTS["python_code"],
        INPUT_TEXTS["chinese_text"],
    ],
)
def test_chunk_by_word_isomorphism(input_text):
    """Chunking must be lossless: joining the word text (first element of each
    chunk tuple) reproduces the original input exactly."""
    reconstructed_text = "".join(
        chunk[0] for chunk in chunk_by_word(input_text)
    )
    assert (
        reconstructed_text == input_text
    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS["english_text"],
        INPUT_TEXTS["english_lists"],
        INPUT_TEXTS["python_code"],
        INPUT_TEXTS["chinese_text"],
    ],
)
def test_chunk_by_word_splits(input_text):
    """Every chunk produced by chunk_by_word must be a single word: no chunk's
    text (first tuple element) may contain an interior space once stripped."""
    chunks = np.array(list(chunk_by_word(input_text)))
    space_test = np.array([" " not in chunk[0].strip() for chunk in chunks])
    # Use boolean inversion (~) rather than `space_test == False` (E712) to
    # mask the failing chunks for the error message.
    assert np.all(
        space_test
    ), f"These chunks contain spaces within them: {chunks[~space_test]}"

View file

@@ -271,5 +271,5 @@ What feign'd submission swore: ease would recant
Vows made in pain, as violent and void. Vows made in pain, as violent and void.
For never can true reconcilement grow For never can true reconcilement grow
Where wounds of deadly hate have peirc'd so deep: Where wounds of deadly hate have peirc'd so deep:
Which would but lead me to a worse relapse [ 100 ]""" Which would but lead me to a worse relapse [ 100 ]""",
} }