Autoformat chunking tests
This commit is contained in:
parent
ce498d97dd
commit
92a66dddb9
4 changed files with 58 additions and 39 deletions
|
|
@ -1,17 +1,22 @@
|
||||||
import pytest
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
from cognee.tasks.chunks import chunk_by_paragraph
|
from cognee.tasks.chunks import chunk_by_paragraph
|
||||||
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS["english_text"],
        INPUT_TEXTS["english_lists"],
        INPUT_TEXTS["python_code"],
        INPUT_TEXTS["chinese_text"],
    ],
)
def test_chunk_by_paragraph_isomorphism(input_text):
    """Joining the ``text`` field of every chunk must reproduce the input verbatim.

    This pins the lossless ("isomorphic") property of chunk_by_paragraph:
    chunking may split the text anywhere, but must never drop or alter bytes.
    """
    chunks = chunk_by_paragraph(input_text)
    # chunk_by_paragraph yields dict-like chunks; "text" holds the raw slice.
    reconstructed_text = "".join([chunk["text"] for chunk in chunks])
    assert (
        reconstructed_text == input_text
    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
|
@ -1,17 +1,22 @@
|
||||||
import pytest
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
from cognee.tasks.chunks import chunk_by_sentence
|
from cognee.tasks.chunks import chunk_by_sentence
|
||||||
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS["english_text"],
        INPUT_TEXTS["english_lists"],
        INPUT_TEXTS["python_code"],
        INPUT_TEXTS["chinese_text"],
    ],
)
def test_chunk_by_sentence_isomorphism(input_text):
    """Joining every chunk's text must reproduce the input verbatim.

    chunk_by_sentence yields tuples; index 2 holds the raw text slice
    (positional, unlike chunk_by_paragraph's dict chunks).
    """
    chunks = chunk_by_sentence(input_text)
    reconstructed_text = "".join([chunk[2] for chunk in chunks])
    assert (
        reconstructed_text == input_text
    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
|
@ -1,31 +1,40 @@
|
||||||
import pytest
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
from cognee.tasks.chunks import chunk_by_word
|
from cognee.tasks.chunks import chunk_by_word
|
||||||
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS["english_text"],
        INPUT_TEXTS["english_lists"],
        INPUT_TEXTS["python_code"],
        INPUT_TEXTS["chinese_text"],
    ],
)
def test_chunk_by_word_isomorphism(input_text):
    """Joining every chunk's text must reproduce the input verbatim.

    chunk_by_word yields tuples; index 0 holds the raw text slice.
    """
    chunks = chunk_by_word(input_text)
    reconstructed_text = "".join([chunk[0] for chunk in chunks])
    assert (
        reconstructed_text == input_text
    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
||||||
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS["english_text"],
        INPUT_TEXTS["english_lists"],
        INPUT_TEXTS["python_code"],
        INPUT_TEXTS["chinese_text"],
    ],
)
def test_chunk_by_word_splits(input_text):
    """No chunk produced by chunk_by_word may contain an interior space.

    Leading/trailing whitespace is stripped before the check, so only spaces
    *inside* a word-chunk count as a violation.
    """
    chunks = np.array(list(chunk_by_word(input_text)))
    space_test = np.array([" " not in chunk[0].strip() for chunk in chunks])
    # ~space_test (not `== False`, flagged by E712) selects the offenders
    # for the failure message.
    assert np.all(
        space_test
    ), f"These chunks contain spaces within them: {chunks[~space_test]}"
|
@ -271,5 +271,5 @@ What feign'd submission swore: ease would recant
|
||||||
Vows made in pain, as violent and void.
|
Vows made in pain, as violent and void.
|
||||||
For never can true reconcilement grow
|
For never can true reconcilement grow
|
||||||
Where wounds of deadly hate have peirc'd so deep:
|
Where wounds of deadly hate have peirc'd so deep:
|
||||||
Which would but lead me to a worse relapse [ 100 ]"""
|
Which would but lead me to a worse relapse [ 100 ]""",
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue