Adapt chunk_by_paragraph test parametrization

This commit is contained in:
Leon Luithlen 2024-11-13 13:33:08 +01:00
parent 92a66dddb9
commit ef7a19043d

View file

@@ -6,16 +6,36 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
@pytest.mark.parametrize(
"input_text",
"input_text,paragraph_length,batch_paragraphs",
[
INPUT_TEXTS["english_text"],
INPUT_TEXTS["english_lists"],
INPUT_TEXTS["python_code"],
INPUT_TEXTS["chinese_text"],
(INPUT_TEXTS["english_text"], 64, True),
(INPUT_TEXTS["english_text"], 64, False),
(INPUT_TEXTS["english_text"], 256, True),
(INPUT_TEXTS["english_text"], 256, False),
(INPUT_TEXTS["english_text"], 1024, True),
(INPUT_TEXTS["english_text"], 1024, False),
(INPUT_TEXTS["english_lists"], 64, True),
(INPUT_TEXTS["english_lists"], 64, False),
(INPUT_TEXTS["english_lists"], 256, True),
(INPUT_TEXTS["english_lists"], 256, False),
(INPUT_TEXTS["english_lists"], 1024, True),
(INPUT_TEXTS["english_lists"], 1024, False),
(INPUT_TEXTS["python_code"], 64, True),
(INPUT_TEXTS["python_code"], 64, False),
(INPUT_TEXTS["python_code"], 256, True),
(INPUT_TEXTS["python_code"], 256, False),
(INPUT_TEXTS["python_code"], 1024, True),
(INPUT_TEXTS["python_code"], 1024, False),
(INPUT_TEXTS["chinese_text"], 64, True),
(INPUT_TEXTS["chinese_text"], 64, False),
(INPUT_TEXTS["chinese_text"], 256, True),
(INPUT_TEXTS["chinese_text"], 256, False),
(INPUT_TEXTS["chinese_text"], 1024, True),
(INPUT_TEXTS["chinese_text"], 1024, False),
],
)
def test_chunk_by_paragraph_isomorphism(input_text):
chunks = chunk_by_paragraph(input_text)
def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
reconstructed_text = "".join([chunk["text"] for chunk in chunks])
assert (
reconstructed_text == input_text