Adapt chunk_by_paragraph test parametrization

This commit is contained in:
Leon Luithlen 2024-11-13 13:33:08 +01:00
parent 92a66dddb9
commit ef7a19043d

View file

@@ -6,16 +6,36 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
@pytest.mark.parametrize( @pytest.mark.parametrize(
"input_text", "input_text,paragraph_length,batch_paragraphs",
[ [
INPUT_TEXTS["english_text"], (INPUT_TEXTS["english_text"], 64, True),
INPUT_TEXTS["english_lists"], (INPUT_TEXTS["english_text"], 64, False),
INPUT_TEXTS["python_code"], (INPUT_TEXTS["english_text"], 256, True),
INPUT_TEXTS["chinese_text"], (INPUT_TEXTS["english_text"], 256, False),
(INPUT_TEXTS["english_text"], 1024, True),
(INPUT_TEXTS["english_text"], 1024, False),
(INPUT_TEXTS["english_lists"], 64, True),
(INPUT_TEXTS["english_lists"], 64, False),
(INPUT_TEXTS["english_lists"], 256, True),
(INPUT_TEXTS["english_lists"], 256, False),
(INPUT_TEXTS["english_lists"], 1024, True),
(INPUT_TEXTS["english_lists"], 1024, False),
(INPUT_TEXTS["python_code"], 64, True),
(INPUT_TEXTS["python_code"], 64, False),
(INPUT_TEXTS["python_code"], 256, True),
(INPUT_TEXTS["python_code"], 256, False),
(INPUT_TEXTS["python_code"], 1024, True),
(INPUT_TEXTS["python_code"], 1024, False),
(INPUT_TEXTS["chinese_text"], 64, True),
(INPUT_TEXTS["chinese_text"], 64, False),
(INPUT_TEXTS["chinese_text"], 256, True),
(INPUT_TEXTS["chinese_text"], 256, False),
(INPUT_TEXTS["chinese_text"], 1024, True),
(INPUT_TEXTS["chinese_text"], 1024, False),
], ],
) )
def test_chunk_by_paragraph_isomorphism(input_text): def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
chunks = chunk_by_paragraph(input_text) chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
reconstructed_text = "".join([chunk["text"] for chunk in chunks]) reconstructed_text = "".join([chunk["text"] for chunk in chunks])
assert ( assert (
reconstructed_text == input_text reconstructed_text == input_text