diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 2bf1bf8ee..9159922af 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -15,7 +15,7 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): sentence += word word_count += 1 - if word_type == "paragraph_end" or word_type == "sentence_end" or ((maximum_length is not None) and (word_count == maximum_length)): + if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)): yield (paragraph_id, chunk_index, sentence, word_count, word_type) sentence = "" word_count = 0 diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py index ef75094c4..ad09c9671 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py @@ -1,38 +1,18 @@ +from itertools import product + import numpy as np import pytest from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS +paragraph_lengths = [64, 256, 1024] +batch_paragraphs_vals = [True, False] + @pytest.mark.parametrize( "input_text,paragraph_length,batch_paragraphs", - [ - (INPUT_TEXTS["english_text"], 64, True), - (INPUT_TEXTS["english_text"], 64, False), - (INPUT_TEXTS["english_text"], 256, True), - (INPUT_TEXTS["english_text"], 256, False), - (INPUT_TEXTS["english_text"], 1024, True), - (INPUT_TEXTS["english_text"], 1024, False), - (INPUT_TEXTS["english_lists"], 64, True), - (INPUT_TEXTS["english_lists"], 64, False), - (INPUT_TEXTS["english_lists"], 256, True), - (INPUT_TEXTS["english_lists"], 256, False), - (INPUT_TEXTS["english_lists"], 1024, True), - (INPUT_TEXTS["english_lists"], 1024, False), - (INPUT_TEXTS["python_code"], 64, True), - (INPUT_TEXTS["python_code"], 64, False), - (INPUT_TEXTS["python_code"], 256, True), - (INPUT_TEXTS["python_code"], 256, False), - (INPUT_TEXTS["python_code"], 1024, True), - (INPUT_TEXTS["python_code"], 1024, False), - (INPUT_TEXTS["chinese_text"], 64, True), - (INPUT_TEXTS["chinese_text"], 64, False), - (INPUT_TEXTS["chinese_text"], 256, True), - (INPUT_TEXTS["chinese_text"], 256, False), - (INPUT_TEXTS["chinese_text"], 1024, True), - (INPUT_TEXTS["chinese_text"], 1024, False), - ], + list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)), ) def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs): chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) @@ -44,32 +24,7 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para @pytest.mark.parametrize( "input_text,paragraph_length,batch_paragraphs", - [ - (INPUT_TEXTS["english_text"], 64, True), - (INPUT_TEXTS["english_text"], 64, False), - (INPUT_TEXTS["english_text"], 256, True), - (INPUT_TEXTS["english_text"], 256, False), - (INPUT_TEXTS["english_text"], 1024, True), - (INPUT_TEXTS["english_text"], 1024, False), - (INPUT_TEXTS["english_lists"], 64, True), - (INPUT_TEXTS["english_lists"], 64, False), - (INPUT_TEXTS["english_lists"], 256, True), - (INPUT_TEXTS["english_lists"], 256, False), - (INPUT_TEXTS["english_lists"], 1024, True), - (INPUT_TEXTS["english_lists"], 1024, False), - (INPUT_TEXTS["python_code"], 64, True), - (INPUT_TEXTS["python_code"], 64, False), - (INPUT_TEXTS["python_code"], 256, True), - (INPUT_TEXTS["python_code"], 256, False), - (INPUT_TEXTS["python_code"], 1024, True), - (INPUT_TEXTS["python_code"], 1024, False), - (INPUT_TEXTS["chinese_text"], 64, True), - (INPUT_TEXTS["chinese_text"], 64, False), - (INPUT_TEXTS["chinese_text"], 256, True), - (INPUT_TEXTS["chinese_text"], 256, False), - (INPUT_TEXTS["chinese_text"], 1024, True), - (INPUT_TEXTS["chinese_text"], 1024, False), - ], + list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)), ) def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): chunks = list(chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index 45af2ed39..2f42f836a 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -1,26 +1,17 @@ +from itertools import product + import numpy as np import pytest from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS +maximum_length_vals = [None, 8, 64] + @pytest.mark.parametrize( "input_text,maximum_length", - [ - (INPUT_TEXTS["english_text"], None), - (INPUT_TEXTS["english_text"], 8), - (INPUT_TEXTS["english_text"], 64), - (INPUT_TEXTS["english_lists"], None), - (INPUT_TEXTS["english_lists"], 8), - (INPUT_TEXTS["english_lists"], 64), - (INPUT_TEXTS["python_code"], None), - (INPUT_TEXTS["python_code"], 8), - (INPUT_TEXTS["python_code"], 64), - (INPUT_TEXTS["chinese_text"], None), - (INPUT_TEXTS["chinese_text"], 8), - (INPUT_TEXTS["chinese_text"], 64), - ], + list(product(list(INPUT_TEXTS.values()), maximum_length_vals)), ) def test_chunk_by_sentence_isomorphism(input_text, maximum_length): chunks = chunk_by_sentence(input_text, maximum_length) @@ -32,16 +23,12 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length): @pytest.mark.parametrize( "input_text,maximum_length", - [ - (INPUT_TEXTS["english_text"], 8), - (INPUT_TEXTS["english_text"], 64), - (INPUT_TEXTS["english_lists"], 8), - (INPUT_TEXTS["english_lists"], 64), - (INPUT_TEXTS["python_code"], 8), - (INPUT_TEXTS["python_code"], 64), - (INPUT_TEXTS["chinese_text"], 8), - (INPUT_TEXTS["chinese_text"], 64), - ], + list( + product( + list(INPUT_TEXTS.values()), + [val for val in maximum_length_vals if val is not None], + ) + ), ) def test_paragraph_chunk_length(input_text, maximum_length): chunks = list(chunk_by_sentence(input_text, maximum_length)) diff --git a/cognee/tests/unit/processing/chunks/test_input.py b/cognee/tests/unit/processing/chunks/test_input.py index b7a57b75a..820bf2d2d 100644 --- a/cognee/tests/unit/processing/chunks/test_input.py +++ b/cognee/tests/unit/processing/chunks/test_input.py @@ -272,4 +272,12 @@ Vows made in pain, as violent and void. For never can true reconcilement grow Where wounds of deadly hate have peirc'd so deep: Which would but lead me to a worse relapse [ 100 ]""", + "empty": "", + "single_char": "x", + "whitespace": " \n\t \r\n ", + "unicode_special": "Hello 👋 مرحبا שָׁלוֹם", + "mixed_endings": "line1\r\nline2\nline3\r\nline4", + "many_newlines": "\n\n\n\ntext\n\n\n\n", + "html_mixed": "

Hello

\nPlain text\n
World
", + "urls_emails": "Visit https://example.com or email user@example.com", }