Add more adversarial examples

2024-11-13 16:23:14 +01:00 · 2024-11-13 16:23:14 +01:00 · b787407db7
commit b787407db7
parent fdec9a692e
4 changed files with 27 additions and 77 deletions
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@@ -15,7 +15,7 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
        sentence += word
        word_count += 1

-        if word_type == "paragraph_end" or word_type == "sentence_end" or ((maximum_length is not None) and (word_count == maximum_length)):
+        if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)):
            yield (paragraph_id, chunk_index, sentence, word_count, word_type)
            sentence = ""
            word_count = 0
--- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
@@ -1,38 +1,18 @@
+from itertools import product
+
 import numpy as np
 import pytest

 from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word
 from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS

+paragraph_lengths = [64, 256, 1024]
+batch_paragraphs_vals = [True, False]
+

@pytest.mark.parametrize(
    "input_text,paragraph_length,batch_paragraphs",
-    [
-        (INPUT_TEXTS["english_text"], 64, True),
-        (INPUT_TEXTS["english_text"], 64, False),
-        (INPUT_TEXTS["english_text"], 256, True),
-        (INPUT_TEXTS["english_text"], 256, False),
-        (INPUT_TEXTS["english_text"], 1024, True),
-        (INPUT_TEXTS["english_text"], 1024, False),
-        (INPUT_TEXTS["english_lists"], 64, True),
-        (INPUT_TEXTS["english_lists"], 64, False),
-        (INPUT_TEXTS["english_lists"], 256, True),
-        (INPUT_TEXTS["english_lists"], 256, False),
-        (INPUT_TEXTS["english_lists"], 1024, True),
-        (INPUT_TEXTS["english_lists"], 1024, False),
-        (INPUT_TEXTS["python_code"], 64, True),
-        (INPUT_TEXTS["python_code"], 64, False),
-        (INPUT_TEXTS["python_code"], 256, True),
-        (INPUT_TEXTS["python_code"], 256, False),
-        (INPUT_TEXTS["python_code"], 1024, True),
-        (INPUT_TEXTS["python_code"], 1024, False),
-        (INPUT_TEXTS["chinese_text"], 64, True),
-        (INPUT_TEXTS["chinese_text"], 64, False),
-        (INPUT_TEXTS["chinese_text"], 256, True),
-        (INPUT_TEXTS["chinese_text"], 256, False),
-        (INPUT_TEXTS["chinese_text"], 1024, True),
-        (INPUT_TEXTS["chinese_text"], 1024, False),
-    ],
+    list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
 )
 def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
    chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
@@ -44,32 +24,7 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para

@pytest.mark.parametrize(
    "input_text,paragraph_length,batch_paragraphs",
-    [
-        (INPUT_TEXTS["english_text"], 64, True),
-        (INPUT_TEXTS["english_text"], 64, False),
-        (INPUT_TEXTS["english_text"], 256, True),
-        (INPUT_TEXTS["english_text"], 256, False),
-        (INPUT_TEXTS["english_text"], 1024, True),
-        (INPUT_TEXTS["english_text"], 1024, False),
-        (INPUT_TEXTS["english_lists"], 64, True),
-        (INPUT_TEXTS["english_lists"], 64, False),
-        (INPUT_TEXTS["english_lists"], 256, True),
-        (INPUT_TEXTS["english_lists"], 256, False),
-        (INPUT_TEXTS["english_lists"], 1024, True),
-        (INPUT_TEXTS["english_lists"], 1024, False),
-        (INPUT_TEXTS["python_code"], 64, True),
-        (INPUT_TEXTS["python_code"], 64, False),
-        (INPUT_TEXTS["python_code"], 256, True),
-        (INPUT_TEXTS["python_code"], 256, False),
-        (INPUT_TEXTS["python_code"], 1024, True),
-        (INPUT_TEXTS["python_code"], 1024, False),
-        (INPUT_TEXTS["chinese_text"], 64, True),
-        (INPUT_TEXTS["chinese_text"], 64, False),
-        (INPUT_TEXTS["chinese_text"], 256, True),
-        (INPUT_TEXTS["chinese_text"], 256, False),
-        (INPUT_TEXTS["chinese_text"], 1024, True),
-        (INPUT_TEXTS["chinese_text"], 1024, False),
-    ],
+    list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
 )
 def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
    chunks = list(chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs))
--- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
@@ -1,26 +1,17 @@
+from itertools import product
+
 import numpy as np
 import pytest

 from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word
 from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS

+maximum_length_vals = [None, 8, 64]
+

@pytest.mark.parametrize(
    "input_text,maximum_length",
-    [
-        (INPUT_TEXTS["english_text"], None),
-        (INPUT_TEXTS["english_text"], 8),
-        (INPUT_TEXTS["english_text"], 64),
-        (INPUT_TEXTS["english_lists"], None),
-        (INPUT_TEXTS["english_lists"], 8),
-        (INPUT_TEXTS["english_lists"], 64),
-        (INPUT_TEXTS["python_code"], None),
-        (INPUT_TEXTS["python_code"], 8),
-        (INPUT_TEXTS["python_code"], 64),
-        (INPUT_TEXTS["chinese_text"], None),
-        (INPUT_TEXTS["chinese_text"], 8),
-        (INPUT_TEXTS["chinese_text"], 64),
-    ],
+    list(product(list(INPUT_TEXTS.values()), maximum_length_vals)),
 )
 def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
    chunks = chunk_by_sentence(input_text, maximum_length)
@@ -32,16 +23,12 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length):

@pytest.mark.parametrize(
    "input_text,maximum_length",
-    [
-        (INPUT_TEXTS["english_text"], 8),
-        (INPUT_TEXTS["english_text"], 64),
-        (INPUT_TEXTS["english_lists"], 8),
-        (INPUT_TEXTS["english_lists"], 64),
-        (INPUT_TEXTS["python_code"], 8),
-        (INPUT_TEXTS["python_code"], 64),
-        (INPUT_TEXTS["chinese_text"], 8),
-        (INPUT_TEXTS["chinese_text"], 64),
-    ],
+    list(
+        product(
+            list(INPUT_TEXTS.values()),
+            [val for val in maximum_length_vals if val is not None],
+        )
+    ),
 )
 def test_paragraph_chunk_length(input_text, maximum_length):
    chunks = list(chunk_by_sentence(input_text, maximum_length))
--- a/cognee/tests/unit/processing/chunks/test_input.py
+++ b/cognee/tests/unit/processing/chunks/test_input.py
@@ -272,4 +272,12 @@ Vows made in pain, as violent and void.
 For never can true reconcilement grow
 Where wounds of deadly hate have peirc'd so deep:
 Which would but lead me to a worse relapse [ 100 ]""",
+    "empty": "",
+    "single_char": "x",
+    "whitespace": "   \n\t   \r\n   ",
+    "unicode_special": "Hello 👋 مرحبا שָׁלוֹם",
+    "mixed_endings": "line1\r\nline2\nline3\r\nline4",
+    "many_newlines": "\n\n\n\ntext\n\n\n\n",
+    "html_mixed": "<p>Hello</p>\nPlain text\n<div>World</div>",
+    "urls_emails": "Visit https://example.com or email user@example.com",
 }