Add more adversarial examples

parent fdec9a692e
commit b787407db7

4 changed files with 27 additions and 77 deletions
@@ -15,7 +15,7 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
         sentence += word
         word_count += 1
 
-        if word_type == "paragraph_end" or word_type == "sentence_end" or ((maximum_length is not None) and (word_count == maximum_length)):
+        if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)):
             yield (paragraph_id, chunk_index, sentence, word_count, word_type)
             sentence = ""
             word_count = 0
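Side note on the changed condition: a minimal standalone sketch of the new boundary check, assuming chunk_by_word yields (word, word_type) pairs. The emit_points helper and the demo data are made up for illustration; only the condition itself is taken from the hunk above.

# Sketch of the new cut condition: "maximum_length and ..." short-circuits when
# maximum_length is None, so only sentence/paragraph boundaries end a chunk.
def emit_points(stream, maximum_length=None):
    word_count = 0
    for word, word_type in stream:
        word_count += 1
        if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)):
            yield (word_count, word_type)
            word_count = 0

demo = [("Hello", "word"), ("world.", "sentence_end"), ("Bye.", "paragraph_end")]
print(list(emit_points(demo)))                    # [(2, 'sentence_end'), (1, 'paragraph_end')]
print(list(emit_points(demo, maximum_length=1)))  # [(1, 'word'), (1, 'sentence_end'), (1, 'paragraph_end')]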
@@ -1,38 +1,18 @@
+from itertools import product
+
 import numpy as np
 import pytest
 
 from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word
 from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
 
+paragraph_lengths = [64, 256, 1024]
+batch_paragraphs_vals = [True, False]
+
 
 @pytest.mark.parametrize(
     "input_text,paragraph_length,batch_paragraphs",
-    [
-        (INPUT_TEXTS["english_text"], 64, True),
-        (INPUT_TEXTS["english_text"], 64, False),
-        (INPUT_TEXTS["english_text"], 256, True),
-        (INPUT_TEXTS["english_text"], 256, False),
-        (INPUT_TEXTS["english_text"], 1024, True),
-        (INPUT_TEXTS["english_text"], 1024, False),
-        (INPUT_TEXTS["english_lists"], 64, True),
-        (INPUT_TEXTS["english_lists"], 64, False),
-        (INPUT_TEXTS["english_lists"], 256, True),
-        (INPUT_TEXTS["english_lists"], 256, False),
-        (INPUT_TEXTS["english_lists"], 1024, True),
-        (INPUT_TEXTS["english_lists"], 1024, False),
-        (INPUT_TEXTS["python_code"], 64, True),
-        (INPUT_TEXTS["python_code"], 64, False),
-        (INPUT_TEXTS["python_code"], 256, True),
-        (INPUT_TEXTS["python_code"], 256, False),
-        (INPUT_TEXTS["python_code"], 1024, True),
-        (INPUT_TEXTS["python_code"], 1024, False),
-        (INPUT_TEXTS["chinese_text"], 64, True),
-        (INPUT_TEXTS["chinese_text"], 64, False),
-        (INPUT_TEXTS["chinese_text"], 256, True),
-        (INPUT_TEXTS["chinese_text"], 256, False),
-        (INPUT_TEXTS["chinese_text"], 1024, True),
-        (INPUT_TEXTS["chinese_text"], 1024, False),
-    ],
+    list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
 )
 def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
     chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
@@ -44,32 +24,7 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para
 
 @pytest.mark.parametrize(
     "input_text,paragraph_length,batch_paragraphs",
-    [
-        (INPUT_TEXTS["english_text"], 64, True),
-        (INPUT_TEXTS["english_text"], 64, False),
-        (INPUT_TEXTS["english_text"], 256, True),
-        (INPUT_TEXTS["english_text"], 256, False),
-        (INPUT_TEXTS["english_text"], 1024, True),
-        (INPUT_TEXTS["english_text"], 1024, False),
-        (INPUT_TEXTS["english_lists"], 64, True),
-        (INPUT_TEXTS["english_lists"], 64, False),
-        (INPUT_TEXTS["english_lists"], 256, True),
-        (INPUT_TEXTS["english_lists"], 256, False),
-        (INPUT_TEXTS["english_lists"], 1024, True),
-        (INPUT_TEXTS["english_lists"], 1024, False),
-        (INPUT_TEXTS["python_code"], 64, True),
-        (INPUT_TEXTS["python_code"], 64, False),
-        (INPUT_TEXTS["python_code"], 256, True),
-        (INPUT_TEXTS["python_code"], 256, False),
-        (INPUT_TEXTS["python_code"], 1024, True),
-        (INPUT_TEXTS["python_code"], 1024, False),
-        (INPUT_TEXTS["chinese_text"], 64, True),
-        (INPUT_TEXTS["chinese_text"], 64, False),
-        (INPUT_TEXTS["chinese_text"], 256, True),
-        (INPUT_TEXTS["chinese_text"], 256, False),
-        (INPUT_TEXTS["chinese_text"], 1024, True),
-        (INPUT_TEXTS["chinese_text"], 1024, False),
-    ],
+    list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
 )
 def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
     chunks = list(chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs))
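For comparison, a rough sketch of what the new parametrization expands to; the INPUT_TEXTS stand-in below is trimmed to key names only and is not the real test data.

from itertools import product

# Trimmed stand-in: the real INPUT_TEXTS maps these keys (plus the new
# adversarial ones) to full sample documents.
INPUT_TEXTS = {"english_text": "...", "english_lists": "...", "python_code": "...", "chinese_text": "...", "empty": ""}
paragraph_lengths = [64, 256, 1024]
batch_paragraphs_vals = [True, False]

cases = list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals))
print(len(cases))  # 5 * 3 * 2 = 30; the removed hand-written list covered only 4 * 3 * 2 = 24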
@@ -1,26 +1,17 @@
+from itertools import product
+
 import numpy as np
 import pytest
 
 from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word
 from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
 
+maximum_length_vals = [None, 8, 64]
+
 
 @pytest.mark.parametrize(
     "input_text,maximum_length",
-    [
-        (INPUT_TEXTS["english_text"], None),
-        (INPUT_TEXTS["english_text"], 8),
-        (INPUT_TEXTS["english_text"], 64),
-        (INPUT_TEXTS["english_lists"], None),
-        (INPUT_TEXTS["english_lists"], 8),
-        (INPUT_TEXTS["english_lists"], 64),
-        (INPUT_TEXTS["python_code"], None),
-        (INPUT_TEXTS["python_code"], 8),
-        (INPUT_TEXTS["python_code"], 64),
-        (INPUT_TEXTS["chinese_text"], None),
-        (INPUT_TEXTS["chinese_text"], 8),
-        (INPUT_TEXTS["chinese_text"], 64),
-    ],
+    list(product(list(INPUT_TEXTS.values()), maximum_length_vals)),
 )
 def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
     chunks = chunk_by_sentence(input_text, maximum_length)
@@ -32,16 +23,12 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
 
 @pytest.mark.parametrize(
     "input_text,maximum_length",
-    [
-        (INPUT_TEXTS["english_text"], 8),
-        (INPUT_TEXTS["english_text"], 64),
-        (INPUT_TEXTS["english_lists"], 8),
-        (INPUT_TEXTS["english_lists"], 64),
-        (INPUT_TEXTS["python_code"], 8),
-        (INPUT_TEXTS["python_code"], 64),
-        (INPUT_TEXTS["chinese_text"], 8),
-        (INPUT_TEXTS["chinese_text"], 64),
-    ],
+    list(
+        product(
+            list(INPUT_TEXTS.values()),
+            [val for val in maximum_length_vals if val is not None],
+        )
+    ),
 )
 def test_paragraph_chunk_length(input_text, maximum_length):
     chunks = list(chunk_by_sentence(input_text, maximum_length))
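The comprehension in this second parametrization drops the unbounded case, so the chunk-length check only runs when a cap is set. A quick sketch of the resulting grid, using placeholder text keys:

from itertools import product

maximum_length_vals = [None, 8, 64]
texts = ["english_text", "english_lists", "python_code", "chinese_text"]  # placeholder keys

bounded = [val for val in maximum_length_vals if val is not None]
print(bounded)                             # [8, 64]
print(len(list(product(texts, bounded))))  # 8, the same size as the removed hand-written list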
@@ -272,4 +272,12 @@ Vows made in pain, as violent and void.
 For never can true reconcilement grow
 Where wounds of deadly hate have peirc'd so deep:
 Which would but lead me to a worse relapse [ 100 ]""",
+    "empty": "",
+    "single_char": "x",
+    "whitespace": " \n\t \r\n ",
+    "unicode_special": "Hello 👋 مرحبا שָׁלוֹם",
+    "mixed_endings": "line1\r\nline2\nline3\r\nline4",
+    "many_newlines": "\n\n\n\ntext\n\n\n\n",
+    "html_mixed": "<p>Hello</p>\nPlain text\n<div>World</div>",
+    "urls_emails": "Visit https://example.com or email user@example.com",
 }
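A hypothetical smoke run over a few of the new adversarial entries; chunk_by_sentence and its five-element yield come from the hunks above, but the join-and-compare check is only an assumption about what the isomorphism tests assert, not a quote of them.

# Assumed usage: feed adversarial strings through chunk_by_sentence and see
# whether the emitted sentences concatenate back to the original input.
from cognee.tasks.chunks import chunk_by_sentence

adversarial = ["", "x", " \n\t \r\n ", "line1\r\nline2\nline3\r\nline4", "<p>Hello</p>\nPlain text\n<div>World</div>"]

for text in adversarial:
    chunks = list(chunk_by_sentence(text, maximum_length=8))
    rebuilt = "".join(sentence for _, _, sentence, _, _ in chunks)
    print(repr(text), len(chunks), rebuilt == text)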