Adapt chunk_by_paragraph_test.py

This commit is contained in:
Leon Luithlen 2024-11-13 14:17:00 +01:00
parent f8e5b529c3
commit 1b4a7e4fdc

View file

@@ -3,13 +3,13 @@ from cognee.tasks.chunks import chunk_by_paragraph
GROUND_TRUTH = { GROUND_TRUTH = {
"whole_text": [ "whole_text": [
{ {
"text": "This is example text. It contains multiple sentences.", "text": "This is example text. It contains multiple sentences.\n",
"word_count": 8, "word_count": 9,
"cut_type": "paragraph_end", "cut_type": "paragraph_end",
}, },
{ {
"text": "This is a second paragraph. First two paragraphs are whole.", "text": "This is a second paragraph. First two paragraphs are whole.\n",
"word_count": 10, "word_count": 11,
"cut_type": "paragraph_end", "cut_type": "paragraph_end",
}, },
{ {
@@ -20,30 +20,30 @@ GROUND_TRUTH = {
], ],
"cut_text": [ "cut_text": [
{ {
"text": "This is example text. It contains multiple sentences.", "text": "This is example text. It contains multiple sentences.\n",
"word_count": 8, "word_count": 9,
"cut_type": "paragraph_end", "cut_type": "paragraph_end",
}, },
{ {
"text": "This is a second paragraph. First two paragraphs are whole.", "text": "This is a second paragraph. First two paragraphs are whole.\n",
"word_count": 10, "word_count": 11,
"cut_type": "paragraph_end", "cut_type": "paragraph_end",
}, },
{ {
"text": "Third paragraph is cut and is missing the dot at the end", "text": "Third paragraph is cut and is missing the dot at the end",
"word_count": 12, "word_count": 12,
"cut_type": "sentence_cut", "cut_type": "word",
}, },
], ],
} }
INPUT_TEXT = { INPUT_TEXT = {
"whole_text": """This is example text. It contains multiple sentences. "whole_text": """This is example text. It contains multiple sentences.
This is a second paragraph. First two paragraphs are whole. This is a second paragraph. First two paragraphs are whole.
Third paragraph is a bit longer and is finished with a dot.""", Third paragraph is a bit longer and is finished with a dot.""",
"cut_text": """This is example text. It contains multiple sentences. "cut_text": """This is example text. It contains multiple sentences.
This is a second paragraph. First two paragraphs are whole. This is a second paragraph. First two paragraphs are whole.
Third paragraph is cut and is missing the dot at the end""", Third paragraph is cut and is missing the dot at the end""",
} }