cognee/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
2025-01-17 10:40:24 +01:00

71 lines
2.3 KiB
Python

from cognee.tasks.chunks import chunk_by_paragraph
GROUND_TRUTH = {
"whole_text": [
{
"text": "This is example text. It contains multiple sentences.",
"word_count": 8,
"cut_type": "paragraph_end",
},
{
"text": "\nThis is a second paragraph. First two paragraphs are whole.",
"word_count": 10,
"cut_type": "paragraph_end",
},
{
"text": "\nThird paragraph is a bit longer and is finished with a dot.",
"word_count": 12,
"cut_type": "sentence_end",
},
],
"cut_text": [
{
"text": "This is example text. It contains multiple sentences.",
"word_count": 8,
"cut_type": "paragraph_end",
},
{
"text": "\nThis is a second paragraph. First two paragraphs are whole.",
"word_count": 10,
"cut_type": "paragraph_end",
},
{
"text": "\nThird paragraph is cut and is missing the dot at the end",
"word_count": 12,
"cut_type": "sentence_cut",
},
],
}
INPUT_TEXT = {
"whole_text": """This is example text. It contains multiple sentences.
This is a second paragraph. First two paragraphs are whole.
Third paragraph is a bit longer and is finished with a dot.""",
"cut_text": """This is example text. It contains multiple sentences.
This is a second paragraph. First two paragraphs are whole.
Third paragraph is cut and is missing the dot at the end""",
}
def run_chunking_test(test_text, expected_chunks):
chunks = []
for chunk_data in chunk_by_paragraph(
data=test_text, paragraph_length=12, batch_paragraphs=False
):
chunks.append(chunk_data)
assert len(chunks) == 3
for expected_chunks_item, chunk in zip(expected_chunks, chunks):
for key in ["text", "word_count", "cut_type"]:
assert chunk[key] == expected_chunks_item[key], (
f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
)
def test_chunking_whole_text():
run_chunking_test(INPUT_TEXT["whole_text"], GROUND_TRUTH["whole_text"])
def test_chunking_cut_text():
run_chunking_test(INPUT_TEXT["cut_text"], GROUND_TRUTH["cut_text"])