From a52d3ac6ba23835611575ac765f4b5757087625a Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 14:20:18 +0100 Subject: [PATCH] Change document test ground truth values for new chunk_by_word --- .../documents/AudioDocument_test.py | 6 +++--- .../documents/ImageDocument_test.py | 2 +- .../documents/TextDocument_test.py | 8 ++++---- .../chunks/chunk_by_paragraph_test.py | 20 +++++++++---------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index b20124456..49ddfc92c 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -4,9 +4,9 @@ from unittest.mock import patch from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument GROUND_TRUTH = [ - {"word_count": 60, "len_text": 318, "cut_type": "sentence_end"}, - {"word_count": 64, "len_text": 358, "cut_type": "sentence_end"}, - {"word_count": 56, "len_text": 255, "cut_type": "sentence_cut"}, + {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"}, + {"word_count": 58, "len_text": 358, "cut_type": "sentence_end"}, + {"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"}, ] TEST_TEXT = """ diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index d34127eb3..e9caf3634 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -5,7 +5,7 @@ from cognee.modules.data.processing.document_types.ImageDocument import ImageDoc GROUND_TRUTH = [ {"word_count": 51, "len_text": 298, "cut_type": "sentence_end"}, - {"word_count": 63, "len_text": 369, "cut_type": "sentence_end"}, + {"word_count": 62, "len_text": 369, "cut_type": "sentence_end"}, {"word_count": 44, "len_text": 294, "cut_type": "sentence_end"}, ] diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 1547baa46..9816f0529 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -7,12 +7,12 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum GROUND_TRUTH = { "code.txt": [ - {"word_count": 253, "len_text": 953, "cut_type": "paragraph_end"}, - {"word_count": 157, "len_text": 905, "cut_type": "paragraph_end"}, + {"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"}, + {"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"}, ], "Natural_language_processing.txt": [ - {"word_count": 115, "len_text": 839, "cut_type": "paragraph_end"}, - {"word_count": 15, "len_text": 146, "cut_type": "paragraph_end"}, + {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"}, + {"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"}, ], } diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index f8fe00237..5355411d5 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -3,34 +3,34 @@ from cognee.tasks.chunks import chunk_by_paragraph GROUND_TRUTH = { "whole_text": [ { - "text": "This is example text. It contains multiple sentences.\n", - "word_count": 9, + "text": "This is example text. It contains multiple sentences.", + "word_count": 8, "cut_type": "paragraph_end", }, { - "text": "This is a second paragraph. First two paragraphs are whole.\n", - "word_count": 11, + "text": "\nThis is a second paragraph. First two paragraphs are whole.", + "word_count": 10, "cut_type": "paragraph_end", }, { - "text": "Third paragraph is a bit longer and is finished with a dot.", + "text": "\nThird paragraph is a bit longer and is finished with a dot.", "word_count": 12, "cut_type": "sentence_end", }, ], "cut_text": [ { - "text": "This is example text. It contains multiple sentences.\n", - "word_count": 9, + "text": "This is example text. It contains multiple sentences.", + "word_count": 8, "cut_type": "paragraph_end", }, { - "text": "This is a second paragraph. First two paragraphs are whole.\n", - "word_count": 11, + "text": "\nThis is a second paragraph. First two paragraphs are whole.", + "word_count": 10, "cut_type": "paragraph_end", }, { - "text": "Third paragraph is cut and is missing the dot at the end", + "text": "\nThird paragraph is cut and is missing the dot at the end", "word_count": 12, "cut_type": "word", },