Change document test ground truth values for new chunk_by_word
This commit is contained in:
parent
eaf9167fa1
commit
a52d3ac6ba
4 changed files with 18 additions and 18 deletions
|
|
@@ -4,9 +4,9 @@ from unittest.mock import patch
|
|||
from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument
|
||||
|
||||
GROUND_TRUTH = [
|
||||
{"word_count": 60, "len_text": 318, "cut_type": "sentence_end"},
|
||||
{"word_count": 64, "len_text": 358, "cut_type": "sentence_end"},
|
||||
{"word_count": 56, "len_text": 255, "cut_type": "sentence_cut"},
|
||||
{"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
|
||||
{"word_count": 58, "len_text": 358, "cut_type": "sentence_end"},
|
||||
{"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"},
|
||||
]
|
||||
|
||||
TEST_TEXT = """
|
||||
|
|
|
|||
|
|
@@ -5,7 +5,7 @@ from cognee.modules.data.processing.document_types.ImageDocument import ImageDoc
|
|||
|
||||
GROUND_TRUTH = [
|
||||
{"word_count": 51, "len_text": 298, "cut_type": "sentence_end"},
|
||||
{"word_count": 63, "len_text": 369, "cut_type": "sentence_end"},
|
||||
{"word_count": 62, "len_text": 369, "cut_type": "sentence_end"},
|
||||
{"word_count": 44, "len_text": 294, "cut_type": "sentence_end"},
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@@ -7,12 +7,12 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum
|
|||
|
||||
GROUND_TRUTH = {
|
||||
"code.txt": [
|
||||
{"word_count": 253, "len_text": 953, "cut_type": "paragraph_end"},
|
||||
{"word_count": 157, "len_text": 905, "cut_type": "paragraph_end"},
|
||||
{"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"},
|
||||
{"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"},
|
||||
],
|
||||
"Natural_language_processing.txt": [
|
||||
{"word_count": 115, "len_text": 839, "cut_type": "paragraph_end"},
|
||||
{"word_count": 15, "len_text": 146, "cut_type": "paragraph_end"},
|
||||
{"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"},
|
||||
{"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"},
|
||||
],
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@@ -3,34 +3,34 @@ from cognee.tasks.chunks import chunk_by_paragraph
|
|||
GROUND_TRUTH = {
|
||||
"whole_text": [
|
||||
{
|
||||
"text": "This is example text. It contains multiple sentences.\n",
|
||||
"word_count": 9,
|
||||
"text": "This is example text. It contains multiple sentences.",
|
||||
"word_count": 8,
|
||||
"cut_type": "paragraph_end",
|
||||
},
|
||||
{
|
||||
"text": "This is a second paragraph. First two paragraphs are whole.\n",
|
||||
"word_count": 11,
|
||||
"text": "\nThis is a second paragraph. First two paragraphs are whole.",
|
||||
"word_count": 10,
|
||||
"cut_type": "paragraph_end",
|
||||
},
|
||||
{
|
||||
"text": "Third paragraph is a bit longer and is finished with a dot.",
|
||||
"text": "\nThird paragraph is a bit longer and is finished with a dot.",
|
||||
"word_count": 12,
|
||||
"cut_type": "sentence_end",
|
||||
},
|
||||
],
|
||||
"cut_text": [
|
||||
{
|
||||
"text": "This is example text. It contains multiple sentences.\n",
|
||||
"word_count": 9,
|
||||
"text": "This is example text. It contains multiple sentences.",
|
||||
"word_count": 8,
|
||||
"cut_type": "paragraph_end",
|
||||
},
|
||||
{
|
||||
"text": "This is a second paragraph. First two paragraphs are whole.\n",
|
||||
"word_count": 11,
|
||||
"text": "\nThis is a second paragraph. First two paragraphs are whole.",
|
||||
"word_count": 10,
|
||||
"cut_type": "paragraph_end",
|
||||
},
|
||||
{
|
||||
"text": "Third paragraph is cut and is missing the dot at the end",
|
||||
"text": "\nThird paragraph is cut and is missing the dot at the end",
|
||||
"word_count": 12,
|
||||
"cut_type": "word",
|
||||
},
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue