From 73f24f9e4db9a94522f3e133791f36a22d41b93a Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 14:40:42 +0100 Subject: [PATCH] Fix sentence_cut return value in inappropriate places --- cognee/tasks/chunks/chunk_by_paragraph.py | 1 - cognee/tasks/chunks/chunk_by_sentence.py | 16 ++++++++++++++-- .../integration/documents/AudioDocument_test.py | 7 +++---- .../integration/documents/TextDocument_test.py | 4 ++-- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index a85a2de26..cfe73471a 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -58,7 +58,6 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_index += 1 last_cut_type = end_type - last_paragraph_id = paragraph_id # Yield any remaining text if current_chunk: diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 9159922af..fedc0c9b2 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -10,12 +10,24 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): paragraph_id = uuid4() chunk_index = 0 word_count = 0 + section_end = False for (word, word_type) in chunk_by_word(data): sentence += word word_count += 1 - if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)): + # this loop is to check if any letters come after a paragraph_end or sentence_end + # and if that is not the case, preserve the word_type for the final yield in the + # function + if word_type in ["paragraph_end", "sentence_end"]: + section_end = word_type + else: + for character in word: + if character.isalpha(): + section_end = "sentence_cut" + break + + if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)): yield (paragraph_id, chunk_index, sentence, word_count, word_type) sentence = "" word_count = 0 @@ -28,5 +40,5 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): chunk_index, sentence, word_count, - "sentence_cut", + section_end, ) diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index 49ddfc92c..f4df19849 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -6,7 +6,7 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc GROUND_TRUTH = [ {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"}, {"word_count": 58, "len_text": 358, "cut_type": "sentence_end"}, - {"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"}, + {"word_count": 41, "len_text": 219, "cut_type": "sentence_end"}, ] TEST_TEXT = """ @@ -21,8 +21,7 @@ TEST_TEXT = """ "Then you'll ship it without me. I won't stake my reputation on a house of cards." "Are you threatening to quit?" "No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this." -"The feature ships, Sarah. That's final." -""" +"The feature ships, Sarah. That's final.\"""" def test_AudioDocument(): @@ -31,7 +30,7 @@ def test_AudioDocument(): id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="" ) with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT): - + l = list(document.read(chunk_size=64)) for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64) ): diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 9816f0529..ef7d42272 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -8,11 +8,11 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum GROUND_TRUTH = { "code.txt": [ {"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"}, - {"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"}, + {"word_count": 104, "len_text": 833, "cut_type": "paragraph_end"}, ], "Natural_language_processing.txt": [ {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"}, - {"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"}, + {"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"}, ], }