From 73f24f9e4db9a94522f3e133791f36a22d41b93a Mon Sep 17 00:00:00 2001
From: Leon Luithlen <leon@topoteretes.com>
Date: Thu, 14 Nov 2024 14:40:42 +0100
Subject: [PATCH] Fix sentence_cut return value in inappropriate places

---
 cognee/tasks/chunks/chunk_by_paragraph.py        |  1 -
 cognee/tasks/chunks/chunk_by_sentence.py         | 16 ++++++++++++++--
 .../integration/documents/AudioDocument_test.py  |  7 +++----
 .../integration/documents/TextDocument_test.py   |  4 ++--
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index a85a2de26..cfe73471a 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -58,7 +58,6 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
             chunk_index += 1
         
         last_cut_type = end_type
-        last_paragraph_id = paragraph_id
     
     # Yield any remaining text
     if current_chunk:
diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py
index 9159922af..fedc0c9b2 100644
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@@ -10,12 +10,24 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
     paragraph_id = uuid4()
     chunk_index = 0
     word_count = 0
+    section_end = False
 
     for (word, word_type) in chunk_by_word(data):
         sentence += word
         word_count += 1
 
-        if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)):
+        # Track the cut type for the final yield: keep the latest paragraph_end or
+        # sentence_end, but switch to "sentence_cut" when a later word contains letters,
+        # so trailing whitespace/punctuation alone does not change the cut type.
+        if word_type in ["paragraph_end", "sentence_end"]:
+            section_end = word_type
+        else:
+            for character in word:
+                if character.isalpha():
+                    section_end = "sentence_cut"
+                    break
+
+        if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
             yield (paragraph_id, chunk_index, sentence, word_count, word_type)
             sentence = ""
             word_count = 0
@@ -28,5 +40,5 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
             chunk_index,
             sentence,
             word_count,
-            "sentence_cut",
+            section_end,
         )
diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py
index 49ddfc92c..f4df19849 100644
--- a/cognee/tests/integration/documents/AudioDocument_test.py
+++ b/cognee/tests/integration/documents/AudioDocument_test.py
@@ -6,7 +6,7 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc
 GROUND_TRUTH = [
     {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
     {"word_count": 58, "len_text": 358, "cut_type": "sentence_end"},
-    {"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"},
+    {"word_count": 41, "len_text": 219, "cut_type": "sentence_end"},
 ]
 
 TEST_TEXT = """
@@ -21,8 +21,7 @@ TEST_TEXT = """
 "Then you'll ship it without me. I won't stake my reputation on a house of cards."
 "Are you threatening to quit?"
 "No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this."
-"The feature ships, Sarah. That's final."
-"""
+"The feature ships, Sarah. That's final.\""""
 
 
 def test_AudioDocument():
@@ -31,7 +30,7 @@ def test_AudioDocument():
         id=uuid.uuid4(), name="audio-dummy-test", raw_data_location=""
     )
     with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
-
+        l = list(document.read(chunk_size=64))
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH, document.read(chunk_size=64)
         ):
diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py
index 9816f0529..ef7d42272 100644
--- a/cognee/tests/integration/documents/TextDocument_test.py
+++ b/cognee/tests/integration/documents/TextDocument_test.py
@@ -8,11 +8,11 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum
 GROUND_TRUTH = {
     "code.txt": [
         {"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"},
-        {"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"},
+        {"word_count": 104, "len_text": 833, "cut_type": "paragraph_end"},
     ],
     "Natural_language_processing.txt": [
         {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"},
-        {"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"},
+        {"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"},
     ],
 }