From e40e7386a0f61dec7c6aa6dac1fc4a4ab5f3c9cf Mon Sep 17 00:00:00 2001
From: Leon Luithlen <leon@topoteretes.com>
Date: Thu, 14 Nov 2024 17:16:04 +0100
Subject: [PATCH] Refactor word_type yielding in chunk_by_sentence

---
 cognee/tasks/chunks/chunk_by_sentence.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py
index bee074d04..c6848f066 100644
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@@ -10,29 +10,32 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
     paragraph_id = uuid4()
     word_count = 0
     section_end = False
+    word_type_state = None
 
+    # word_type_state is the type that gets yielded: it follows word_type,
+    # except that a non-end word containing no letters leaves it unchanged,
+    # so letterless words directly following a 'paragraph_end' or
+    # 'sentence_end' keep that end type instead of resetting it
     for (word, word_type) in chunk_by_word(data):
         sentence += word
         word_count += 1
 
-        # this loop is to check if any letters come after a paragraph_end or sentence_end
-        # and if that is not the case, preserve the word_type for the final yield in the
-        # function
         if word_type in ["paragraph_end", "sentence_end"]:
-            section_end = word_type
+            word_type_state = word_type
         else:
             for character in word:
                 if character.isalpha():
-                    section_end = "sentence_cut"
+                    word_type_state = word_type
                     break
 
         if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
-            yield (paragraph_id, sentence, word_count, word_type)
+            yield (paragraph_id, sentence, word_count, word_type_state)
             sentence = ""
             word_count = 0
             paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
 
     if len(sentence) > 0:
+        section_end = "sentence_cut" if word_type_state == "word" else word_type_state
         yield (
             paragraph_id,
             sentence,