Refactor word_type yielding in chunk_by_sentence

2024-11-14 17:16:04 +01:00 · 2024-11-14 17:16:04 +01:00 · e40e7386a0
commit e40e7386a0
parent 14dd60576e
1 changed files with 9 additions and 6 deletions
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@ -10,29 +10,32 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
    paragraph_id = uuid4()
    word_count = 0
    section_end = False
+    word_type_state = None

+    # the yielded word_type_state is identical to word_type, except when
+    # the word type is 'word', the word contains no letters, and only
+    # similar letterless words separate it from a preceding word with
+    # word_type 'paragraph_end' or 'sentence_end'; that type is then kept
    for (word, word_type) in chunk_by_word(data):
        sentence += word
        word_count += 1

-        # this loop is to check if any letters come after a paragraph_end or sentence_end
-        # and if that is not the case, preserve the word_type for the final yield in the
-        # function
        if word_type in ["paragraph_end", "sentence_end"]:
-            section_end = word_type
+            word_type_state = word_type
        else:
            for character in word:
                if character.isalpha():
-                    section_end = "sentence_cut"
+                    word_type_state = word_type
                    break

        if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
-            yield (paragraph_id, sentence, word_count, word_type)
+            yield (paragraph_id, sentence, word_count, word_type_state)
            sentence = ""
            word_count = 0
            paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id

    if len(sentence) > 0:
+        section_end = "sentence_cut" if word_type_state == "word" else word_type_state
        yield (
            paragraph_id,
            sentence,