Fix sentence_cut return value in inappropriate places

2024-11-14 14:40:42 +01:00 · 2024-11-14 14:40:42 +01:00 · 73f24f9e4d
commit 73f24f9e4d
parent b4d509e682
4 changed files with 19 additions and 9 deletions
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@ -58,7 +58,6 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
            chunk_index += 1
        
        last_cut_type = end_type
-        last_paragraph_id = paragraph_id
    
    # Yield any remaining text
    if current_chunk:
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@ -10,12 +10,24 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
    paragraph_id = uuid4()
    chunk_index = 0
    word_count = 0
+    section_end = False

    for (word, word_type) in chunk_by_word(data):
        sentence += word
        word_count += 1

-        if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)):
+        # Track the cut type for the final yield: remember the most recent
+        # paragraph_end or sentence_end, but switch to "sentence_cut" as soon as
+        # a later word contains any alphabetic character.
+        if word_type in ["paragraph_end", "sentence_end"]:
+            section_end = word_type
+        else:
+            for character in word:
+                if character.isalpha():
+                    section_end = "sentence_cut"
+                    break
+
+        if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
            yield (paragraph_id, chunk_index, sentence, word_count, word_type)
            sentence = ""
            word_count = 0
@ -28,5 +40,5 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
            chunk_index,
            sentence,
            word_count,
-            "sentence_cut",
+            section_end,
        )
--- a/cognee/tests/integration/documents/AudioDocument_test.py
+++ b/cognee/tests/integration/documents/AudioDocument_test.py
@ -6,7 +6,7 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc
 GROUND_TRUTH = [
    {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
    {"word_count": 58, "len_text": 358, "cut_type": "sentence_end"},
-    {"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"},
+    {"word_count": 41, "len_text": 219, "cut_type": "sentence_end"},
 ]

 TEST_TEXT = """
@ -21,8 +21,7 @@ TEST_TEXT = """
 "Then you'll ship it without me. I won't stake my reputation on a house of cards."
 "Are you threatening to quit?"
 "No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this."
-"The feature ships, Sarah. That's final."
-"""
+"The feature ships, Sarah. That's final.\""""


 def test_AudioDocument():
@ -31,7 +30,7 @@ def test_AudioDocument():
        id=uuid.uuid4(), name="audio-dummy-test", raw_data_location=""
    )
    with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
-
+        l = list(document.read(chunk_size=64))
        for ground_truth, paragraph_data in zip(
            GROUND_TRUTH, document.read(chunk_size=64)
        ):
--- a/cognee/tests/integration/documents/TextDocument_test.py
+++ b/cognee/tests/integration/documents/TextDocument_test.py
@ -8,11 +8,11 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum
 GROUND_TRUTH = {
    "code.txt": [
        {"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"},
-        {"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"},
+        {"word_count": 104, "len_text": 833, "cut_type": "paragraph_end"},
    ],
    "Natural_language_processing.txt": [
        {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"},
-        {"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"},
+        {"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"},
    ],
 }