Make chunk_by_word isomorphic

2024-11-13 11:47:13 +01:00 · 2024-11-13 11:47:13 +01:00 · c054e897a3
commit c054e897a3
parent 830c6710e0
1 changed file with 51 additions and 33 deletions
--- a/cognee/tasks/chunks/chunk_by_word.py
+++ b/cognee/tasks/chunks/chunk_by_word.py
@@ -4,20 +4,29 @@ SENTENCE_ENDINGS = r"[.;!?…]"
 PARAGRAPH_ENDINGS = r"[\n\r]"
 def chunk_by_word(data: str):
    """
    Chunks text into words and endings while preserving whitespace.
    Whitespace is included with the preceding word.
    Outputs can be joined with "" to recreate the original input.
    """
    last_processed_character = ""
-    word = ""
+    current_chunk = ""
    i = 0
    # Handle leading whitespace if any
    while i < len(data) and (re.match(PARAGRAPH_ENDINGS, data[i]) or data[i] == " "):
        current_chunk += data[i]
        i += 1
    if current_chunk:
        yield (current_chunk, "word")
        current_chunk = ""
    while i < len(data):
        character = data[i]
-
+            
        if word == "" and (re.match(PARAGRAPH_ENDINGS, character) or character == " "):
            i = i + 1
            continue
        def is_real_paragraph_end():
            if re.match(SENTENCE_ENDINGS, last_processed_character):
                return True
            j = i + 1
            next_character = data[j] if j < len(data) else None
            while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "):
@@ -25,35 +34,44 @@ def chunk_by_word(data: str):
                next_character = data[j] if j < len(data) else None
            if next_character and next_character.isupper():
                return True
            return False
-
+            
        if re.match(PARAGRAPH_ENDINGS, character):
-            yield (word, "paragraph_end" if is_real_paragraph_end() else "word")
+            if current_chunk:
-            word = ""
+                yield (current_chunk, "word")
-            i = i + 1
+                current_chunk = ""
            yield (character, "paragraph_end" if is_real_paragraph_end() else "word")
            i += 1
            continue
-
+            
-        if character == " ":
+        current_chunk += character
            yield [word, "word"]
            word = ""
            i = i + 1
            continue
        word += character
        last_processed_character = character
-
+        
        if character == " ":
            yield (current_chunk, "word")
            current_chunk = ""
            i += 1
            continue
        if re.match(SENTENCE_ENDINGS, character):
-            # Check for ellipses.
+            # Check for ellipses
-            if i + 2 <= len(data) and data[i] == "." and data[i + 1] == "." and data[i + 2] == ".":
+            if i + 2 < len(data) and data[i:i+3] == "...":
-                word += ".."
+                current_chunk += ".."
-                i = i + 2
+                i += 2
-
+                
-            is_paragraph_end = i + 1 < len(data) and re.match(PARAGRAPH_ENDINGS, data[i + 1])
+            # Look ahead for whitespace
-            yield (word, "paragraph_end" if is_paragraph_end else "sentence_end")
+            next_i = i + 1
-            word = ""
+            while next_i < len(data) and data[next_i] == " ":
-
+                current_chunk += data[next_i]
                next_i += 1
            is_paragraph_end = next_i < len(data) and re.match(PARAGRAPH_ENDINGS, data[next_i])
            yield (current_chunk, "paragraph_end" if is_paragraph_end else "sentence_end")
            current_chunk = ""
            i = next_i
            continue
        i += 1
-
+        
-    if len(word) > 0:
+    if current_chunk:
-        yield (word, "word")
+        yield (current_chunk, "word")