Change chunk_by_word to collect newlines in prior words

2024-11-14 14:19:34 +01:00 · 2024-11-14 14:19:34 +01:00 · eaf9167fa1
commit eaf9167fa1
parent 57d8149732
1 changed file with 0 additions and 10 deletions
--- a/cognee/tasks/chunks/chunk_by_word.py
+++ b/cognee/tasks/chunks/chunk_by_word.py
@@ -38,23 +38,13 @@ def chunk_by_word(data: str):
    Whitespace is included with the preceding word.
    Outputs can be joined with "" to recreate the original input.
    """
-    last_processed_character = ""
    current_chunk = ""
    i = 0
    
    while i < len(data):
        character = data[i]
            
-        if re.match(PARAGRAPH_ENDINGS, character):
-            if current_chunk:
-                yield (current_chunk, "word")
-                current_chunk = ""
-            yield (character, "paragraph_end" if is_real_paragraph_end(last_processed_character, i, data) else "word")
-            i += 1
-            continue
-            
        current_chunk += character
-        last_processed_character = character
        
        if character == " ":
            yield (current_chunk, "word")