From eaf9167fa170113ca6173a518d679ef5536647c4 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 14:19:34 +0100 Subject: [PATCH] Change chunk_by_word to collect newlines in prior words --- cognee/tasks/chunks/chunk_by_word.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index 45f11f1c2..ab4d8343e 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -38,23 +38,13 @@ def chunk_by_word(data: str): Whitespace is included with the preceding word. Outputs can be joined with "" to recreate the original input. """ - last_processed_character = "" current_chunk = "" i = 0 while i < len(data): character = data[i] - if re.match(PARAGRAPH_ENDINGS, character): - if current_chunk: - yield (current_chunk, "word") - current_chunk = "" - yield (character, "paragraph_end" if is_real_paragraph_end(last_processed_character, i, data) else "word") - i += 1 - continue - current_chunk += character - last_processed_character = character if character == " ": yield (current_chunk, "word")