From d90698305bb3d1884577749a7408d7376e6149b3 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 09:43:10 +0100 Subject: [PATCH] Simplify chunk_by_word --- cognee/tasks/chunks/chunk_by_word.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index 120c759e6..a93f9acdb 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -25,14 +25,6 @@ def chunk_by_word(data: str): current_chunk = "" i = 0 - # Handle leading whitespace if any - while i < len(data) and (re.match(PARAGRAPH_ENDINGS, data[i]) or data[i] == " "): - current_chunk += data[i] - i += 1 - if current_chunk: - yield (current_chunk, "word") - current_chunk = "" - while i < len(data): character = data[i] @@ -53,12 +45,7 @@ def chunk_by_word(data: str): i += 1 continue - if re.match(SENTENCE_ENDINGS, character): - # Check for ellipses - if i + 2 < len(data) and data[i:i+3] == "...": - current_chunk += ".." - i += 2 - + if re.match(SENTENCE_ENDINGS, character): # Look ahead for whitespace next_i = i + 1 while next_i < len(data) and data[next_i] == " ":