Change chunk_by_word to collect newlines in prior words

2024-11-14 14:19:34 +01:00 · 2024-11-14 14:19:34 +01:00 · eaf9167fa1
commit eaf9167fa1
parent 57d8149732
1 changed file with 0 additions and 10 deletions
--- a/cognee/tasks/chunks/chunk_by_word.py
+++ b/cognee/tasks/chunks/chunk_by_word.py
@@ -38,23 +38,13 @@ def chunk_by_word(data: str):
    Whitespace is included with the preceding word.
    Outputs can be joined with "" to recreate the original input.
    """
    last_processed_character = ""
    current_chunk = ""
    i = 0
    while i < len(data):
        character = data[i]
        if re.match(PARAGRAPH_ENDINGS, character):
            if current_chunk:
                yield (current_chunk, "word")
                current_chunk = ""
            yield (character, "paragraph_end" if is_real_paragraph_end(last_processed_character, i, data) else "word")
            i += 1
            continue
        current_chunk += character
        last_processed_character = character
        if character == " ":
            yield (current_chunk, "word")