# cognee/cognee/tasks/chunking/chunk_by_word.py

import re

def chunk_by_word(data: str):
    """Split *data* into words and tag each one with the boundary that ended it.

    Words are separated by a space, ``\\n`` or ``\\r``; the separators
    themselves are never part of a yielded word. Sentence punctuation
    (``. ; ! ? …``) is kept attached to the word it terminates, and three
    consecutive dots are kept together as a single ellipsis.

    Args:
        data: The text to split.

    Yields:
        ``(word, kind)`` tuples, where ``kind`` is one of:

        * ``"word"`` - the word was ended by a space, by a line break that
          is not followed by a capitalized word, or by the end of the text.
        * ``"sentence_end"`` - the word ends in sentence punctuation that is
          not immediately followed by a line break.
        * ``"paragraph_end"`` - the word ends in sentence punctuation that is
          immediately followed by a line break, or the word was ended by a
          line break and the next non-blank character is uppercase.
    """
    sentence_endings = ".;!?…"
    paragraph_endings = "\n\r"
    length = len(data)

    def next_word_is_capitalized(position):
        """Return True if the first non-blank character after *position* is uppercase."""
        j = position + 1
        while j < length and (data[j] in paragraph_endings or data[j] == " "):
            j += 1
        return j < length and data[j].isupper()

    word = ""
    i = 0

    while i < length:
        character = data[i]

        if character == " " or character in paragraph_endings:
            # Separators with no pending word (leading or repeated blanks)
            # are simply skipped.
            if word:
                if character == " ":
                    kind = "word"
                else:
                    # A pending word never ends in sentence punctuation here
                    # (that would already have flushed it below), so a line
                    # break only counts as a paragraph end when the text
                    # that follows starts with an uppercase character.
                    kind = "paragraph_end" if next_word_is_capitalized(i) else "word"
                yield (word, kind)
                word = ""
            i += 1
            continue

        word += character

        if character in sentence_endings:
            # Keep an ellipsis together. ``startswith`` with an offset is
            # bounds-safe, so text ending in ".." no longer raises IndexError.
            if data.startswith("...", i):
                word += ".."
                i += 2

            is_paragraph_end = i + 1 < length and data[i + 1] in paragraph_endings
            yield (word, "paragraph_end" if is_paragraph_end else "sentence_end")
            word = ""

        i += 1

    # Flush a trailing word that was not closed by any separator.
    if word:
        yield (word, "word")