cognee/cognee/tasks/chunking/chunk_by_word.py
Boris a9433e9283
feat: add sqlalchemy as dlt destination (#137)
* feat: add sqlalchemy as dlt destination

* Fix the demo, update Readme

* fix: add 1.5 notebook

---------

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
2024-09-21 15:58:28 +02:00

60 lines
1.8 KiB
Python

import re
def chunk_by_word(data: str):
    """Lazily split *data* into words tagged with their structural role.

    The text is scanned character by character. Spaces and line breaks
    (``\\n`` / ``\\r``) separate words; separators that precede a word are
    skipped, so no empty words are produced.

    Args:
        data: The text to split.

    Yields:
        ``(word, kind)`` tuples, where ``kind`` is one of:

        * ``"word"`` - a word terminated by a space, by a line break that
          does not close a paragraph, or by the end of the input.
        * ``"sentence_end"`` - a word ending in one of ``. ; ! ? …``.
          A run of three dots (``...``) is kept together as one ellipsis.
        * ``"paragraph_end"`` - a sentence-ending word directly followed by
          a line break, or a word followed by a line break when the next
          non-blank character is upper-case.
    """
    # Compile once per call instead of re-resolving the pattern per character.
    sentence_endings = re.compile(r"[.;!?…]")
    paragraph_endings = re.compile(r"[\n\r]")
    length = len(data)

    def is_separator(character):
        # Spaces and line breaks both count as blank space between words.
        return character == " " or paragraph_endings.match(character) is not None

    def is_real_paragraph_end(index, previous_character):
        # A line break right after sentence-ending punctuation closes a paragraph.
        if sentence_endings.match(previous_character):
            return True

        # Otherwise look past the blank run: an upper-case letter suggests a
        # new paragraph, anything else is treated as a wrapped line.
        j = index + 1
        while j < length and is_separator(data[j]):
            j += 1
        return j < length and data[j].isupper()

    last_processed_character = ""
    word = ""
    i = 0

    while i < length:
        character = data[i]
        is_line_break = paragraph_endings.match(character) is not None

        # Skip separators that come before any word content.
        if not word and (is_line_break or character == " "):
            i += 1
            continue

        if is_line_break:
            if is_real_paragraph_end(i, last_processed_character):
                yield (word, "paragraph_end")
            else:
                yield (word, "word")
            word = ""
            i += 1
            continue

        if character == " ":
            # Always a tuple, so every yielded item has the same type.
            yield (word, "word")
            word = ""
            i += 1
            continue

        word += character
        last_processed_character = character

        if sentence_endings.match(character):
            # Check for ellipses. startswith() is bounds-safe, so input that
            # ends in only two dots no longer indexes past the end of data.
            if data.startswith("...", i):
                word += ".."
                i += 2

            is_paragraph_end = (
                i + 1 < length
                and paragraph_endings.match(data[i + 1]) is not None
            )
            yield (word, "paragraph_end" if is_paragraph_end else "sentence_end")
            word = ""

        i += 1

    # Flush a trailing word that was not closed by a separator or punctuation.
    if word:
        yield (word, "word")