Update cognee/tasks/chunks/chunk_by_word.py
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
parent
8260647497
commit
f2206a09c0
1 changed file with 24 additions and 7 deletions
|
|
@ -3,15 +3,32 @@ import re
|
||||||
# Character classes (regex) used to recognise the end of a sentence and
# a line break, respectively.
SENTENCE_ENDINGS = r"[.;!?…]"
PARAGRAPH_ENDINGS = r"[\n\r]"


def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
    """Determine whether the current position represents a real paragraph ending.

    A position counts as a real paragraph end when either:

    * ``last_char`` starts with a sentence-ending character
      (see ``SENTENCE_ENDINGS``), or
    * the first character after ``current_pos`` that is neither a line
      break (see ``PARAGRAPH_ENDINGS``) nor a space is uppercase.

    Args:
        last_char: The last processed character.
        current_pos: Current position in the text.
        text: The input text.

    Returns:
        True if this is a real paragraph end, False otherwise. In
        particular, False is returned when only line breaks and spaces
        (or nothing at all) follow ``current_pos``.
    """
    if re.match(SENTENCE_ENDINGS, last_char):
        return True

    # Skip the run of line breaks and spaces that follows the current
    # position; the bounds check lives in the loop condition so no separate
    # end-of-text guards are needed.
    j = current_pos + 1
    while j < len(text) and (text[j] == " " or re.match(PARAGRAPH_ENDINGS, text[j])):
        j += 1

    # Running off the end of the text means there is no following character
    # to inspect, which is treated as "not a paragraph end".
    return j < len(text) and text[j].isupper()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue