cognee/cognee/tasks/chunks/chunk_by_sentence.py
2025-01-17 10:30:34 +01:00

54 lines
2 KiB
Python

from uuid import uuid4, UUID
from typing import Optional, Iterator, Tuple
from .chunk_by_word import chunk_by_word
def chunk_by_sentence(
    data: str, maximum_length: Optional[int] = None
) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
    """
    Yield sentence-sized chunks of *data* as (paragraph_id, sentence, word_count, end_type).

    Words come from `chunk_by_word`, which tags each token with a type. A chunk is
    emitted whenever a sentence or paragraph terminator is seen, or when the buffered
    word count reaches *maximum_length* (if given). Chunks belonging to the same
    paragraph share one UUID; a new UUID is drawn after each "paragraph_end".
    If the input runs out mid-sentence, the trailing chunk is tagged "sentence_cut".
    """
    buffer = []  # tokens of the sentence currently being assembled
    buffered_words = 0
    current_paragraph = uuid4()
    # Last significant token type seen. Terminator tokens always update it;
    # an ordinary token updates it only when it contains at least one letter,
    # so punctuation-only tokens don't overwrite a preceding
    # "sentence_end"/"paragraph_end" state.
    last_significant_type = None

    for token, token_type in chunk_by_word(data):
        buffer.append(token)
        buffered_words += 1

        is_terminator = token_type in ("paragraph_end", "sentence_end")
        if is_terminator or any(ch.isalpha() for ch in token):
            last_significant_type = token_type

        if is_terminator or (maximum_length and buffered_words == maximum_length):
            yield (current_paragraph, "".join(buffer), buffered_words, last_significant_type)
            buffer = []
            buffered_words = 0
            if token_type == "paragraph_end":
                current_paragraph = uuid4()

    # Flush any partial sentence left over when the input ended mid-sentence.
    if buffer:
        tail_type = (
            "sentence_cut" if last_significant_type == "word" else last_significant_type
        )
        yield (
            current_paragraph,
            "".join(buffer),
            buffered_words,
            tail_type,
        )