From e40e7386a0f61dec7c6aa6dac1fc4a4ab5f3c9cf Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 17:16:04 +0100 Subject: [PATCH] Refactor word_type yielding in chunk_by_sentence --- cognee/tasks/chunks/chunk_by_sentence.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index bee074d04..c6848f066 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -10,29 +10,32 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): paragraph_id = uuid4() word_count = 0 section_end = False + word_type_state = None + # the yielded word_type_state is identical to word_type, except when + # the word type is 'word', the word doesn't contain any letters + # and words with the same characteristics connect it to a preceding + # word with word_type 'paragraph_end' or 'sentence_end' for (word, word_type) in chunk_by_word(data): sentence += word word_count += 1 - # this loop is to check if any letters come after a paragraph_end or sentence_end - # and if that is not the case, preserve the word_type for the final yield in the - # function if word_type in ["paragraph_end", "sentence_end"]: - section_end = word_type + word_type_state = word_type else: for character in word: if character.isalpha(): - section_end = "sentence_cut" + word_type_state = word_type break if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)): - yield (paragraph_id, sentence, word_count, word_type) + yield (paragraph_id, sentence, word_count, word_type_state) sentence = "" word_count = 0 paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id if len(sentence) > 0: + section_end = "sentence_cut" if word_type_state == "word" else word_type_state yield ( paragraph_id, sentence,