Refactor word_type yielding in chunk_by_sentence

This commit is contained in:
Leon Luithlen 2024-11-14 17:16:04 +01:00
parent 14dd60576e
commit e40e7386a0

View file

@ -10,29 +10,32 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
paragraph_id = uuid4()
word_count = 0
section_end = False
word_type_state = None
# The yielded word_type_state is identical to word_type, except when the
# word_type is 'word' and the word contains no letters: such a word keeps
# the state of the preceding word, so letterless words that directly follow
# a 'paragraph_end' or 'sentence_end' still yield that end type.
for (word, word_type) in chunk_by_word(data):
sentence += word
word_count += 1
# this loop is to check if any letters come after a paragraph_end or sentence_end
# and if that is not the case, preserve the word_type for the final yield in the
# function
if word_type in ["paragraph_end", "sentence_end"]:
section_end = word_type
word_type_state = word_type
else:
for character in word:
if character.isalpha():
section_end = "sentence_cut"
word_type_state = word_type
break
if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
yield (paragraph_id, sentence, word_count, word_type)
yield (paragraph_id, sentence, word_count, word_type_state)
sentence = ""
word_count = 0
paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
if len(sentence) > 0:
section_end = "sentence_cut" if word_type_state == "word" else word_type_state
yield (
paragraph_id,
sentence,