Refactor word_type yielding in chunk_by_sentence

This commit is contained in:
Leon Luithlen 2024-11-14 17:16:04 +01:00
parent 14dd60576e
commit e40e7386a0

View file

@ -10,29 +10,32 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
paragraph_id = uuid4() paragraph_id = uuid4()
word_count = 0 word_count = 0
section_end = False section_end = False
word_type_state = None
# the yielded word_type_state is identical to word_type, except when
# the word type is 'word' and the word doesn't contain any letters:
# such a word keeps the state of the preceding word, which connects it
# (and following words with the same characteristics) to a preceding
# word with word_type 'paragraph_end' or 'sentence_end'
for (word, word_type) in chunk_by_word(data): for (word, word_type) in chunk_by_word(data):
sentence += word sentence += word
word_count += 1 word_count += 1
# this block checks whether any letters come after a paragraph_end or
# sentence_end; if that is not the case, the word_type is preserved for
# the final yield in the function
if word_type in ["paragraph_end", "sentence_end"]: if word_type in ["paragraph_end", "sentence_end"]:
section_end = word_type word_type_state = word_type
else: else:
for character in word: for character in word:
if character.isalpha(): if character.isalpha():
section_end = "sentence_cut" word_type_state = word_type
break break
if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)): if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
yield (paragraph_id, sentence, word_count, word_type) yield (paragraph_id, sentence, word_count, word_type_state)
sentence = "" sentence = ""
word_count = 0 word_count = 0
paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
if len(sentence) > 0: if len(sentence) > 0:
section_end = "sentence_cut" if word_type_state == "word" else word_type_state
yield ( yield (
paragraph_id, paragraph_id,
sentence, sentence,