Refactor word_type yielding in chunk_by_sentence
This commit is contained in:
parent
14dd60576e
commit
e40e7386a0
1 changed files with 9 additions and 6 deletions
|
|
@@ -10,29 +10,32 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
|
|||
paragraph_id = uuid4()
|
||||
word_count = 0
|
||||
section_end = False
|
||||
word_type_state = None
|
||||
|
||||
# the yielded word_type_state is identical to word_type, except when
|
||||
# the word type is 'word', the word doesn't contain any letters
|
||||
# and words with the same characteristics connect it to a preceding
|
||||
# word with word_type 'paragraph_end' or 'sentence_end'
|
||||
for (word, word_type) in chunk_by_word(data):
|
||||
sentence += word
|
||||
word_count += 1
|
||||
|
||||
# this loop is to check if any letters come after a paragraph_end or sentence_end
|
||||
# and if that is not the case, preserve the word_type for the final yield in the
|
||||
# function
|
||||
if word_type in ["paragraph_end", "sentence_end"]:
|
||||
section_end = word_type
|
||||
word_type_state = word_type
|
||||
else:
|
||||
for character in word:
|
||||
if character.isalpha():
|
||||
section_end = "sentence_cut"
|
||||
word_type_state = word_type
|
||||
break
|
||||
|
||||
if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
|
||||
yield (paragraph_id, sentence, word_count, word_type)
|
||||
yield (paragraph_id, sentence, word_count, word_type_state)
|
||||
sentence = ""
|
||||
word_count = 0
|
||||
paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
|
||||
|
||||
if len(sentence) > 0:
|
||||
section_end = "sentence_cut" if word_type_state == "word" else word_type_state
|
||||
yield (
|
||||
paragraph_id,
|
||||
sentence,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue