Change chunk_by_word to collect newlines in prior words

This commit is contained in:
Leon Luithlen 2024-11-14 14:19:34 +01:00
parent 57d8149732
commit eaf9167fa1

View file

@@ -38,23 +38,13 @@ def chunk_by_word(data: str):
Whitespace is included with the preceding word. Whitespace is included with the preceding word.
Outputs can be joined with "" to recreate the original input. Outputs can be joined with "" to recreate the original input.
""" """
last_processed_character = ""
current_chunk = "" current_chunk = ""
i = 0 i = 0
while i < len(data): while i < len(data):
character = data[i] character = data[i]
if re.match(PARAGRAPH_ENDINGS, character):
if current_chunk:
yield (current_chunk, "word")
current_chunk = ""
yield (character, "paragraph_end" if is_real_paragraph_end(last_processed_character, i, data) else "word")
i += 1
continue
current_chunk += character current_chunk += character
last_processed_character = character
if character == " ": if character == " ":
yield (current_chunk, "word") yield (current_chunk, "word")