Change chunk_by_word to collect newlines in prior words
This commit is contained in:
parent
57d8149732
commit
eaf9167fa1
1 changed file with 0 additions and 10 deletions
|
|
@@ -38,23 +38,13 @@ def chunk_by_word(data: str):
|
|||
Whitespace is included with the preceding word.
|
||||
Outputs can be joined with "" to recreate the original input.
|
||||
"""
|
||||
last_processed_character = ""
|
||||
current_chunk = ""
|
||||
i = 0
|
||||
|
||||
while i < len(data):
|
||||
character = data[i]
|
||||
|
||||
if re.match(PARAGRAPH_ENDINGS, character):
|
||||
if current_chunk:
|
||||
yield (current_chunk, "word")
|
||||
current_chunk = ""
|
||||
yield (character, "paragraph_end" if is_real_paragraph_end(last_processed_character, i, data) else "word")
|
||||
i += 1
|
||||
continue
|
||||
|
||||
current_chunk += character
|
||||
last_processed_character = character
|
||||
|
||||
if character == " ":
|
||||
yield (current_chunk, "word")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue