Change chunk_by_word to collect newlines in prior words
This commit is contained in:
parent
57d8149732
commit
eaf9167fa1
1 changed file with 0 additions and 10 deletions
|
|
@@ -38,23 +38,13 @@ def chunk_by_word(data: str):
|
||||||
Whitespace is included with the preceding word.
|
Whitespace is included with the preceding word.
|
||||||
Outputs can be joined with "" to recreate the original input.
|
Outputs can be joined with "" to recreate the original input.
|
||||||
"""
|
"""
|
||||||
last_processed_character = ""
|
|
||||||
current_chunk = ""
|
current_chunk = ""
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
while i < len(data):
|
while i < len(data):
|
||||||
character = data[i]
|
character = data[i]
|
||||||
|
|
||||||
if re.match(PARAGRAPH_ENDINGS, character):
|
|
||||||
if current_chunk:
|
|
||||||
yield (current_chunk, "word")
|
|
||||||
current_chunk = ""
|
|
||||||
yield (character, "paragraph_end" if is_real_paragraph_end(last_processed_character, i, data) else "word")
|
|
||||||
i += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
current_chunk += character
|
current_chunk += character
|
||||||
last_processed_character = character
|
|
||||||
|
|
||||||
if character == " ":
|
if character == " ":
|
||||||
yield (current_chunk, "word")
|
yield (current_chunk, "word")
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue