Make chunk_by_word lossless: joining its output chunks with "" recreates the original input
This commit is contained in:
parent
830c6710e0
commit
c054e897a3
1 changed file with 51 additions and 33 deletions
|
|
@ -4,20 +4,29 @@ SENTENCE_ENDINGS = r"[.;!?…]"
|
||||||
PARAGRAPH_ENDINGS = r"[\n\r]"
|
PARAGRAPH_ENDINGS = r"[\n\r]"
|
||||||
|
|
||||||
def chunk_by_word(data: str):
|
def chunk_by_word(data: str):
|
||||||
|
"""
|
||||||
|
Chunks text into words and endings while preserving whitespace.
|
||||||
|
Whitespace is included with the preceding word.
|
||||||
|
Outputs can be joined with "" to recreate the original input.
|
||||||
|
"""
|
||||||
last_processed_character = ""
|
last_processed_character = ""
|
||||||
word = ""
|
current_chunk = ""
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
|
# Handle leading whitespace if any
|
||||||
|
while i < len(data) and (re.match(PARAGRAPH_ENDINGS, data[i]) or data[i] == " "):
|
||||||
|
current_chunk += data[i]
|
||||||
|
i += 1
|
||||||
|
if current_chunk:
|
||||||
|
yield (current_chunk, "word")
|
||||||
|
current_chunk = ""
|
||||||
|
|
||||||
while i < len(data):
|
while i < len(data):
|
||||||
character = data[i]
|
character = data[i]
|
||||||
|
|
||||||
if word == "" and (re.match(PARAGRAPH_ENDINGS, character) or character == " "):
|
|
||||||
i = i + 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
def is_real_paragraph_end():
|
def is_real_paragraph_end():
|
||||||
if re.match(SENTENCE_ENDINGS, last_processed_character):
|
if re.match(SENTENCE_ENDINGS, last_processed_character):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
j = i + 1
|
j = i + 1
|
||||||
next_character = data[j] if j < len(data) else None
|
next_character = data[j] if j < len(data) else None
|
||||||
while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "):
|
while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "):
|
||||||
|
|
@ -25,35 +34,44 @@ def chunk_by_word(data: str):
|
||||||
next_character = data[j] if j < len(data) else None
|
next_character = data[j] if j < len(data) else None
|
||||||
if next_character and next_character.isupper():
|
if next_character and next_character.isupper():
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if re.match(PARAGRAPH_ENDINGS, character):
|
if re.match(PARAGRAPH_ENDINGS, character):
|
||||||
yield (word, "paragraph_end" if is_real_paragraph_end() else "word")
|
if current_chunk:
|
||||||
word = ""
|
yield (current_chunk, "word")
|
||||||
i = i + 1
|
current_chunk = ""
|
||||||
|
yield (character, "paragraph_end" if is_real_paragraph_end() else "word")
|
||||||
|
i += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if character == " ":
|
current_chunk += character
|
||||||
yield [word, "word"]
|
|
||||||
word = ""
|
|
||||||
i = i + 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
word += character
|
|
||||||
last_processed_character = character
|
last_processed_character = character
|
||||||
|
|
||||||
|
if character == " ":
|
||||||
|
yield (current_chunk, "word")
|
||||||
|
current_chunk = ""
|
||||||
|
i += 1
|
||||||
|
continue
|
||||||
|
|
||||||
if re.match(SENTENCE_ENDINGS, character):
|
if re.match(SENTENCE_ENDINGS, character):
|
||||||
# Check for ellipses.
|
# Check for ellipses
|
||||||
if i + 2 <= len(data) and data[i] == "." and data[i + 1] == "." and data[i + 2] == ".":
|
if i + 2 < len(data) and data[i:i+3] == "...":
|
||||||
word += ".."
|
current_chunk += ".."
|
||||||
i = i + 2
|
i += 2
|
||||||
|
|
||||||
is_paragraph_end = i + 1 < len(data) and re.match(PARAGRAPH_ENDINGS, data[i + 1])
|
# Look ahead for whitespace
|
||||||
yield (word, "paragraph_end" if is_paragraph_end else "sentence_end")
|
next_i = i + 1
|
||||||
word = ""
|
while next_i < len(data) and data[next_i] == " ":
|
||||||
|
current_chunk += data[next_i]
|
||||||
|
next_i += 1
|
||||||
|
|
||||||
|
is_paragraph_end = next_i < len(data) and re.match(PARAGRAPH_ENDINGS, data[next_i])
|
||||||
|
yield (current_chunk, "paragraph_end" if is_paragraph_end else "sentence_end")
|
||||||
|
current_chunk = ""
|
||||||
|
i = next_i
|
||||||
|
continue
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
if len(word) > 0:
|
if current_chunk:
|
||||||
yield (word, "word")
|
yield (current_chunk, "word")
|
||||||
Loading…
Add table
Reference in a new issue