Remove chunk_index attribute from chunk_by_sentence return value
This commit is contained in:
parent
15420dd864
commit
84c98f16bb
1 changed file with 1 addition and 4 deletions
|
|
@@ -8,7 +8,6 @@ from .chunk_by_word import chunk_by_word
|
||||||
def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
|
def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
|
||||||
sentence = ""
|
sentence = ""
|
||||||
paragraph_id = uuid4()
|
paragraph_id = uuid4()
|
||||||
chunk_index = 0
|
|
||||||
word_count = 0
|
word_count = 0
|
||||||
section_end = False
|
section_end = False
|
||||||
|
|
||||||
|
|
@@ -28,16 +27,14 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
|
||||||
break
|
break
|
||||||
|
|
||||||
if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
|
if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
|
||||||
yield (paragraph_id, chunk_index, sentence, word_count, word_type)
|
yield (paragraph_id, sentence, word_count, word_type)
|
||||||
sentence = ""
|
sentence = ""
|
||||||
word_count = 0
|
word_count = 0
|
||||||
paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
|
paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
|
||||||
chunk_index = 0 if word_type == "paragraph_end" else chunk_index + 1
|
|
||||||
|
|
||||||
if len(sentence) > 0:
|
if len(sentence) > 0:
|
||||||
yield (
|
yield (
|
||||||
paragraph_id,
|
paragraph_id,
|
||||||
chunk_index,
|
|
||||||
sentence,
|
sentence,
|
||||||
word_count,
|
word_count,
|
||||||
section_end,
|
section_end,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue