Return sentence_cut instead of word in chunk_by_paragraph
This commit is contained in:
parent
8b681529b1
commit
d6a6a9eaba
2 changed files with 2 additions and 2 deletions
|
|
@@ -67,7 +67,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
|
||||||
"chunk_id": uuid5(NAMESPACE_OID, current_chunk),
|
"chunk_id": uuid5(NAMESPACE_OID, current_chunk),
|
||||||
"paragraph_ids": paragraph_ids,
|
"paragraph_ids": paragraph_ids,
|
||||||
"chunk_index": chunk_index,
|
"chunk_index": chunk_index,
|
||||||
"cut_type": last_cut_type
|
"cut_type": "sentence_cut" if last_cut_type == "word" else last_cut_type
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -32,7 +32,7 @@ GROUND_TRUTH = {
|
||||||
{
|
{
|
||||||
"text": "\nThird paragraph is cut and is missing the dot at the end",
|
"text": "\nThird paragraph is cut and is missing the dot at the end",
|
||||||
"word_count": 12,
|
"word_count": 12,
|
||||||
"cut_type": "word",
|
"cut_type": "sentence_cut",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue