Return sentence_cut instead of word in chunk_by_paragraph

This commit is contained in:
Leon Luithlen 2024-11-14 15:03:09 +01:00
parent 8b681529b1
commit d6a6a9eaba
2 changed files with 2 additions and 2 deletions

View file

@@ -67,7 +67,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
"chunk_id": uuid5(NAMESPACE_OID, current_chunk),
"paragraph_ids": paragraph_ids,
"chunk_index": chunk_index,
-"cut_type": last_cut_type
+"cut_type": "sentence_cut" if last_cut_type == "word" else last_cut_type
}

View file

@@ -32,7 +32,7 @@ GROUND_TRUTH = {
{
"text": "\nThird paragraph is cut and is missing the dot at the end",
"word_count": 12,
-"cut_type": "word",
+"cut_type": "sentence_cut",
},
],
}