Fix paragraph_ids handling

This commit is contained in:
Leon Luithlen 2024-11-14 16:47:51 +01:00
parent 7cf8c74cf9
commit 15420dd864

View file

@@ -13,10 +13,8 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
paragraph_ids = [] paragraph_ids = []
last_cut_type = None last_cut_type = None
for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
# Check if this sentence would exceed length limit # Check if this sentence would exceed length limit
paragraph_ids.append(paragraph_id)
if current_word_count > 0 and current_word_count + word_count > paragraph_length: if current_word_count > 0 and current_word_count + word_count > paragraph_length:
# Yield current chunk # Yield current chunk
chunk_dict = { chunk_dict = {
@@ -32,13 +30,13 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
# Start new chunk with current sentence # Start new chunk with current sentence
paragraph_ids = [] paragraph_ids = []
current_chunk = sentence current_chunk = ""
current_word_count = word_count current_word_count = 0
chunk_index += 1 chunk_index += 1
else:
# Just concatenate directly - no space handling paragraph_ids.append(paragraph_id)
current_chunk += sentence current_chunk += sentence
current_word_count += word_count current_word_count += word_count
# Handle end of paragraph # Handle end of paragraph
if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs: if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs: