Fix PdfDocument test, give chunk_by_sentence a maximum_length arg

This commit is contained in:
Leon Luithlen 2024-11-13 15:39:17 +01:00
parent 1b4a7e4fdc
commit 9b2fb09c59
4 changed files with 4 additions and 4 deletions

View file

@@ -5,7 +5,7 @@ from uuid import uuid4
from typing import Optional
from .chunk_by_word import chunk_by_word
def chunk_by_sentence(data: str, maximum_length: Optional[int]):
def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
sentence = ""
paragraph_id = uuid4()
chunk_index = 0

View file

@@ -4,8 +4,8 @@ import uuid
from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
GROUND_TRUTH = [
{"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"},
{"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"},
{"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"},
{"word_count": 953, "len_text": 6363, "cut_type": "sentence_end"},
]

View file

@@ -49,7 +49,7 @@ Third paragraph is cut and is missing the dot at the end""",
def run_chunking_test(test_text, expected_chunks):
chunks = []
for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=False):
for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=True):
chunks.append(chunk_data)
assert len(chunks) == 3