Fix PdfDocument test, give chunk_by_sentence a maximum_length arg
This commit is contained in:
parent
1b4a7e4fdc
commit
9b2fb09c59
4 changed files with 4 additions and 4 deletions
|
|
@ -5,7 +5,7 @@ from uuid import uuid4
|
|||
from typing import Optional
|
||||
from .chunk_by_word import chunk_by_word
|
||||
|
||||
def chunk_by_sentence(data: str, maximum_length: Optional[int]):
|
||||
def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
|
||||
sentence = ""
|
||||
paragraph_id = uuid4()
|
||||
chunk_index = 0
|
||||
|
|
|
|||
|
|
@ -4,8 +4,8 @@ import uuid
|
|||
from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
|
||||
|
||||
GROUND_TRUTH = [
|
||||
{"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"},
|
||||
{"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"},
|
||||
{"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"},
|
||||
{"word_count": 953, "len_text": 6363, "cut_type": "sentence_end"},
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ Third paragraph is cut and is missing the dot at the end""",
|
|||
|
||||
def run_chunking_test(test_text, expected_chunks):
|
||||
chunks = []
|
||||
for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=False):
|
||||
for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=True):
|
||||
chunks.append(chunk_data)
|
||||
|
||||
assert len(chunks) == 3
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue