Fix PdfDocument test, give chunk_by_sentence a maximum_length arg
This commit is contained in:
parent
1b4a7e4fdc
commit
9b2fb09c59
4 changed files with 4 additions and 4 deletions
|
|
@ -5,7 +5,7 @@ from uuid import uuid4
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from .chunk_by_word import chunk_by_word
|
from .chunk_by_word import chunk_by_word
|
||||||
|
|
||||||
def chunk_by_sentence(data: str, maximum_length: Optional[int]):
|
def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
|
||||||
sentence = ""
|
sentence = ""
|
||||||
paragraph_id = uuid4()
|
paragraph_id = uuid4()
|
||||||
chunk_index = 0
|
chunk_index = 0
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,8 @@ import uuid
|
||||||
from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
|
from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
|
||||||
|
|
||||||
GROUND_TRUTH = [
|
GROUND_TRUTH = [
|
||||||
{"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"},
|
{"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"},
|
||||||
{"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"},
|
{"word_count": 953, "len_text": 6363, "cut_type": "sentence_end"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,7 @@ Third paragraph is cut and is missing the dot at the end""",
|
||||||
|
|
||||||
def run_chunking_test(test_text, expected_chunks):
|
def run_chunking_test(test_text, expected_chunks):
|
||||||
chunks = []
|
chunks = []
|
||||||
for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=False):
|
for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=True):
|
||||||
chunks.append(chunk_data)
|
chunks.append(chunk_data)
|
||||||
|
|
||||||
assert len(chunks) == 3
|
assert len(chunks) == 3
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue