From 9b2fb09c5920641a40305154b1a1acb833021107 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 15:39:17 +0100 Subject: [PATCH] Fix PdfDocument test, give chunk_by_sentence a maximum_length arg --- cognee/tasks/chunks/chunk_by_sentence.py | 2 +- cognee/tests/unit/documents/PdfDocument_test.py | 4 ++-- ...unk_by_paragraph_test2.py => chunk_by_paragraph_2_test.py} | 0 .../tests/unit/processing/chunks/chunk_by_paragraph_test.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename cognee/tests/unit/processing/chunks/{chunk_by_paragraph_test2.py => chunk_by_paragraph_2_test.py} (100%) diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 7191a78c4..1ce052a6c 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -5,7 +5,7 @@ from uuid import uuid4 from typing import Optional from .chunk_by_word import chunk_by_word -def chunk_by_sentence(data: str, maximum_length: Optional[int]): +def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): sentence = "" paragraph_id = uuid4() chunk_index = 0 diff --git a/cognee/tests/unit/documents/PdfDocument_test.py b/cognee/tests/unit/documents/PdfDocument_test.py index 917e9c3e0..108d61273 100644 --- a/cognee/tests/unit/documents/PdfDocument_test.py +++ b/cognee/tests/unit/documents/PdfDocument_test.py @@ -4,8 +4,8 @@ import uuid from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument GROUND_TRUTH = [ - {"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"}, - {"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"}, + {"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"}, + {"word_count": 953, "len_text": 6363, "cut_type": "sentence_end"}, ] diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py similarity index 100% rename from 
cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py rename to cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index 28b4b37c3..f8fe00237 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -49,7 +49,7 @@ Third paragraph is cut and is missing the dot at the end""", def run_chunking_test(test_text, expected_chunks): chunks = [] - for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=False): + for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=True): chunks.append(chunk_data) assert len(chunks) == 3