diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index 9719d90fc..38b547140 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -34,7 +34,7 @@ def test_AudioDocument(): ) with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT): for ground_truth, paragraph_data in zip( - GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") + GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker", max_chunk_tokens=512) ): assert ground_truth["word_count"] == paragraph_data.word_count, ( f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index bd15961ee..faa54fa27 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -23,7 +23,7 @@ def test_ImageDocument(): ) with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT): for ground_truth, paragraph_data in zip( - GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") + GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker", max_chunk_tokens=512) ): assert ground_truth["word_count"] == paragraph_data.word_count, ( f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py index 82d304b6c..e9530fc12 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ b/cognee/tests/integration/documents/PdfDocument_test.py @@ -25,7 +25,7 @@ def test_PdfDocument(): ) for ground_truth, paragraph_data in zip( - GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker") + GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker", max_chunk_tokens=2048) ): assert ground_truth["word_count"] == paragraph_data.word_count, ( f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 17db39be8..99e28a3ac 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -37,7 +37,8 @@ def test_TextDocument(input_file, chunk_size): ) for ground_truth, paragraph_data in zip( - GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker") + GROUND_TRUTH[input_file], + document.read(chunk_size=chunk_size, chunker="text_chunker", max_chunk_tokens=1024), ): assert ground_truth["word_count"] == paragraph_data.word_count, ( f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index 81e804f07..d76843c0a 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -68,7 +68,9 @@ def test_UnstructuredDocument(): ) # Test PPTX - for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): + for paragraph_data in pptx_document.read( + chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024 + ): assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" assert "sentence_cut" == paragraph_data.cut_type, ( @@ -76,7 +78,9 @@ def test_UnstructuredDocument(): ) # Test DOCX - for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): + for paragraph_data in docx_document.read( + chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024 + ): assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" assert "sentence_end" == paragraph_data.cut_type, ( @@ -84,7 +88,9 @@ def test_UnstructuredDocument(): ) # TEST CSV - for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): + for paragraph_data in csv_document.read( + chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024 + ): assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, ( f"Read text doesn't match expected text: {paragraph_data.text}" @@ -94,7 +100,9 @@ def test_UnstructuredDocument(): ) # Test XLSX - for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): + for paragraph_data in xlsx_document.read( + chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024 + ): assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" assert "sentence_cut" == paragraph_data.cut_type, (