Add TextDocument_test.py
This commit is contained in:
parent
8afb25e0d4
commit
e6636754ff
2 changed files with 48 additions and 2 deletions
|
|
@ -16,12 +16,12 @@ def test_PdfDocument():
|
||||||
"test_data",
|
"test_data",
|
||||||
"artificial-intelligence.pdf",
|
"artificial-intelligence.pdf",
|
||||||
)
|
)
|
||||||
pdf_doc = PdfDocument(
|
document = PdfDocument(
|
||||||
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path
|
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path
|
||||||
)
|
)
|
||||||
|
|
||||||
for ground_truth, paragraph_data in zip(
|
for ground_truth, paragraph_data in zip(
|
||||||
GROUND_TRUTH, pdf_doc.read(chunk_size=1024)
|
GROUND_TRUTH, document.read(chunk_size=1024)
|
||||||
):
|
):
|
||||||
assert (
|
assert (
|
||||||
ground_truth["word_count"] == paragraph_data.word_count
|
ground_truth["word_count"] == paragraph_data.word_count
|
||||||
|
|
|
||||||
46
cognee/tests/integration/documents/TextDocument_test.py
Normal file
46
cognee/tests/integration/documents/TextDocument_test.py
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from cognee.modules.data.processing.document_types.TextDocument import TextDocument
|
||||||
|
|
||||||
|
# Expected chunk statistics per fixture file: for each input file, the
# ordered (word_count, len_text) pairs that TextDocument.read() should
# produce; every expected chunk ends on a paragraph boundary.
GROUND_TRUTH = {
    file_name: [
        {"word_count": word_count, "len_text": len_text, "cut_type": "paragraph_end"}
        for word_count, len_text in chunk_stats
    ]
    for file_name, chunk_stats in (
        ("code.txt", ((253, 953), (157, 905))),
        ("Natural_language_processing.txt", ((115, 839), (15, 146))),
    )
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "input_file,chunk_size",
    [("code.txt", 256), ("Natural_language_processing.txt", 128)],
)
def test_TextDocument(input_file, chunk_size):
    """Verify TextDocument.read() chunking against precomputed ground truth.

    For each fixture file, reads the document with the given chunk size and
    checks every produced paragraph chunk's word count, text length, and cut
    type against GROUND_TRUTH, in order.
    """
    # Fixture lives two directories up from this test module, under test_data/.
    test_file_path = os.path.join(
        os.sep,
        *(os.path.dirname(__file__).split(os.sep)[:-2]),
        "test_data",
        input_file,
    )
    document = TextDocument(
        id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path
    )

    expected_chunks = GROUND_TRUTH[input_file]
    produced = 0
    for ground_truth, paragraph_data in zip(
        expected_chunks, document.read(chunk_size=chunk_size)
    ):
        produced += 1
        assert (
            ground_truth["word_count"] == paragraph_data.word_count
        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
        assert ground_truth["len_text"] == len(
            paragraph_data.text
        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
        assert (
            ground_truth["cut_type"] == paragraph_data.cut_type
        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'

    # zip() stops at the shorter iterable, so a reader that yields too few
    # chunks would otherwise pass vacuously — require full coverage.
    assert produced == len(
        expected_chunks
    ), f"expected {len(expected_chunks)} chunks, got {produced}"
|
||||||
Loading…
Add table
Reference in a new issue