test: Add tests for different document types

Add tests for unstructured reading of different document types (PPTX, DOCX, CSV, XLSX). Test COG-685
2024-12-09 15:20:50 +01:00 · 2024-12-09 15:20:50 +01:00 · d7d559f4f7
commit d7d559f4f7
parent df289deb18
4 changed files with 63 additions and 2 deletions
--- a/cognee/tests/integration/documents/UnstructuredDocument_test.py
+++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py
@ -4,19 +4,77 @@ import uuid
 from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
 def test_UnstructuredDocument():
     """Verify that UnstructuredDocument reads the PPTX, DOCX, CSV and XLSX fixtures.

     Each fixture is read with ``chunk_size=1024`` and every yielded chunk is
     checked for its word count, its text (length or exact content) and its
     ``cut_type``. The test also fails if a document yields no chunks at all,
     so a reader that silently produces nothing cannot pass unnoticed.
     """
     # The fixtures live in "<tests>/test_data", two directories above the one
     # containing this file. abspath() keeps this correct even when __file__
     # is relative, and avoids splitting/re-joining path components by hand.
     test_data_dir = os.path.join(
         os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
         "test_data",
     )

     def make_document(file_name, mime_type):
         # Build a document pointing at the fixture named ``file_name``.
         return UnstructuredDocument(
             id=uuid.uuid4(),
             name=file_name,
             raw_data_location=os.path.join(test_data_dir, file_name),
             metadata_id=uuid.uuid4(),
             mime_type=mime_type,
         )

     def read_chunks(document, file_name):
         # Materialize the chunks so an empty read fails instead of skipping
         # every assertion in the loops below.
         chunks = list(document.read(chunk_size=1024))
         assert chunks, f'No chunks were read from {file_name}'
         return chunks

     # Define test documents
     pptx_document = make_document(
         "example.pptx",
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     )
     docx_document = make_document(
         "example.docx",
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
     )
     csv_document = make_document("example.csv", "text/csv")
     xlsx_document = make_document(
         "example.xlsx",
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
     )

     # Test PPTX
     for paragraph_data in read_chunks(pptx_document, "example.pptx"):
         assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
         assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
         assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

     # Test DOCX
     for paragraph_data in read_chunks(docx_document, "example.docx"):
         assert 16 == paragraph_data.word_count, f' 16 != {paragraph_data.word_count = }'
         assert 145 == len(paragraph_data.text), f' 145 != {len(paragraph_data.text) = }'
         assert 'sentence_end' == paragraph_data.cut_type, f' sentence_end != {paragraph_data.cut_type = }'

     # Test CSV
     for paragraph_data in read_chunks(csv_document, "example.csv"):
         assert 15 == paragraph_data.word_count, f' 15 != {paragraph_data.word_count = }'
         assert 'A A A A A A A A A,A A A A A A,A A' == paragraph_data.text, \
             f'Read text doesn\'t match expected text: {paragraph_data.text}'
         assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

     # Test XLSX
     for paragraph_data in read_chunks(xlsx_document, "example.xlsx"):
         assert 36 == paragraph_data.word_count, f' 36 != {paragraph_data.word_count = }'
         assert 171 == len(paragraph_data.text), f' 171 != {len(paragraph_data.text) = }'
         assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
--- a/cognee/tests/test_data/example.csv
+++ b/cognee/tests/test_data/example.csv
@ -0,0 +1,3 @@
 A,A,A,A,A
 A,A,A,"A,A",A
 A,A,A,"A,A",A
--- a/cognee/tests/test_data/example.docx
+++ b/cognee/tests/test_data/example.docx
--- a/cognee/tests/test_data/example.xlsx
+++ b/cognee/tests/test_data/example.xlsx