diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py
index 56c3c827f..7f6e20ba0 100644
--- a/cognee/tests/integration/documents/UnstructuredDocument_test.py
+++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py
@@ -4,19 +4,78 @@ import uuid
 from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
 
 def test_UnstructuredDocument():
-    docx_file_path = os.path.join(
+    """Read the PPTX, DOCX, CSV and XLSX fixtures and check each chunk's word count, text (or its length) and cut type."""
+    # Define file paths of test data
+    pptx_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
         "test_data",
         "example.pptx",
     )
 
+    docx_file_path = os.path.join(
+        os.sep,
+        *(os.path.dirname(__file__).split(os.sep)[:-2]),
+        "test_data",
+        "example.docx",
+    )
+
+    csv_file_path = os.path.join(
+        os.sep,
+        *(os.path.dirname(__file__).split(os.sep)[:-2]),
+        "test_data",
+        "example.csv",
+    )
+
+    xlsx_file_path = os.path.join(
+        os.sep,
+        *(os.path.dirname(__file__).split(os.sep)[:-2]),
+        "test_data",
+        "example.xlsx",
+    )
+
+    # Define test documents
     pptx_document = UnstructuredDocument(
-        id=uuid.uuid4(), name="example.pptx", raw_data_location=docx_file_path, metadata_id=uuid.uuid4(),
+        id=uuid.uuid4(), name="example.pptx", raw_data_location=pptx_file_path, metadata_id=uuid.uuid4(),
         mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation"
     )
 
+    docx_document = UnstructuredDocument(
+        id=uuid.uuid4(), name="example.docx", raw_data_location=docx_file_path, metadata_id=uuid.uuid4(),
+        mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    )
+
+    csv_document = UnstructuredDocument(
+        id=uuid.uuid4(), name="example.csv", raw_data_location=csv_file_path, metadata_id=uuid.uuid4(),
+        mime_type="text/csv"
+    )
+
+    xlsx_document = UnstructuredDocument(
+        id=uuid.uuid4(), name="example.xlsx", raw_data_location=xlsx_file_path, metadata_id=uuid.uuid4(),
+        mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    )
+
+    # Test PPTX
     for paragraph_data in pptx_document.read(chunk_size=1024):
         assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
         assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
         assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
+
+    # Test DOCX
+    for paragraph_data in docx_document.read(chunk_size=1024):
+        assert 16 == paragraph_data.word_count, f' 16 != {paragraph_data.word_count = }'
+        assert 145 == len(paragraph_data.text), f' 145 != {len(paragraph_data.text) = }'
+        assert 'sentence_end' == paragraph_data.cut_type, f' sentence_end != {paragraph_data.cut_type = }'
+
+    # Test CSV
+    for paragraph_data in csv_document.read(chunk_size=1024):
+        assert 15 == paragraph_data.word_count, f' 15 != {paragraph_data.word_count = }'
+        assert 'A A A A A A A A A,A A A A A A,A A' == paragraph_data.text, \
+            f'Read text doesn\'t match expected text: {paragraph_data.text}'
+        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
+
+    # Test XLSX
+    for paragraph_data in xlsx_document.read(chunk_size=1024):
+        assert 36 == paragraph_data.word_count, f' 36 != {paragraph_data.word_count = }'
+        assert 171 == len(paragraph_data.text), f' 171 != {len(paragraph_data.text) = }'
+        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/test_data/example.csv b/cognee/tests/test_data/example.csv
new file mode 100644
index 000000000..004bbf122
--- /dev/null
+++ b/cognee/tests/test_data/example.csv
@@ -0,0 +1,3 @@
+A,A,A,A,A
+A,A,A,"A,A",A
+A,A,A,"A,A",A
diff --git a/cognee/tests/test_data/example.docx b/cognee/tests/test_data/example.docx
new file mode 100644
index 000000000..81ff7f2d4
Binary files /dev/null and b/cognee/tests/test_data/example.docx differ
diff --git a/cognee/tests/test_data/example.xlsx b/cognee/tests/test_data/example.xlsx
new file mode 100644
index 000000000..944533d38
Binary files /dev/null and b/cognee/tests/test_data/example.xlsx differ