test: Add test for Unstructured pptx document type

Added pptx example file and tested Unstructured pptx document type handling

Test COG-685
This commit is contained in:
Igor Ilic 2024-12-08 15:18:42 +01:00
parent 07d9330e4a
commit 596b3edf72
2 changed files with 22 additions and 0 deletions

View file

@ -0,0 +1,22 @@
import os
import uuid
from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
def test_UnstructuredDocument():
    """Verify chunking of the example pptx file through UnstructuredDocument.

    Loads ``test_data/example.pptx`` (located two directory levels above the
    directory containing this file), wraps it in an ``UnstructuredDocument``
    with the PowerPoint MIME type, and checks that every chunk produced by
    ``read(chunk_size=1024)`` has the expected word count, text length and
    cut type. The test also fails if no chunks are produced at all, so an
    empty read cannot pass silently.
    """
    # Resolve <this file's dir>/../../test_data/example.pptx without manually
    # splitting on os.sep, which breaks for relative paths and drive letters.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(this_dir))
    pptx_file_path = os.path.join(base_dir, "test_data", "example.pptx")

    pptx_document = UnstructuredDocument(
        id=uuid.uuid4(),
        name="example.pptx",
        raw_data_location=pptx_file_path,
        metadata_id=uuid.uuid4(),
        mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    # Count chunks so that a reader yielding nothing is reported as a failure
    # instead of skipping every assertion in the loop body.
    chunk_count = 0
    for paragraph_data in pptx_document.read(chunk_size=1024):
        chunk_count += 1
        assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
        assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

    assert chunk_count > 0, f"no chunks were read from {pptx_file_path}"

Binary file not shown.