test: Add tests for different document types

Add tests for unstructured reading for different document types

Test COG-685
This commit is contained in:
Igor Ilic 2024-12-09 15:20:50 +01:00
parent df289deb18
commit d7d559f4f7
4 changed files with 63 additions and 2 deletions

View file

@@ -4,19 +4,77 @@ import uuid
from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
def test_UnstructuredDocument():
    """Check that UnstructuredDocument reads PPTX, DOCX, CSV and XLSX fixtures.

    Each fixture is loaded from the ``test_data`` directory found by dropping
    the last two components of this file's directory. Every chunk yielded by
    ``read(chunk_size=1024)`` is compared against the expected word count,
    text (or text length) and cut type for that document type.
    """
    # Directory holding the fixtures; computed once instead of per file.
    test_data_dir = os.path.join(
        os.sep,
        *(os.path.dirname(__file__).split(os.sep)[:-2]),
        "test_data",
    )

    # Define file paths of test data
    pptx_file_path = os.path.join(test_data_dir, "example.pptx")
    docx_file_path = os.path.join(test_data_dir, "example.docx")
    csv_file_path = os.path.join(test_data_dir, "example.csv")
    xlsx_file_path = os.path.join(test_data_dir, "example.xlsx")

    # Define test documents
    pptx_document = UnstructuredDocument(
        id=uuid.uuid4(),
        name="example.pptx",
        raw_data_location=pptx_file_path,
        metadata_id=uuid.uuid4(),
        mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )
    docx_document = UnstructuredDocument(
        id=uuid.uuid4(),
        name="example.docx",
        raw_data_location=docx_file_path,
        metadata_id=uuid.uuid4(),
        mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    csv_document = UnstructuredDocument(
        id=uuid.uuid4(),
        name="example.csv",
        raw_data_location=csv_file_path,
        metadata_id=uuid.uuid4(),
        mime_type="text/csv",
    )
    xlsx_document = UnstructuredDocument(
        id=uuid.uuid4(),
        # Name matches the fixture file (was misspelled "example.xslx").
        name="example.xlsx",
        raw_data_location=xlsx_file_path,
        metadata_id=uuid.uuid4(),
        mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    # NOTE(review): each loop below passes vacuously if read() yields no
    # chunks -- consider also asserting that at least one chunk is produced.

    # Test PPTX
    for paragraph_data in pptx_document.read(chunk_size=1024):
        assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
        assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

    # Test DOCX
    for paragraph_data in docx_document.read(chunk_size=1024):
        assert 16 == paragraph_data.word_count, f' 16 != {paragraph_data.word_count = }'
        assert 145 == len(paragraph_data.text), f' 145 != {len(paragraph_data.text) = }'
        assert 'sentence_end' == paragraph_data.cut_type, f' sentence_end != {paragraph_data.cut_type = }'

    # Test CSV
    for paragraph_data in csv_document.read(chunk_size=1024):
        assert 15 == paragraph_data.word_count, f' 15 != {paragraph_data.word_count = }'
        assert 'A A A A A A A A A,A A A A A A,A A' == paragraph_data.text, \
            f'Read text doesn\'t match expected text: {paragraph_data.text}'
        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

    # Test XLSX
    for paragraph_data in xlsx_document.read(chunk_size=1024):
        assert 36 == paragraph_data.word_count, f' 36 != {paragraph_data.word_count = }'
        assert 171 == len(paragraph_data.text), f' 171 != {len(paragraph_data.text) = }'
        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

View file

@@ -0,0 +1,3 @@
A,A,A,A,A
A,A,A,"A,A",A
A,A,A,"A,A",A
1 A A A A A
2 A A A A,A A
3 A A A A,A A

Binary file not shown.

Binary file not shown.