test: Add tests for different document types
Add tests for unstructured reading of different document types. Test: COG-685
This commit is contained in:
parent
df289deb18
commit
d7d559f4f7
4 changed files with 63 additions and 2 deletions
|
|
@ -4,19 +4,77 @@ import uuid
|
||||||
from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
|
from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
|
||||||
|
|
||||||
def test_UnstructuredDocument():
|
def test_UnstructuredDocument():
|
||||||
docx_file_path = os.path.join(
|
# Define file paths of test data
|
||||||
|
pptx_file_path = os.path.join(
|
||||||
os.sep,
|
os.sep,
|
||||||
*(os.path.dirname(__file__).split(os.sep)[:-2]),
|
*(os.path.dirname(__file__).split(os.sep)[:-2]),
|
||||||
"test_data",
|
"test_data",
|
||||||
"example.pptx",
|
"example.pptx",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
docx_file_path = os.path.join(
|
||||||
|
os.sep,
|
||||||
|
*(os.path.dirname(__file__).split(os.sep)[:-2]),
|
||||||
|
"test_data",
|
||||||
|
"example.docx",
|
||||||
|
)
|
||||||
|
|
||||||
|
csv_file_path = os.path.join(
|
||||||
|
os.sep,
|
||||||
|
*(os.path.dirname(__file__).split(os.sep)[:-2]),
|
||||||
|
"test_data",
|
||||||
|
"example.csv",
|
||||||
|
)
|
||||||
|
|
||||||
|
xlsx_file_path = os.path.join(
|
||||||
|
os.sep,
|
||||||
|
*(os.path.dirname(__file__).split(os.sep)[:-2]),
|
||||||
|
"test_data",
|
||||||
|
"example.xlsx",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Define test documents
|
||||||
pptx_document = UnstructuredDocument(
|
pptx_document = UnstructuredDocument(
|
||||||
id=uuid.uuid4(), name="example.pptx", raw_data_location=docx_file_path, metadata_id=uuid.uuid4(),
|
id=uuid.uuid4(), name="example.pptx", raw_data_location=pptx_file_path, metadata_id=uuid.uuid4(),
|
||||||
mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
docx_document = UnstructuredDocument(
|
||||||
|
id=uuid.uuid4(), name="example.docx", raw_data_location=docx_file_path, metadata_id=uuid.uuid4(),
|
||||||
|
mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
)
|
||||||
|
|
||||||
|
csv_document = UnstructuredDocument(
|
||||||
|
id=uuid.uuid4(), name="example.csv", raw_data_location=csv_file_path, metadata_id=uuid.uuid4(),
|
||||||
|
mime_type="text/csv"
|
||||||
|
)
|
||||||
|
|
||||||
|
xlsx_document = UnstructuredDocument(
|
||||||
|
id=uuid.uuid4(), name="example.xslx", raw_data_location=xlsx_file_path, metadata_id=uuid.uuid4(),
|
||||||
|
mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test PPTX
|
||||||
for paragraph_data in pptx_document.read(chunk_size=1024):
|
for paragraph_data in pptx_document.read(chunk_size=1024):
|
||||||
assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
|
assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
|
||||||
assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
|
assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
|
||||||
assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
|
assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
|
||||||
|
|
||||||
|
# Test DOCX
|
||||||
|
for paragraph_data in docx_document.read(chunk_size=1024):
|
||||||
|
assert 16 == paragraph_data.word_count, f' 16 != {paragraph_data.word_count = }'
|
||||||
|
assert 145 == len(paragraph_data.text), f' 145 != {len(paragraph_data.text) = }'
|
||||||
|
assert 'sentence_end' == paragraph_data.cut_type, f' sentence_end != {paragraph_data.cut_type = }'
|
||||||
|
|
||||||
|
# Test CSV
|
||||||
|
for paragraph_data in csv_document.read(chunk_size=1024):
|
||||||
|
assert 15 == paragraph_data.word_count, f' 15 != {paragraph_data.word_count = }'
|
||||||
|
assert 'A A A A A A A A A,A A A A A A,A A' == paragraph_data.text, \
|
||||||
|
f'Read text doesn\'t match expected text: {paragraph_data.text}'
|
||||||
|
assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
|
||||||
|
|
||||||
|
# Test XLSX
|
||||||
|
for paragraph_data in xlsx_document.read(chunk_size=1024):
|
||||||
|
assert 36 == paragraph_data.word_count, f' 36 != {paragraph_data.word_count = }'
|
||||||
|
assert 171 == len(paragraph_data.text), f' 171 != {len(paragraph_data.text) = }'
|
||||||
|
assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
|
||||||
|
|
|
||||||
3
cognee/tests/test_data/example.csv
Normal file
3
cognee/tests/test_data/example.csv
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
A,A,A,A,A
|
||||||
|
A,A,A,"A,A",A
|
||||||
|
A,A,A,"A,A",A
|
||||||
|
BIN
cognee/tests/test_data/example.docx
Normal file
BIN
cognee/tests/test_data/example.docx
Normal file
Binary file not shown.
BIN
cognee/tests/test_data/example.xlsx
Normal file
BIN
cognee/tests/test_data/example.xlsx
Normal file
Binary file not shown.
Loading…
Add table
Reference in a new issue