110 lines
3.9 KiB
Python
110 lines
3.9 KiB
Python
import os
|
|
import sys
|
|
import uuid
|
|
import pytest
|
|
import pathlib
|
|
from unittest.mock import patch
|
|
|
|
from cognee.modules.chunking.TextChunker import TextChunker
|
|
from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
|
|
from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
|
|
|
|
# Look up the already-imported chunk_by_sentence module so the test below can
# patch its `get_embedding_engine` attribute. `sys.modules.get` yields None
# when the module has not been imported yet.
_CHUNK_BY_SENTENCE_MODULE_NAME = "cognee.tasks.chunks.chunk_by_sentence"
chunk_by_sentence_module = sys.modules.get(_CHUNK_BY_SENTENCE_MODULE_NAME)
|
|
|
|
|
|
@patch.object(
    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
)
@pytest.mark.asyncio
async def test_UnstructuredDocument(mock_engine):
    """Read PPTX, DOCX, CSV and XLSX fixtures through UnstructuredDocument and check the chunks.

    `get_embedding_engine` on the chunk_by_sentence module is patched for the
    duration of the test so that calls to it are routed to
    `mock_get_embedding_engine`.

    For every fixture the test asserts the chunk size, the chunk text (or its
    length) and the cut type of each yielded chunk, and additionally asserts
    that at least one chunk was yielded, so an empty read cannot pass silently.

    Args:
        mock_engine: The mock object injected by `patch.object`; it is not
            used directly in the test body.
    """
    # Fixtures live in the "test_data" directory three levels above this file.
    test_data_dir = os.path.join(pathlib.Path(__file__).parent.parent.parent, "test_data")

    def build_document(file_name, mime_type):
        """Create an UnstructuredDocument that points at a fixture in test_data."""
        return UnstructuredDocument(
            id=uuid.uuid4(),
            name=file_name,
            raw_data_location=os.path.join(test_data_dir, file_name),
            external_metadata="",
            mime_type=mime_type,
        )

    # Define test documents
    pptx_document = build_document(
        "example.pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )
    docx_document = build_document(
        "example.docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    csv_document = build_document("example.csv", "text/csv")
    xlsx_document = build_document(
        "example.xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    # Test PPTX
    pptx_chunk_count = 0
    async for paragraph_data in pptx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
        pptx_chunk_count += 1
        assert 19 == paragraph_data.chunk_size, f" 19 != {paragraph_data.chunk_size = }"
        assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
        assert "sentence_cut" == paragraph_data.cut_type, (
            f" sentence_cut != {paragraph_data.cut_type = }"
        )
    # Without this guard an empty read would skip every assertion above.
    assert pptx_chunk_count > 0, "PPTX document produced no chunks"

    # Test DOCX
    docx_chunk_count = 0
    async for paragraph_data in docx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
        docx_chunk_count += 1
        assert 16 == paragraph_data.chunk_size, f" 16 != {paragraph_data.chunk_size = }"
        assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
        assert "sentence_end" == paragraph_data.cut_type, (
            f" sentence_end != {paragraph_data.cut_type = }"
        )
    assert docx_chunk_count > 0, "DOCX document produced no chunks"

    # Test CSV
    csv_chunk_count = 0
    async for paragraph_data in csv_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
        csv_chunk_count += 1
        assert 15 == paragraph_data.chunk_size, f" 15 != {paragraph_data.chunk_size = }"
        assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
            f"Read text doesn't match expected text: {paragraph_data.text}"
        )
        assert "sentence_cut" == paragraph_data.cut_type, (
            f" sentence_cut != {paragraph_data.cut_type = }"
        )
    assert csv_chunk_count > 0, "CSV document produced no chunks"

    # Test XLSX
    xlsx_chunk_count = 0
    async for paragraph_data in xlsx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
        xlsx_chunk_count += 1
        assert 36 == paragraph_data.chunk_size, f" 36 != {paragraph_data.chunk_size = }"
        assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
        assert "sentence_cut" == paragraph_data.cut_type, (
            f" sentence_cut != {paragraph_data.cut_type = }"
        )
    assert xlsx_chunk_count > 0, "XLSX document produced no chunks"
|