Fix for now

This commit is contained in:
vasilije 2025-01-16 20:58:10 +01:00
parent 5aaf420f02
commit 72b503f198

View file

@ -3,100 +3,100 @@ import uuid
from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
#
# def test_UnstructuredDocument(): def test_UnstructuredDocument():
# # Define file paths of test data # Define file paths of test data
# pptx_file_path = os.path.join( pptx_file_path = os.path.join(
# os.sep, os.sep,
# *(os.path.dirname(__file__).split(os.sep)[:-2]), *(os.path.dirname(__file__).split(os.sep)[:-2]),
# "test_data", "test_data",
# "example.pptx", "example.pptx",
# ) )
#
# docx_file_path = os.path.join( docx_file_path = os.path.join(
# os.sep, os.sep,
# *(os.path.dirname(__file__).split(os.sep)[:-2]), *(os.path.dirname(__file__).split(os.sep)[:-2]),
# "test_data", "test_data",
# "example.docx", "example.docx",
# ) )
#
# csv_file_path = os.path.join( csv_file_path = os.path.join(
# os.sep, os.sep,
# *(os.path.dirname(__file__).split(os.sep)[:-2]), *(os.path.dirname(__file__).split(os.sep)[:-2]),
# "test_data", "test_data",
# "example.csv", "example.csv",
# ) )
#
# xlsx_file_path = os.path.join( xlsx_file_path = os.path.join(
# os.sep, os.sep,
# *(os.path.dirname(__file__).split(os.sep)[:-2]), *(os.path.dirname(__file__).split(os.sep)[:-2]),
# "test_data", "test_data",
# "example.xlsx", "example.xlsx",
# ) )
#
# # Define test documents # Define test documents
# pptx_document = UnstructuredDocument( pptx_document = UnstructuredDocument(
# id=uuid.uuid4(), id=uuid.uuid4(),
# name="example.pptx", name="example.pptx",
# raw_data_location=pptx_file_path, raw_data_location=pptx_file_path,
# metadata_id=uuid.uuid4(), metadata_id=uuid.uuid4(),
# mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation", mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
# ) )
#
# docx_document = UnstructuredDocument( docx_document = UnstructuredDocument(
# id=uuid.uuid4(), id=uuid.uuid4(),
# name="example.docx", name="example.docx",
# raw_data_location=docx_file_path, raw_data_location=docx_file_path,
# metadata_id=uuid.uuid4(), metadata_id=uuid.uuid4(),
# mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
# ) )
#
# csv_document = UnstructuredDocument( csv_document = UnstructuredDocument(
# id=uuid.uuid4(), id=uuid.uuid4(),
# name="example.csv", name="example.csv",
# raw_data_location=csv_file_path, raw_data_location=csv_file_path,
# metadata_id=uuid.uuid4(), metadata_id=uuid.uuid4(),
# mime_type="text/csv", mime_type="text/csv",
# ) )
#
# xlsx_document = UnstructuredDocument( xlsx_document = UnstructuredDocument(
# id=uuid.uuid4(), id=uuid.uuid4(),
# name="example.xlsx", name="example.xlsx",
# raw_data_location=xlsx_file_path, raw_data_location=xlsx_file_path,
# metadata_id=uuid.uuid4(), metadata_id=uuid.uuid4(),
# mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
# ) )
#
# # Test PPTX # Test PPTX
# for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
# assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
# assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
# assert "sentence_cut" == paragraph_data.cut_type, ( assert "sentence_cut" == paragraph_data.cut_type, (
# f" sentence_cut != {paragraph_data.cut_type = }" f" sentence_cut != {paragraph_data.cut_type = }"
# ) )
#
# # Test DOCX # Test DOCX
# for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
# assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
# assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
# assert "sentence_end" == paragraph_data.cut_type, ( assert "sentence_end" == paragraph_data.cut_type, (
# f" sentence_end != {paragraph_data.cut_type = }" f" sentence_end != {paragraph_data.cut_type = }"
# ) )
#
# # TEST CSV # TEST CSV
# for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
# assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
# assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, ( assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
# f"Read text doesn't match expected text: {paragraph_data.text}" f"Read text doesn't match expected text: {paragraph_data.text}"
# ) )
# assert "sentence_cut" == paragraph_data.cut_type, ( assert "sentence_cut" == paragraph_data.cut_type, (
# f" sentence_cut != {paragraph_data.cut_type = }" f" sentence_cut != {paragraph_data.cut_type = }"
# ) )
#
# # Test XLSX # Test XLSX
# for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
# assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
# assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
# assert "sentence_cut" == paragraph_data.cut_type, ( assert "sentence_cut" == paragraph_data.cut_type, (
# f" sentence_cut != {paragraph_data.cut_type = }" f" sentence_cut != {paragraph_data.cut_type = }"
# ) )