diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 773fc30c8..45441dcce 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -6,6 +6,7 @@ class Document(DataPoint): name: str raw_data_location: str metadata_id: UUID + mime_type: str def read(self, chunk_size: int) -> str: pass diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index c48423653..68ccbe1f2 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -9,7 +9,7 @@ class UnstructuredDocument(Document): def read(self, chunk_size: int): def get_text(): from unstructured.partition.auto import partition - elements = partition(self.raw_data_location) + elements = partition(self.raw_data_location, content_type=self.mime_type) in_memory_file = StringIO("\n\n".join([str(el) for el in elements])) in_memory_file.seek(0) diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index abef4ea9e..47beeb917 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -13,6 +13,14 @@ EXTENSION_TO_DOCUMENT_CLASS = { "pdf": PdfDocument, # Text documents "txt": TextDocument, "docx": UnstructuredDocument, + "doc": UnstructuredDocument, + "odt": UnstructuredDocument, + "xls": UnstructuredDocument, + "xlsx": UnstructuredDocument, + "ppt": UnstructuredDocument, + "pptx": UnstructuredDocument, + "odp": UnstructuredDocument, + "ods": UnstructuredDocument, "png": ImageDocument, # Image documents "dwg": ImageDocument, "xcf": ImageDocument, @@ -50,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]: title = f"{data_item.name}.{data_item.extension}", raw_data_location = data_item.raw_data_location, name = data_item.name, + mime_type = data_item.mime_type, metadata_id = metadata.id ) documents.append(document)