feat: Remove the need for libmagic for unstructured documents

Remove the need for libmagic for unstructured documents by providing mime_type information. Feature COG-685
2024-12-08 14:37:50 +01:00 · 2024-12-08 14:37:50 +01:00 · 62db3f8598
commit 62db3f8598
parent 78214456a6
3 changed files with 11 additions and 1 deletions
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@ -6,6 +6,7 @@ class Document(DataPoint):
    name: str
    raw_data_location: str
    metadata_id: UUID
+    mime_type: str

    def read(self, chunk_size: int) -> str:
        pass
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@ -9,7 +9,7 @@ class UnstructuredDocument(Document):
    def read(self, chunk_size: int):
        def get_text():
            from unstructured.partition.auto import partition
-            elements = partition(self.raw_data_location)
+            elements = partition(self.raw_data_location, content_type=self.mime_type)
            in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))
            in_memory_file.seek(0)

--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@ -13,6 +13,14 @@ EXTENSION_TO_DOCUMENT_CLASS = {
    "pdf": PdfDocument,  # Text documents
    "txt": TextDocument,
    "docx": UnstructuredDocument,
+    "doc": UnstructuredDocument,
+    "odt": UnstructuredDocument,
+    "xls": UnstructuredDocument,
+    "xlsx": UnstructuredDocument,
+    "ppt": UnstructuredDocument,
+    "pptx": UnstructuredDocument,
+    "odp": UnstructuredDocument,
+    "ods": UnstructuredDocument,
    "png": ImageDocument,  # Image documents
    "dwg": ImageDocument,
    "xcf": ImageDocument,
@ -50,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
            title = f"{data_item.name}.{data_item.extension}",
            raw_data_location = data_item.raw_data_location,
            name = data_item.name,
+            mime_type = data_item.mime_type,
            metadata_id = metadata.id
        )
        documents.append(document)