feat: Remove the need for libmagic for unstructured documents

Remove the need for libmagic for unstructured documents by providing mime_type information

Feature COG-685
This commit is contained in:
Igor Ilic 2024-12-08 14:37:50 +01:00
parent 78214456a6
commit 62db3f8598
3 changed files with 11 additions and 1 deletions

View file

@ -6,6 +6,7 @@ class Document(DataPoint):
name: str
raw_data_location: str
metadata_id: UUID
mime_type: str
def read(self, chunk_size: int) -> str:
pass

View file

@ -9,7 +9,7 @@ class UnstructuredDocument(Document):
def read(self, chunk_size: int):
def get_text():
from unstructured.partition.auto import partition
elements = partition(self.raw_data_location)
elements = partition(self.raw_data_location, content_type=self.mime_type)
in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))
in_memory_file.seek(0)

View file

@ -13,6 +13,14 @@ EXTENSION_TO_DOCUMENT_CLASS = {
"pdf": PdfDocument, # Text documents
"txt": TextDocument,
"docx": UnstructuredDocument,
"doc": UnstructuredDocument,
"odt": UnstructuredDocument,
"xls": UnstructuredDocument,
"xlsx": UnstructuredDocument,
"ppt": UnstructuredDocument,
"pptx": UnstructuredDocument,
"odp": UnstructuredDocument,
"ods": UnstructuredDocument,
"png": ImageDocument, # Image documents
"dwg": ImageDocument,
"xcf": ImageDocument,
@ -50,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
title = f"{data_item.name}.{data_item.extension}",
raw_data_location = data_item.raw_data_location,
name = data_item.name,
mime_type = data_item.mime_type,
metadata_id = metadata.id
)
documents.append(document)