feat: Remove the need for libmagic for unstructured documents
Remove the need for libmagic for unstructured documents by providing mime_type information. Feature COG-685
This commit is contained in:
parent
78214456a6
commit
62db3f8598
3 changed files with 11 additions and 1 deletions
|
|
@ -6,6 +6,7 @@ class Document(DataPoint):
|
|||
name: str
|
||||
raw_data_location: str
|
||||
metadata_id: UUID
|
||||
mime_type: str
|
||||
|
||||
def read(self, chunk_size: int) -> str:
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ class UnstructuredDocument(Document):
|
|||
def read(self, chunk_size: int):
|
||||
def get_text():
|
||||
from unstructured.partition.auto import partition
|
||||
elements = partition(self.raw_data_location)
|
||||
elements = partition(self.raw_data_location, content_type=self.mime_type)
|
||||
in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))
|
||||
in_memory_file.seek(0)
|
||||
|
||||
|
|
|
|||
|
|
@ -13,6 +13,14 @@ EXTENSION_TO_DOCUMENT_CLASS = {
|
|||
"pdf": PdfDocument, # Text documents
|
||||
"txt": TextDocument,
|
||||
"docx": UnstructuredDocument,
|
||||
"doc": UnstructuredDocument,
|
||||
"odt": UnstructuredDocument,
|
||||
"xls": UnstructuredDocument,
|
||||
"xlsx": UnstructuredDocument,
|
||||
"ppt": UnstructuredDocument,
|
||||
"pptx": UnstructuredDocument,
|
||||
"odp": UnstructuredDocument,
|
||||
"ods": UnstructuredDocument,
|
||||
"png": ImageDocument, # Image documents
|
||||
"dwg": ImageDocument,
|
||||
"xcf": ImageDocument,
|
||||
|
|
@ -50,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
|
|||
title = f"{data_item.name}.{data_item.extension}",
|
||||
raw_data_location = data_item.raw_data_location,
|
||||
name = data_item.name,
|
||||
mime_type = data_item.mime_type,
|
||||
metadata_id = metadata.id
|
||||
)
|
||||
documents.append(document)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue