feat: Remove the need for libmagic for unstructured documents
Remove the need for libmagic for unstructured documents by providing mime_type information. Feature COG-685
This commit is contained in:
parent
78214456a6
commit
62db3f8598
3 changed files with 11 additions and 1 deletion
|
|
@ -6,6 +6,7 @@ class Document(DataPoint):
|
||||||
name: str
|
name: str
|
||||||
raw_data_location: str
|
raw_data_location: str
|
||||||
metadata_id: UUID
|
metadata_id: UUID
|
||||||
|
mime_type: str
|
||||||
|
|
||||||
def read(self, chunk_size: int) -> str:
|
def read(self, chunk_size: int) -> str:
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ class UnstructuredDocument(Document):
|
||||||
def read(self, chunk_size: int):
|
def read(self, chunk_size: int):
|
||||||
def get_text():
|
def get_text():
|
||||||
from unstructured.partition.auto import partition
|
from unstructured.partition.auto import partition
|
||||||
elements = partition(self.raw_data_location)
|
elements = partition(self.raw_data_location, content_type=self.mime_type)
|
||||||
in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))
|
in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))
|
||||||
in_memory_file.seek(0)
|
in_memory_file.seek(0)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,14 @@ EXTENSION_TO_DOCUMENT_CLASS = {
|
||||||
"pdf": PdfDocument, # Text documents
|
"pdf": PdfDocument, # Text documents
|
||||||
"txt": TextDocument,
|
"txt": TextDocument,
|
||||||
"docx": UnstructuredDocument,
|
"docx": UnstructuredDocument,
|
||||||
|
"doc": UnstructuredDocument,
|
||||||
|
"odt": UnstructuredDocument,
|
||||||
|
"xls": UnstructuredDocument,
|
||||||
|
"xlsx": UnstructuredDocument,
|
||||||
|
"ppt": UnstructuredDocument,
|
||||||
|
"pptx": UnstructuredDocument,
|
||||||
|
"odp": UnstructuredDocument,
|
||||||
|
"ods": UnstructuredDocument,
|
||||||
"png": ImageDocument, # Image documents
|
"png": ImageDocument, # Image documents
|
||||||
"dwg": ImageDocument,
|
"dwg": ImageDocument,
|
||||||
"xcf": ImageDocument,
|
"xcf": ImageDocument,
|
||||||
|
|
@ -50,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
|
||||||
title = f"{data_item.name}.{data_item.extension}",
|
title = f"{data_item.name}.{data_item.extension}",
|
||||||
raw_data_location = data_item.raw_data_location,
|
raw_data_location = data_item.raw_data_location,
|
||||||
name = data_item.name,
|
name = data_item.name,
|
||||||
|
mime_type = data_item.mime_type,
|
||||||
metadata_id = metadata.id
|
metadata_id = metadata.id
|
||||||
)
|
)
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue