From 78214456a69c60c0aaf48a221e20250c83115d5e Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 6 Dec 2024 17:50:22 +0100
Subject: [PATCH] feat: Add unstructured document handler

Added unstructured library and handling of certain document types through their library

Feature COG-685
---
 .../document_types/UnstructuredDocument.py   | 35 +++++++++++++++++++
 .../processing/document_types/__init__.py    |  1 +
 cognee/tasks/documents/classify_documents.py |  2 ++
 3 files changed, 38 insertions(+)
 create mode 100644 cognee/modules/data/processing/document_types/UnstructuredDocument.py

diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
new file mode 100644
index 000000000..c48423653
--- /dev/null
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -0,0 +1,35 @@
+from io import StringIO
+
+from cognee.modules.chunking.TextChunker import TextChunker
+from .Document import Document
+
+class UnstructuredDocument(Document):
+    """Document whose text is extracted with the `unstructured` library."""
+
+    type: str = "unstructured"
+
+    def read(self, chunk_size: int):
+        """Yield the chunks TextChunker produces from the partitioned text."""
+
+        def get_text():
+            # Function-scope import: `unstructured` is loaded only when the
+            # text of this document is actually read.
+            from unstructured.partition.auto import partition
+
+            elements = partition(self.raw_data_location)
+            in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))
+
+            while True:
+                text = in_memory_file.read(1024)
+
+                # Only an empty read means end of input; a window that is
+                # merely whitespace must not cut off the text after it.
+                if not text:
+                    break
+
+                if text.strip():
+                    yield text
+
+        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)
+
+        yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/__init__.py b/cognee/modules/data/processing/document_types/__init__.py
index 9682cc101..2e862f4ba 100644
--- a/cognee/modules/data/processing/document_types/__init__.py
+++ b/cognee/modules/data/processing/document_types/__init__.py
@@ -3,3 +3,4 @@ from .PdfDocument import PdfDocument
 from .TextDocument import TextDocument
 from .ImageDocument import ImageDocument
 from .AudioDocument import AudioDocument
+from .UnstructuredDocument import UnstructuredDocument
diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py
index d007b6888..abef4ea9e 100644
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@@ -5,12 +5,14 @@ from cognee.modules.data.processing.document_types import (
     AudioDocument,
     ImageDocument,
     TextDocument,
+    UnstructuredDocument,
 )
 from cognee.modules.data.operations.get_metadata import get_metadata
 
 EXTENSION_TO_DOCUMENT_CLASS = {
     "pdf": PdfDocument, # Text documents
     "txt": TextDocument,
+    "docx": UnstructuredDocument,
     "png": ImageDocument, # Image documents
     "dwg": ImageDocument,
     "xcf": ImageDocument,