feat: Add unstructured document handler

Added the unstructured library and used it to handle certain document types (e.g. docx).

Feature COG-685
This commit is contained in:
Igor Ilic 2024-12-06 17:50:22 +01:00
parent 8415279cb2
commit 78214456a6
3 changed files with 29 additions and 0 deletions

View file

@ -0,0 +1,26 @@
from io import StringIO
from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
class UnstructuredDocument(Document):
    """Document whose text is extracted with the ``unstructured`` library.

    ``read`` partitions the file at ``self.raw_data_location`` into elements,
    joins their string forms with blank lines, and hands that text to a
    ``TextChunker`` in fixed-size windows.
    """

    type: str = "unstructured"

    def read(self, chunk_size: int):
        """Yield the chunks produced by ``TextChunker`` for this document.

        Args:
            chunk_size: Passed through unchanged to ``TextChunker``.

        Yields:
            Whatever ``TextChunker.read()`` yields.
        """

        def get_text():
            # Imported here rather than at module level, so ``unstructured``
            # is only loaded once text is actually requested.
            from unstructured.partition.auto import partition

            elements = partition(self.raw_data_location)
            # A freshly created StringIO is already positioned at the start,
            # so no explicit seek is required before reading.
            in_memory_file = StringIO("\n\n".join(str(el) for el in elements))

            while True:
                text = in_memory_file.read(1024)

                if not text:
                    # read() returns "" only once the stream is exhausted.
                    break

                if not text.strip():
                    # A whitespace-only window is not the end of the document:
                    # skip it rather than stopping, so later text is not lost.
                    continue

                yield text

        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text)
        yield from chunker.read()

View file

@ -3,3 +3,4 @@ from .PdfDocument import PdfDocument
from .TextDocument import TextDocument
from .ImageDocument import ImageDocument
from .AudioDocument import AudioDocument
from .UnstructuredDocument import UnstructuredDocument

View file

@ -5,12 +5,14 @@ from cognee.modules.data.processing.document_types import (
AudioDocument,
ImageDocument,
TextDocument,
UnstructuredDocument,
)
from cognee.modules.data.operations.get_metadata import get_metadata
EXTENSION_TO_DOCUMENT_CLASS = {
"pdf": PdfDocument, # Text documents
"txt": TextDocument,
"docx": UnstructuredDocument,
"png": ImageDocument, # Image documents
"dwg": ImageDocument,
"xcf": ImageDocument,