feat: Add unstructured document handler

Added the unstructured library and routed handling of certain document types (currently docx) through it.

Feature COG-685
This commit is contained in:
Igor Ilic 2024-12-06 17:50:22 +01:00
parent 8415279cb2
commit 78214456a6
3 changed files with 29 additions and 0 deletions

View file

@ -0,0 +1,26 @@
from io import StringIO
from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
class UnstructuredDocument(Document):
    """Document whose text is extracted with the ``unstructured`` library."""

    type: str = "unstructured"

    def read(self, chunk_size: int):
        """Yield the chunks ``TextChunker`` produces for this document.

        The file at ``self.raw_data_location`` is partitioned by
        ``unstructured``; the string forms of the resulting elements are
        joined with blank lines and handed to the chunker in pieces of at
        most 1024 characters.

        Args:
            chunk_size: Forwarded unchanged to ``TextChunker``.

        Yields:
            Whatever ``TextChunker.read()`` yields.
        """

        def get_text():
            # Imported inside the generator, so ``unstructured`` is only
            # loaded once text is actually requested.
            from unstructured.partition.auto import partition

            elements = partition(self.raw_data_location)
            buffer = StringIO("\n\n".join(str(element) for element in elements))

            while True:
                piece = buffer.read(1024)

                if not piece:
                    # An empty read is the only reliable end-of-text signal.
                    break

                if not piece.strip():
                    # A whitespace-only window is not the end of the
                    # document: skip it rather than stopping, otherwise
                    # everything after it would be silently dropped.
                    continue

                yield piece

        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text)

        yield from chunker.read()

View file

@ -3,3 +3,4 @@ from .PdfDocument import PdfDocument
from .TextDocument import TextDocument from .TextDocument import TextDocument
from .ImageDocument import ImageDocument from .ImageDocument import ImageDocument
from .AudioDocument import AudioDocument from .AudioDocument import AudioDocument
from .UnstructuredDocument import UnstructuredDocument

View file

@ -5,12 +5,14 @@ from cognee.modules.data.processing.document_types import (
AudioDocument, AudioDocument,
ImageDocument, ImageDocument,
TextDocument, TextDocument,
UnstructuredDocument,
) )
from cognee.modules.data.operations.get_metadata import get_metadata from cognee.modules.data.operations.get_metadata import get_metadata
EXTENSION_TO_DOCUMENT_CLASS = { EXTENSION_TO_DOCUMENT_CLASS = {
"pdf": PdfDocument, # Text documents "pdf": PdfDocument, # Text documents
"txt": TextDocument, "txt": TextDocument,
"docx": UnstructuredDocument,
"png": ImageDocument, # Image documents "png": ImageDocument, # Image documents
"dwg": ImageDocument, "dwg": ImageDocument,
"xcf": ImageDocument, "xcf": ImageDocument,