feat: Add unstructured document handler
Added the unstructured library and used it to handle certain document types. Feature: COG-685
This commit is contained in:
parent
8415279cb2
commit
78214456a6
3 changed files with 29 additions and 0 deletions
|
|
@ -0,0 +1,26 @@
|
|||
from io import StringIO
|
||||
|
||||
from cognee.modules.chunking.TextChunker import TextChunker
|
||||
from .Document import Document
|
||||
|
||||
class UnstructuredDocument(Document):
    """Document whose text is extracted with the ``unstructured`` library."""

    type: str = "unstructured"

    def read(self, chunk_size: int):
        """Yield the chunks produced by ``TextChunker`` for this document.

        The text is obtained by passing ``self.raw_data_location`` to
        ``unstructured.partition.auto.partition`` and joining the string
        form of the returned elements with blank lines.

        Args:
            chunk_size: Forwarded unchanged to ``TextChunker``.

        Yields:
            Whatever ``TextChunker.read()`` yields.
        """

        def get_text():
            """Yield the extracted text in blocks of at most 1024 characters."""
            # Imported inside the function, so ``unstructured`` is only
            # loaded once the text is actually requested.
            from unstructured.partition.auto import partition

            elements = partition(self.raw_data_location)

            # A StringIO created with an initial value is already positioned
            # at the start, so no explicit seek is needed.
            in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))

            while True:
                text = in_memory_file.read(1024)

                # read() returns "" only at end of data. A whitespace-only
                # block is NOT end of data, so it must not end the loop,
                # otherwise everything after it would be silently dropped.
                if not text:
                    break

                # Blank blocks are still never handed to the chunker.
                if text.strip():
                    yield text

        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)

        yield from chunker.read()
|
||||
|
|
@ -3,3 +3,4 @@ from .PdfDocument import PdfDocument
|
|||
from .TextDocument import TextDocument
|
||||
from .ImageDocument import ImageDocument
|
||||
from .AudioDocument import AudioDocument
|
||||
from .UnstructuredDocument import UnstructuredDocument
|
||||
|
|
|
|||
|
|
@ -5,12 +5,14 @@ from cognee.modules.data.processing.document_types import (
|
|||
AudioDocument,
|
||||
ImageDocument,
|
||||
TextDocument,
|
||||
UnstructuredDocument,
|
||||
)
|
||||
from cognee.modules.data.operations.get_metadata import get_metadata
|
||||
|
||||
EXTENSION_TO_DOCUMENT_CLASS = {
|
||||
"pdf": PdfDocument, # Text documents
|
||||
"txt": TextDocument,
|
||||
"docx": UnstructuredDocument,
|
||||
"png": ImageDocument, # Image documents
|
||||
"dwg": ImageDocument,
|
||||
"xcf": ImageDocument,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue