cognee/cognee/infrastructure/loaders/external/unstructured_loader.py
2025-07-13 22:34:27 +02:00

168 lines
5.9 KiB
Python

import os
from typing import List, Optional

from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
from cognee.infrastructure.loaders.models.LoaderResult import LoaderResult, ContentType
from cognee.shared.logging_utils import get_logger
class UnstructuredLoader(LoaderInterface):
    """
    Document loader using the unstructured library.

    Handles various document formats including docx, pptx, xlsx, odt, etc.
    Uses the unstructured library's auto-partition functionality.
    """

    # Longest prefix of an element's text kept in the debugging preview.
    _PREVIEW_CHARS = 100
    # How many element previews are exposed through ``source_info["elements"]``.
    _PREVIEW_ELEMENTS = 10

    def __init__(self):
        self.logger = get_logger(__name__)

    @property
    def supported_extensions(self) -> List[str]:
        """Return the lower-case, dot-prefixed file extensions this loader accepts."""
        return [
            ".docx",
            ".doc",
            ".odt",  # Word documents
            ".xlsx",
            ".xls",
            ".ods",  # Spreadsheets
            ".pptx",
            ".ppt",
            ".odp",  # Presentations
            ".rtf",
            ".html",
            ".htm",  # Rich text and HTML
            ".eml",
            ".msg",  # Email formats
            ".epub",  # eBooks
        ]

    @property
    def supported_mime_types(self) -> List[str]:
        """Return the lower-case MIME types this loader accepts."""
        return [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # docx
            "application/msword",  # doc
            "application/vnd.oasis.opendocument.text",  # odt
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # xlsx
            "application/vnd.ms-excel",  # xls
            "application/vnd.oasis.opendocument.spreadsheet",  # ods
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # pptx
            "application/vnd.ms-powerpoint",  # ppt
            "application/vnd.oasis.opendocument.presentation",  # odp
            "application/rtf",  # rtf
            "text/html",  # html
            "message/rfc822",  # eml
            # msg: ".msg" is a supported extension, so its MIME type must be
            # listed too or can_handle() rejects it whenever a type is supplied.
            "application/vnd.ms-outlook",
            "application/epub+zip",  # epub
        ]

    @property
    def loader_name(self) -> str:
        """Return the identifier recorded in ``metadata["loader"]``."""
        return "unstructured_loader"

    def get_dependencies(self) -> List[str]:
        """Return the requirement specifiers needed by this loader."""
        return ["unstructured>=0.10.0"]

    def can_handle(self, file_path: str, mime_type: Optional[str] = None) -> bool:
        """
        Check if file can be handled by this loader.

        Args:
            file_path: Path whose extension is matched case-insensitively
                against ``supported_extensions``.
            mime_type: Optional MIME type. When given (and non-empty) it must
                also be one of ``supported_mime_types``.

        Returns:
            False if the extension or the supplied MIME type is unsupported;
            otherwise the result of the inherited ``validate_dependencies()``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.supported_extensions:
            return False

        if mime_type:
            # Media types are case-insensitive and may carry parameters
            # (e.g. "text/html; charset=utf-8"); compare only the bare type.
            bare_type = mime_type.split(";", 1)[0].strip().lower()
            if bare_type not in self.supported_mime_types:
                return False

        return self.validate_dependencies()

    def _collect_text(self, elements):
        """Return (non-empty element texts, per-element type/preview dicts) for *elements*."""
        text_parts = []
        element_info = []
        limit = self._PREVIEW_CHARS
        for element in elements:
            element_text = str(element).strip()
            if not element_text:
                # Elements that render to nothing contribute neither text nor preview.
                continue
            text_parts.append(element_text)
            preview = element_text[:limit] + "..." if len(element_text) > limit else element_text
            element_info.append({"type": type(element).__name__, "text": preview})
        return text_parts, element_info

    async def load(self, file_path: str, strategy: str = "auto", **kwargs) -> LoaderResult:
        """
        Load document using unstructured library.

        NOTE: ``partition`` is called synchronously; nothing in this coroutine
        is awaited, so the call runs to completion before control returns to
        the event loop.

        Args:
            file_path: Path to the document file
            strategy: Partitioning strategy ("auto", "fast", "hi_res", "ocr_only")
            **kwargs: Additional arguments passed to unstructured partition.
                They are merged after ``filename``/``strategy`` and therefore
                take precedence over them.

        Returns:
            LoaderResult with extracted text content and metadata. ``chunks``
            holds one entry per non-empty element, ``content`` is those entries
            joined by blank lines, and ``content_type`` is STRUCTURED when more
            than one non-empty element was found, TEXT otherwise.

        Raises:
            ImportError: If unstructured is not installed
            Exception: If document processing fails
        """
        try:
            from unstructured.partition.auto import partition
        except ImportError as e:
            raise ImportError(
                "unstructured is required for document processing. "
                "Install with: pip install unstructured"
            ) from e

        try:
            self.logger.info(f"Processing document: {file_path}")

            # Basic file facts recorded in the metadata.
            file_ext = os.path.splitext(file_path)[1].lower()
            file_size = os.path.getsize(file_path)
            file_name = os.path.basename(file_path)

            # Use partition to extract elements.
            partition_kwargs = {"filename": file_path, "strategy": strategy, **kwargs}
            elements = partition(**partition_kwargs)

            text_parts, element_info = self._collect_text(elements)
            full_content = "\n\n".join(text_parts)

            # Determine content type based on structure.
            content_type = ContentType.STRUCTURED if len(element_info) > 1 else ContentType.TEXT

            metadata = {
                "name": file_name,
                "size": file_size,
                "extension": file_ext,
                "loader": self.loader_name,
                "elements_count": len(elements),
                "text_elements_count": len(text_parts),
                "strategy": strategy,
                # Sorted so the metadata is reproducible across runs; plain
                # set iteration order depends on string hash randomization.
                "element_types": sorted({info["type"] for info in element_info}),
            }

            return LoaderResult(
                content=full_content,
                metadata=metadata,
                content_type=content_type,
                chunks=text_parts,  # Pre-chunked by elements
                source_info={
                    "file_path": file_path,
                    "strategy": strategy,
                    # Only the first few previews, for debugging.
                    "elements": element_info[: self._PREVIEW_ELEMENTS],
                    "total_elements": len(elements),
                },
            )
        except Exception as e:
            self.logger.error(f"Failed to process document {file_path}: {e}")
            raise Exception(f"Document processing failed: {e}") from e