# cognee/infrastructure/loaders/models/LoaderResult.py

from pydantic import BaseModel
from typing import Optional, Dict, Any, List
from enum import Enum


class ContentType(Enum):
    """
    Classification of the content produced by a file loader.

    Each member's value is the lowercase string form of its name; that
    string is what gets emitted when a member is serialized via ``.value``.
    """

    # Member order is part of the enum's iteration order; keep it stable.
    TEXT = "text"
    STRUCTURED = "structured"
    BINARY = "binary"


class LoaderResult(BaseModel):
    """
    Standardized output format for all file loaders.

    This model ensures consistent data structure across all loader implementations,
    following cognee's pattern of using Pydantic models for data validation.

    Note:
        Because ``Config.use_enum_values`` is enabled, Pydantic stores
        ``content_type`` as the enum member's plain string value once the
        field has been validated. Code reading the attribute must therefore
        be prepared for either a ``ContentType`` member or its ``str`` value.
    """

    content: str  # Primary text content extracted from file
    metadata: Dict[str, Any]  # File metadata (name, size, type, loader info, etc.)
    content_type: ContentType  # Content classification
    chunks: Optional[List[str]] = None  # Pre-chunked content if available
    source_info: Optional[Dict[str, Any]] = None  # Source-specific information

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the loader result to a dictionary format.

        Returns:
            Dict containing all loader result data with string-serialized content_type.
            ``source_info`` is replaced by an empty dict when it is unset or empty;
            ``chunks`` is passed through unchanged (``None`` when not provided).
        """
        # With use_enum_values the validated field is already a plain string,
        # so calling ``.value`` unconditionally raises AttributeError. Only
        # unwrap when an actual enum member is present (e.g. an instance
        # built without validation).
        content_type = self.content_type
        if isinstance(content_type, ContentType):
            content_type = content_type.value

        return {
            "content": self.content,
            "metadata": self.metadata,
            "content_type": content_type,
            "source_info": self.source_info or {},
            "chunks": self.chunks,
        }

    class Config:
        """Pydantic configuration following cognee patterns"""

        # Store enum fields as their underlying values rather than members.
        use_enum_values = True
        # Re-run field validation whenever an attribute is assigned.
        validate_assignment = True