added loader separation
This commit is contained in:
parent
bd892652ad
commit
98882ba1d1
40 changed files with 3065 additions and 89 deletions
|
|
@ -15,14 +15,19 @@ async def add(
|
||||||
vector_db_config: dict = None,
|
vector_db_config: dict = None,
|
||||||
graph_db_config: dict = None,
|
graph_db_config: dict = None,
|
||||||
dataset_id: UUID = None,
|
dataset_id: UUID = None,
|
||||||
|
preferred_loaders: Optional[List[str]] = None,
|
||||||
|
loader_config: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Add data to Cognee for knowledge graph processing.
|
Add data to Cognee for knowledge graph processing using a plugin-based loader system.
|
||||||
|
|
||||||
This is the first step in the Cognee workflow - it ingests raw data and prepares it
|
This is the first step in the Cognee workflow - it ingests raw data and prepares it
|
||||||
for processing. The function accepts various data formats including text, files, and
|
for processing. The function accepts various data formats including text, files, and
|
||||||
binary streams, then stores them in a specified dataset for further processing.
|
binary streams, then stores them in a specified dataset for further processing.
|
||||||
|
|
||||||
|
This version supports both the original ingestion system (for backward compatibility)
|
||||||
|
and the new plugin-based loader system (when loader parameters are provided).
|
||||||
|
|
||||||
Prerequisites:
|
Prerequisites:
|
||||||
- **LLM_API_KEY**: Must be set in environment variables for content processing
|
- **LLM_API_KEY**: Must be set in environment variables for content processing
|
||||||
- **Database Setup**: Relational and vector databases must be configured
|
- **Database Setup**: Relational and vector databases must be configured
|
||||||
|
|
@ -38,16 +43,38 @@ async def add(
|
||||||
- **Lists**: Multiple files or text strings in a single call
|
- **Lists**: Multiple files or text strings in a single call
|
||||||
|
|
||||||
Supported File Formats:
|
Supported File Formats:
|
||||||
- Text files (.txt, .md, .csv)
|
- Text files (.txt, .md, .csv) - processed by text_loader
|
||||||
- PDFs (.pdf)
|
- PDFs (.pdf) - processed by pypdf_loader (if available)
|
||||||
- Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
|
- Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
|
||||||
- Audio files (.mp3, .wav) - transcribed to text
|
- Audio files (.mp3, .wav) - transcribed to text
|
||||||
- Code files (.py, .js, .ts, etc.) - parsed for structure and content
|
- Code files (.py, .js, .ts, etc.) - parsed for structure and content
|
||||||
- Office documents (.docx, .pptx)
|
- Office documents (.docx, .pptx) - processed by unstructured_loader (if available)
|
||||||
|
- Data files (.json, .jsonl, .parquet) - processed by dlt_loader (if available)
|
||||||
|
|
||||||
Workflow:
|
Plugin System:
|
||||||
|
The function automatically uses the best available loader for each file type.
|
||||||
|
You can customize this behavior using the loader parameters:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Use specific loaders in priority order
|
||||||
|
await cognee.add(
|
||||||
|
"/path/to/document.pdf",
|
||||||
|
preferred_loaders=["pypdf_loader", "text_loader"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configure loader-specific options
|
||||||
|
await cognee.add(
|
||||||
|
"/path/to/document.pdf",
|
||||||
|
loader_config={
|
||||||
|
"pypdf_loader": {"strict": False},
|
||||||
|
"unstructured_loader": {"strategy": "hi_res"}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Workflow:
|
||||||
1. **Data Resolution**: Resolves file paths and validates accessibility
|
1. **Data Resolution**: Resolves file paths and validates accessibility
|
||||||
2. **Content Extraction**: Extracts text content from various file formats
|
2. **Content Extraction**: Uses plugin system or falls back to existing classification
|
||||||
3. **Dataset Storage**: Stores processed content in the specified dataset
|
3. **Dataset Storage**: Stores processed content in the specified dataset
|
||||||
4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
|
4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
|
||||||
5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset
|
5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset
|
||||||
|
|
@ -70,6 +97,10 @@ async def add(
|
||||||
vector_db_config: Optional configuration for vector database (for custom setups).
|
vector_db_config: Optional configuration for vector database (for custom setups).
|
||||||
graph_db_config: Optional configuration for graph database (for custom setups).
|
graph_db_config: Optional configuration for graph database (for custom setups).
|
||||||
dataset_id: Optional specific dataset UUID to use instead of dataset_name.
|
dataset_id: Optional specific dataset UUID to use instead of dataset_name.
|
||||||
|
preferred_loaders: Optional list of loader names to try first (e.g., ["pypdf_loader", "text_loader"]).
|
||||||
|
If not provided, uses default loader priority.
|
||||||
|
loader_config: Optional configuration for specific loaders. Dictionary mapping loader names
|
||||||
|
to their configuration options (e.g., {"pypdf_loader": {"strict": False}}).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
PipelineRunInfo: Information about the ingestion pipeline execution including:
|
PipelineRunInfo: Information about the ingestion pipeline execution including:
|
||||||
|
|
@ -138,10 +169,32 @@ async def add(
|
||||||
UnsupportedFileTypeError: If file format cannot be processed
|
UnsupportedFileTypeError: If file format cannot be processed
|
||||||
InvalidValueError: If LLM_API_KEY is not set or invalid
|
InvalidValueError: If LLM_API_KEY is not set or invalid
|
||||||
"""
|
"""
|
||||||
tasks = [
|
|
||||||
Task(resolve_data_directories, include_subdirectories=True),
|
# Determine which ingestion system to use
|
||||||
Task(ingest_data, dataset_name, user, node_set, dataset_id),
|
use_plugin_system = preferred_loaders is not None or loader_config is not None
|
||||||
]
|
|
||||||
|
if use_plugin_system:
|
||||||
|
# Use new plugin-based ingestion system
|
||||||
|
from cognee.tasks.ingestion.plugin_ingest_data import plugin_ingest_data
|
||||||
|
|
||||||
|
tasks = [
|
||||||
|
Task(resolve_data_directories, include_subdirectories=True),
|
||||||
|
Task(
|
||||||
|
plugin_ingest_data,
|
||||||
|
dataset_name,
|
||||||
|
user,
|
||||||
|
node_set,
|
||||||
|
dataset_id,
|
||||||
|
preferred_loaders,
|
||||||
|
loader_config,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
# Use existing ingestion system for backward compatibility
|
||||||
|
tasks = [
|
||||||
|
Task(resolve_data_directories, include_subdirectories=True),
|
||||||
|
Task(ingest_data, dataset_name, user, node_set, dataset_id),
|
||||||
|
]
|
||||||
|
|
||||||
pipeline_run_info = None
|
pipeline_run_info = None
|
||||||
|
|
||||||
|
|
|
||||||
237
cognee/infrastructure/loaders/LoaderEngine.py
Normal file
237
cognee/infrastructure/loaders/LoaderEngine.py
Normal file
|
|
@ -0,0 +1,237 @@
|
||||||
|
import os
|
||||||
|
import importlib.util
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from .LoaderInterface import LoaderInterface
|
||||||
|
from .models.LoaderResult import LoaderResult
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderEngine:
|
||||||
|
"""
|
||||||
|
Main loader engine for managing file loaders.
|
||||||
|
|
||||||
|
Follows cognee's adapter pattern similar to database engines,
|
||||||
|
providing a centralized system for file loading operations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
loader_directories: List[str],
|
||||||
|
default_loader_priority: List[str],
|
||||||
|
fallback_loader: str = "text_loader",
|
||||||
|
enable_dependency_validation: bool = True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize the loader engine.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
loader_directories: Directories to search for loader implementations
|
||||||
|
default_loader_priority: Priority order for loader selection
|
||||||
|
fallback_loader: Default loader to use when no other matches
|
||||||
|
enable_dependency_validation: Whether to validate loader dependencies
|
||||||
|
"""
|
||||||
|
self._loaders: Dict[str, LoaderInterface] = {}
|
||||||
|
self._extension_map: Dict[str, List[LoaderInterface]] = {}
|
||||||
|
self._mime_type_map: Dict[str, List[LoaderInterface]] = {}
|
||||||
|
self.loader_directories = loader_directories
|
||||||
|
self.default_loader_priority = default_loader_priority
|
||||||
|
self.fallback_loader = fallback_loader
|
||||||
|
self.enable_dependency_validation = enable_dependency_validation
|
||||||
|
self.logger = get_logger(__name__)
|
||||||
|
|
||||||
|
def register_loader(self, loader: LoaderInterface) -> bool:
|
||||||
|
"""
|
||||||
|
Register a loader with the engine.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
loader: LoaderInterface implementation to register
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if loader was registered successfully, False otherwise
|
||||||
|
"""
|
||||||
|
# Validate dependencies if enabled
|
||||||
|
if self.enable_dependency_validation and not loader.validate_dependencies():
|
||||||
|
self.logger.warning(
|
||||||
|
f"Skipping loader '{loader.loader_name}' - missing dependencies: "
|
||||||
|
f"{loader.get_dependencies()}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
self._loaders[loader.loader_name] = loader
|
||||||
|
|
||||||
|
# Map extensions to loaders
|
||||||
|
for ext in loader.supported_extensions:
|
||||||
|
ext_lower = ext.lower()
|
||||||
|
if ext_lower not in self._extension_map:
|
||||||
|
self._extension_map[ext_lower] = []
|
||||||
|
self._extension_map[ext_lower].append(loader)
|
||||||
|
|
||||||
|
# Map mime types to loaders
|
||||||
|
for mime_type in loader.supported_mime_types:
|
||||||
|
if mime_type not in self._mime_type_map:
|
||||||
|
self._mime_type_map[mime_type] = []
|
||||||
|
self._mime_type_map[mime_type].append(loader)
|
||||||
|
|
||||||
|
self.logger.info(f"Registered loader: {loader.loader_name}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
def get_loader(
|
||||||
|
self, file_path: str, mime_type: str = None, preferred_loaders: List[str] = None
|
||||||
|
) -> Optional[LoaderInterface]:
|
||||||
|
"""
|
||||||
|
Get appropriate loader for a file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the file to be processed
|
||||||
|
mime_type: Optional MIME type of the file
|
||||||
|
preferred_loaders: List of preferred loader names to try first
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
LoaderInterface that can handle the file, or None if not found
|
||||||
|
"""
|
||||||
|
ext = os.path.splitext(file_path)[1].lower()
|
||||||
|
|
||||||
|
# Try preferred loaders first
|
||||||
|
if preferred_loaders:
|
||||||
|
for loader_name in preferred_loaders:
|
||||||
|
if loader_name in self._loaders:
|
||||||
|
loader = self._loaders[loader_name]
|
||||||
|
if loader.can_handle(file_path, mime_type):
|
||||||
|
return loader
|
||||||
|
|
||||||
|
# Try priority order
|
||||||
|
for loader_name in self.default_loader_priority:
|
||||||
|
if loader_name in self._loaders:
|
||||||
|
loader = self._loaders[loader_name]
|
||||||
|
if loader.can_handle(file_path, mime_type):
|
||||||
|
return loader
|
||||||
|
|
||||||
|
# Try mime type mapping
|
||||||
|
if mime_type and mime_type in self._mime_type_map:
|
||||||
|
for loader in self._mime_type_map[mime_type]:
|
||||||
|
if loader.can_handle(file_path, mime_type):
|
||||||
|
return loader
|
||||||
|
|
||||||
|
# Try extension mapping
|
||||||
|
if ext in self._extension_map:
|
||||||
|
for loader in self._extension_map[ext]:
|
||||||
|
if loader.can_handle(file_path, mime_type):
|
||||||
|
return loader
|
||||||
|
|
||||||
|
# Fallback loader
|
||||||
|
if self.fallback_loader in self._loaders:
|
||||||
|
fallback = self._loaders[self.fallback_loader]
|
||||||
|
if fallback.can_handle(file_path, mime_type):
|
||||||
|
return fallback
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def load_file(
|
||||||
|
self, file_path: str, mime_type: str = None, preferred_loaders: List[str] = None, **kwargs
|
||||||
|
) -> LoaderResult:
|
||||||
|
"""
|
||||||
|
Load file using appropriate loader.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the file to be processed
|
||||||
|
mime_type: Optional MIME type of the file
|
||||||
|
preferred_loaders: List of preferred loader names to try first
|
||||||
|
**kwargs: Additional loader-specific configuration
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
LoaderResult containing processed content and metadata
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If no suitable loader is found
|
||||||
|
Exception: If file processing fails
|
||||||
|
"""
|
||||||
|
loader = self.get_loader(file_path, mime_type, preferred_loaders)
|
||||||
|
if not loader:
|
||||||
|
raise ValueError(f"No loader found for file: {file_path}")
|
||||||
|
|
||||||
|
self.logger.debug(f"Loading {file_path} with {loader.loader_name}")
|
||||||
|
return await loader.load(file_path, **kwargs)
|
||||||
|
|
||||||
|
def discover_loaders(self):
|
||||||
|
"""
|
||||||
|
Auto-discover loaders from configured directories.
|
||||||
|
|
||||||
|
Scans loader directories for Python modules containing
|
||||||
|
LoaderInterface implementations and registers them.
|
||||||
|
"""
|
||||||
|
for directory in self.loader_directories:
|
||||||
|
if os.path.exists(directory):
|
||||||
|
self._discover_in_directory(directory)
|
||||||
|
|
||||||
|
def _discover_in_directory(self, directory: str):
|
||||||
|
"""
|
||||||
|
Discover loaders in a specific directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
directory: Directory path to scan for loader implementations
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
for file_name in os.listdir(directory):
|
||||||
|
if file_name.endswith(".py") and not file_name.startswith("_"):
|
||||||
|
module_name = file_name[:-3]
|
||||||
|
file_path = os.path.join(directory, file_name)
|
||||||
|
|
||||||
|
try:
|
||||||
|
spec = importlib.util.spec_from_file_location(module_name, file_path)
|
||||||
|
if spec and spec.loader:
|
||||||
|
module = importlib.util.module_from_spec(spec)
|
||||||
|
spec.loader.exec_module(module)
|
||||||
|
|
||||||
|
# Look for loader classes
|
||||||
|
for attr_name in dir(module):
|
||||||
|
attr = getattr(module, attr_name)
|
||||||
|
if (
|
||||||
|
isinstance(attr, type)
|
||||||
|
and issubclass(attr, LoaderInterface)
|
||||||
|
and attr != LoaderInterface
|
||||||
|
):
|
||||||
|
# Instantiate and register the loader
|
||||||
|
try:
|
||||||
|
loader_instance = attr()
|
||||||
|
self.register_loader(loader_instance)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(
|
||||||
|
f"Failed to instantiate loader {attr_name}: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Failed to load module {module_name}: {e}")
|
||||||
|
|
||||||
|
except OSError as e:
|
||||||
|
self.logger.warning(f"Failed to scan directory {directory}: {e}")
|
||||||
|
|
||||||
|
def get_available_loaders(self) -> List[str]:
|
||||||
|
"""
|
||||||
|
Get list of available loader names.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of registered loader names
|
||||||
|
"""
|
||||||
|
return list(self._loaders.keys())
|
||||||
|
|
||||||
|
def get_loader_info(self, loader_name: str) -> Dict[str, any]:
|
||||||
|
"""
|
||||||
|
Get information about a specific loader.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
loader_name: Name of the loader to inspect
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary containing loader information
|
||||||
|
"""
|
||||||
|
if loader_name not in self._loaders:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
loader = self._loaders[loader_name]
|
||||||
|
return {
|
||||||
|
"name": loader.loader_name,
|
||||||
|
"extensions": loader.supported_extensions,
|
||||||
|
"mime_types": loader.supported_mime_types,
|
||||||
|
"dependencies": loader.get_dependencies(),
|
||||||
|
"available": loader.validate_dependencies(),
|
||||||
|
}
|
||||||
101
cognee/infrastructure/loaders/LoaderInterface.py
Normal file
101
cognee/infrastructure/loaders/LoaderInterface.py
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import List
|
||||||
|
from .models.LoaderResult import LoaderResult
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderInterface(ABC):
|
||||||
|
"""
|
||||||
|
Base interface for all file loaders in cognee.
|
||||||
|
|
||||||
|
This interface follows cognee's established pattern for database adapters,
|
||||||
|
ensuring consistent behavior across all loader implementations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
@abstractmethod
|
||||||
|
def supported_extensions(self) -> List[str]:
|
||||||
|
"""
|
||||||
|
List of file extensions this loader supports.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of extensions including the dot (e.g., ['.txt', '.md'])
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@property
|
||||||
|
@abstractmethod
|
||||||
|
def supported_mime_types(self) -> List[str]:
|
||||||
|
"""
|
||||||
|
List of MIME types this loader supports.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@property
|
||||||
|
@abstractmethod
|
||||||
|
def loader_name(self) -> str:
|
||||||
|
"""
|
||||||
|
Unique name identifier for this loader.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
String identifier used for registration and configuration
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def can_handle(self, file_path: str, mime_type: str = None) -> bool:
|
||||||
|
"""
|
||||||
|
Check if this loader can handle the given file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the file to be processed
|
||||||
|
mime_type: Optional MIME type of the file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if this loader can process the file, False otherwise
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def load(self, file_path: str, **kwargs) -> LoaderResult:
|
||||||
|
"""
|
||||||
|
Load and process the file, returning standardized result.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the file to be processed
|
||||||
|
**kwargs: Additional loader-specific configuration
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
LoaderResult containing processed content and metadata
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If file cannot be processed
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_dependencies(self) -> List[str]:
|
||||||
|
"""
|
||||||
|
Optional: Return list of required dependencies for this loader.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of package names with optional version specifications
|
||||||
|
"""
|
||||||
|
return []
|
||||||
|
|
||||||
|
def validate_dependencies(self) -> bool:
|
||||||
|
"""
|
||||||
|
Check if all required dependencies are available.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if all dependencies are installed, False otherwise
|
||||||
|
"""
|
||||||
|
for dep in self.get_dependencies():
|
||||||
|
# Extract package name from version specification
|
||||||
|
package_name = dep.split(">=")[0].split("==")[0].split("<")[0]
|
||||||
|
try:
|
||||||
|
__import__(package_name)
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
19
cognee/infrastructure/loaders/__init__.py
Normal file
19
cognee/infrastructure/loaders/__init__.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""
|
||||||
|
File loader infrastructure for cognee.
|
||||||
|
|
||||||
|
This package provides a plugin-based system for loading different file formats
|
||||||
|
into cognee, following the same patterns as database adapters.
|
||||||
|
|
||||||
|
Main exports:
|
||||||
|
- get_loader_engine(): Factory function to get configured loader engine
|
||||||
|
- use_loader(): Register custom loaders at runtime
|
||||||
|
- LoaderInterface: Base interface for implementing loaders
|
||||||
|
- LoaderResult, ContentType: Data models for loader results
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .get_loader_engine import get_loader_engine
|
||||||
|
from .use_loader import use_loader
|
||||||
|
from .LoaderInterface import LoaderInterface
|
||||||
|
from .models.LoaderResult import LoaderResult, ContentType
|
||||||
|
|
||||||
|
__all__ = ["get_loader_engine", "use_loader", "LoaderInterface", "LoaderResult", "ContentType"]
|
||||||
57
cognee/infrastructure/loaders/config.py
Normal file
57
cognee/infrastructure/loaders/config.py
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import List, Optional, Dict, Any
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
from cognee.root_dir import get_absolute_path
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderConfig(BaseSettings):
|
||||||
|
"""
|
||||||
|
Configuration for file loader system.
|
||||||
|
|
||||||
|
Follows cognee's pattern using pydantic_settings.BaseSettings for
|
||||||
|
environment variable support and validation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
loader_directories: List[str] = [
|
||||||
|
get_absolute_path("infrastructure/loaders/core"),
|
||||||
|
get_absolute_path("infrastructure/loaders/external"),
|
||||||
|
]
|
||||||
|
default_loader_priority: List[str] = [
|
||||||
|
"text_loader",
|
||||||
|
"pypdf_loader",
|
||||||
|
"unstructured_loader",
|
||||||
|
"dlt_loader",
|
||||||
|
]
|
||||||
|
auto_discover: bool = True
|
||||||
|
fallback_loader: str = "text_loader"
|
||||||
|
enable_dependency_validation: bool = True
|
||||||
|
|
||||||
|
model_config = SettingsConfigDict(env_file=".env", extra="allow", env_prefix="LOADER_")
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Convert configuration to dictionary format.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict containing all loader configuration settings
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
"loader_directories": self.loader_directories,
|
||||||
|
"default_loader_priority": self.default_loader_priority,
|
||||||
|
"auto_discover": self.auto_discover,
|
||||||
|
"fallback_loader": self.fallback_loader,
|
||||||
|
"enable_dependency_validation": self.enable_dependency_validation,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
|
||||||
|
def get_loader_config() -> LoaderConfig:
|
||||||
|
"""
|
||||||
|
Get cached loader configuration.
|
||||||
|
|
||||||
|
Uses LRU cache following cognee's pattern for configuration objects.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
LoaderConfig instance with current settings
|
||||||
|
"""
|
||||||
|
return LoaderConfig()
|
||||||
5
cognee/infrastructure/loaders/core/__init__.py
Normal file
5
cognee/infrastructure/loaders/core/__init__.py
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
"""Core loader implementations that are always available."""
|
||||||
|
|
||||||
|
from .text_loader import TextLoader
|
||||||
|
|
||||||
|
__all__ = ["TextLoader"]
|
||||||
128
cognee/infrastructure/loaders/core/text_loader.py
Normal file
128
cognee/infrastructure/loaders/core/text_loader.py
Normal file
|
|
@ -0,0 +1,128 @@
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||||
|
from cognee.infrastructure.loaders.models.LoaderResult import LoaderResult, ContentType
|
||||||
|
|
||||||
|
|
||||||
|
class TextLoader(LoaderInterface):
|
||||||
|
"""
|
||||||
|
Core text file loader that handles basic text file formats.
|
||||||
|
|
||||||
|
This loader is always available and serves as the fallback for
|
||||||
|
text-based files when no specialized loader is available.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supported_extensions(self) -> List[str]:
|
||||||
|
"""Supported text file extensions."""
|
||||||
|
return [".txt", ".md", ".csv", ".json", ".xml", ".yaml", ".yml", ".log"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supported_mime_types(self) -> List[str]:
|
||||||
|
"""Supported MIME types for text content."""
|
||||||
|
return [
|
||||||
|
"text/plain",
|
||||||
|
"text/markdown",
|
||||||
|
"text/csv",
|
||||||
|
"application/json",
|
||||||
|
"text/xml",
|
||||||
|
"application/xml",
|
||||||
|
"text/yaml",
|
||||||
|
"application/yaml",
|
||||||
|
]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def loader_name(self) -> str:
|
||||||
|
"""Unique identifier for this loader."""
|
||||||
|
return "text_loader"
|
||||||
|
|
||||||
|
def can_handle(self, file_path: str, mime_type: str = None) -> bool:
|
||||||
|
"""
|
||||||
|
Check if this loader can handle the given file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the file
|
||||||
|
mime_type: Optional MIME type
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if file can be handled, False otherwise
|
||||||
|
"""
|
||||||
|
# Check by extension
|
||||||
|
ext = os.path.splitext(file_path)[1].lower()
|
||||||
|
if ext in self.supported_extensions:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check by MIME type
|
||||||
|
if mime_type and mime_type in self.supported_mime_types:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# As fallback loader, can attempt to handle any text-like file
|
||||||
|
# This is useful when other loaders fail
|
||||||
|
try:
|
||||||
|
# Quick check if file appears to be text
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
sample = f.read(512)
|
||||||
|
# Simple heuristic: if most bytes are printable, consider it text
|
||||||
|
if sample:
|
||||||
|
try:
|
||||||
|
sample.decode("utf-8")
|
||||||
|
return True
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
try:
|
||||||
|
sample.decode("latin-1")
|
||||||
|
return True
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
pass
|
||||||
|
except (OSError, IOError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def load(self, file_path: str, encoding: str = "utf-8", **kwargs) -> LoaderResult:
|
||||||
|
"""
|
||||||
|
Load and process the text file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the file to load
|
||||||
|
encoding: Text encoding to use (default: utf-8)
|
||||||
|
**kwargs: Additional configuration (unused)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
LoaderResult containing the file content and metadata
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
FileNotFoundError: If file doesn't exist
|
||||||
|
UnicodeDecodeError: If file cannot be decoded with specified encoding
|
||||||
|
OSError: If file cannot be read
|
||||||
|
"""
|
||||||
|
if not os.path.exists(file_path):
|
||||||
|
raise FileNotFoundError(f"File not found: {file_path}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(file_path, "r", encoding=encoding) as f:
|
||||||
|
content = f.read()
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# Try with fallback encoding
|
||||||
|
if encoding == "utf-8":
|
||||||
|
return await self.load(file_path, encoding="latin-1", **kwargs)
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
# Extract basic metadata
|
||||||
|
file_stat = os.stat(file_path)
|
||||||
|
metadata = {
|
||||||
|
"name": os.path.basename(file_path),
|
||||||
|
"size": file_stat.st_size,
|
||||||
|
"extension": os.path.splitext(file_path)[1],
|
||||||
|
"encoding": encoding,
|
||||||
|
"loader": self.loader_name,
|
||||||
|
"lines": len(content.splitlines()) if content else 0,
|
||||||
|
"characters": len(content),
|
||||||
|
}
|
||||||
|
|
||||||
|
return LoaderResult(
|
||||||
|
content=content,
|
||||||
|
metadata=metadata,
|
||||||
|
content_type=ContentType.TEXT,
|
||||||
|
source_info={"file_path": file_path, "encoding": encoding},
|
||||||
|
)
|
||||||
49
cognee/infrastructure/loaders/create_loader_engine.py
Normal file
49
cognee/infrastructure/loaders/create_loader_engine.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
from typing import List
|
||||||
|
from .LoaderEngine import LoaderEngine
|
||||||
|
from .supported_loaders import supported_loaders
|
||||||
|
|
||||||
|
|
||||||
|
def create_loader_engine(
|
||||||
|
loader_directories: List[str],
|
||||||
|
default_loader_priority: List[str],
|
||||||
|
auto_discover: bool = True,
|
||||||
|
fallback_loader: str = "text_loader",
|
||||||
|
enable_dependency_validation: bool = True,
|
||||||
|
) -> LoaderEngine:
|
||||||
|
"""
|
||||||
|
Create loader engine with given configuration.
|
||||||
|
|
||||||
|
Follows cognee's pattern for engine creation functions used
|
||||||
|
in database adapters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
loader_directories: Directories to search for loader implementations
|
||||||
|
default_loader_priority: Priority order for loader selection
|
||||||
|
auto_discover: Whether to auto-discover loaders from directories
|
||||||
|
fallback_loader: Default loader to use when no other matches
|
||||||
|
enable_dependency_validation: Whether to validate loader dependencies
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Configured LoaderEngine instance
|
||||||
|
"""
|
||||||
|
engine = LoaderEngine(
|
||||||
|
loader_directories=loader_directories,
|
||||||
|
default_loader_priority=default_loader_priority,
|
||||||
|
fallback_loader=fallback_loader,
|
||||||
|
enable_dependency_validation=enable_dependency_validation,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Register supported loaders from registry
|
||||||
|
for loader_name, loader_class in supported_loaders.items():
|
||||||
|
try:
|
||||||
|
loader_instance = loader_class()
|
||||||
|
engine.register_loader(loader_instance)
|
||||||
|
except Exception as e:
|
||||||
|
# Log but don't fail - allow engine to continue with other loaders
|
||||||
|
engine.logger.warning(f"Failed to register loader {loader_name}: {e}")
|
||||||
|
|
||||||
|
# Auto-discover loaders if enabled
|
||||||
|
if auto_discover:
|
||||||
|
engine.discover_loaders()
|
||||||
|
|
||||||
|
return engine
|
||||||
34
cognee/infrastructure/loaders/external/__init__.py
vendored
Normal file
34
cognee/infrastructure/loaders/external/__init__.py
vendored
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
"""
|
||||||
|
External loader implementations for cognee.
|
||||||
|
|
||||||
|
This module contains loaders that depend on external libraries:
|
||||||
|
- pypdf_loader: PDF processing using pypdf
|
||||||
|
- unstructured_loader: Document processing using unstructured
|
||||||
|
- dlt_loader: Data lake/warehouse integration using DLT
|
||||||
|
|
||||||
|
These loaders are optional and only available if their dependencies are installed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__all__ = []
|
||||||
|
|
||||||
|
# Conditional imports based on dependency availability
|
||||||
|
try:
|
||||||
|
from .pypdf_loader import PyPdfLoader
|
||||||
|
|
||||||
|
__all__.append("PyPdfLoader")
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .unstructured_loader import UnstructuredLoader
|
||||||
|
|
||||||
|
__all__.append("UnstructuredLoader")
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .dlt_loader import DltLoader
|
||||||
|
|
||||||
|
__all__.append("DltLoader")
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
203
cognee/infrastructure/loaders/external/dlt_loader.py
vendored
Normal file
203
cognee/infrastructure/loaders/external/dlt_loader.py
vendored
Normal file
|
|
@ -0,0 +1,203 @@
|
||||||
|
import os
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||||
|
from cognee.infrastructure.loaders.models.LoaderResult import LoaderResult, ContentType
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
class DltLoader(LoaderInterface):
    """
    Data loader using DLT (Data Load Tool) for various data sources.

    Supports loading data from REST APIs, databases, cloud storage,
    and other data sources through DLT pipelines.
    """

    def __init__(self):
        # Module-scoped logger, following cognee's logging convention.
        self.logger = get_logger(__name__)

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions this loader accepts."""
        return [
            ".dlt",  # DLT pipeline configuration
            ".json",  # JSON data
            ".jsonl",  # JSON Lines
            ".csv",  # CSV data
            ".parquet",  # Parquet files
            ".yaml",  # YAML configuration
            ".yml",  # YAML configuration
        ]

    @property
    def supported_mime_types(self) -> List[str]:
        """MIME types this loader accepts."""
        return [
            "application/json",
            "application/x-ndjson",  # JSON Lines
            "text/csv",
            "application/x-parquet",
            "application/yaml",
            "text/yaml",
        ]

    @property
    def loader_name(self) -> str:
        """Unique registry name for this loader."""
        return "dlt_loader"

    def get_dependencies(self) -> List[str]:
        """Pip requirement strings needed at runtime."""
        return ["dlt>=0.4.0"]

    def can_handle(self, file_path: str, mime_type: str = None) -> bool:
        """Check if file can be handled by this loader."""
        # Check file extension
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.supported_extensions:
            return False

        # Check MIME type if provided
        if mime_type and mime_type not in self.supported_mime_types:
            return False

        # Validate dependencies
        return self.validate_dependencies()

    async def load(self, file_path: str, source_type: str = "auto", **kwargs) -> LoaderResult:
        """
        Load data using DLT pipeline.

        Args:
            file_path: Path to the data file or DLT configuration
            source_type: Type of data source ("auto", "json", "csv", "parquet", "api")
            **kwargs: Additional DLT pipeline configuration

        Returns:
            LoaderResult with loaded data and metadata

        Raises:
            ImportError: If DLT is not installed
            Exception: If data loading fails
        """
        try:
            import dlt
        except ImportError as e:
            raise ImportError(
                "dlt is required for data loading. Install with: pip install dlt"
            ) from e

        try:
            self.logger.info(f"Loading data with DLT: {file_path}")

            file_ext = os.path.splitext(file_path)[1].lower()
            file_name = os.path.basename(file_path)
            file_size = os.path.getsize(file_path)

            # Determine source type if auto
            if source_type == "auto":
                if file_ext == ".json":
                    source_type = "json"
                elif file_ext == ".jsonl":
                    source_type = "jsonl"
                elif file_ext == ".csv":
                    source_type = "csv"
                elif file_ext == ".parquet":
                    source_type = "parquet"
                elif file_ext in [".yaml", ".yml"]:
                    source_type = "yaml"
                else:
                    source_type = "file"

            # Load data based on source type
            if source_type == "json":
                content = self._load_json(file_path)
            elif source_type == "jsonl":
                content = self._load_jsonl(file_path)
            elif source_type == "csv":
                content = self._load_csv(file_path)
            elif source_type == "parquet":
                content = self._load_parquet(file_path)
            elif source_type == "yaml":
                content = self._load_yaml(file_path)
            else:
                # Default: read as text
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()

            # Determine content type: parsed JSON/JSONL yields dict/list,
            # everything else is already a plain string.
            if isinstance(content, (dict, list)):
                content_type = ContentType.STRUCTURED
                text_content = str(content)
            else:
                content_type = ContentType.TEXT
                text_content = content

            # Gather metadata
            metadata = {
                "name": file_name,
                "size": file_size,
                "extension": file_ext,
                "loader": self.loader_name,
                "source_type": source_type,
                "dlt_version": dlt.__version__,
            }

            # Add data-specific metadata
            if isinstance(content, list):
                metadata["records_count"] = len(content)
            elif isinstance(content, dict):
                metadata["keys_count"] = len(content)

            return LoaderResult(
                content=text_content,
                metadata=metadata,
                content_type=content_type,
                chunks=[text_content],  # Single chunk for now
                source_info={
                    "file_path": file_path,
                    "source_type": source_type,
                    # Preserve structured data for downstream consumers.
                    "raw_data": content if isinstance(content, (dict, list)) else None,
                },
            )

        except Exception as e:
            self.logger.error(f"Failed to load data with DLT from {file_path}: {e}")
            raise Exception(f"DLT data loading failed: {e}") from e

    def _load_json(self, file_path: str) -> Dict[str, Any]:
        """Load JSON file."""
        import json

        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _load_jsonl(self, file_path: str) -> List[Dict[str, Any]]:
        """Load JSON Lines file (one JSON object per non-empty line)."""
        import json

        data = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    data.append(json.loads(line))
        return data

    def _load_csv(self, file_path: str) -> str:
        """Load CSV file as text."""
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    def _load_parquet(self, file_path: str) -> str:
        """Load Parquet file (requires pandas)."""
        try:
            import pandas as pd

            df = pd.read_parquet(file_path)
            return df.to_string()
        except ImportError:
            # Fallback: report basic file info as a placeholder string.
            # Use os.path.getsize instead of reading the whole file into
            # memory just to measure its length.
            return (
                f"<Parquet file: {os.path.basename(file_path)}, "
                f"size: {os.path.getsize(file_path)} bytes>"
            )

    def _load_yaml(self, file_path: str) -> str:
        """Load YAML file as text."""
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
|
||||||
127
cognee/infrastructure/loaders/external/pypdf_loader.py
vendored
Normal file
127
cognee/infrastructure/loaders/external/pypdf_loader.py
vendored
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||||
|
from cognee.infrastructure.loaders.models.LoaderResult import LoaderResult, ContentType
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
class PyPdfLoader(LoaderInterface):
    """
    PDF loader using pypdf library.

    Extracts text content from PDF files page by page, providing
    structured page information and handling PDF-specific errors.
    """

    def __init__(self):
        self.logger = get_logger(__name__)

    @property
    def supported_extensions(self) -> List[str]:
        """Only PDF files are handled by this loader."""
        return [".pdf"]

    @property
    def supported_mime_types(self) -> List[str]:
        """The single MIME type associated with PDF documents."""
        return ["application/pdf"]

    @property
    def loader_name(self) -> str:
        """Unique registry name for this loader."""
        return "pypdf_loader"

    def get_dependencies(self) -> List[str]:
        """Pip requirement strings needed at runtime."""
        return ["pypdf>=4.0.0"]

    def can_handle(self, file_path: str, mime_type: str = None) -> bool:
        """Check if file can be handled by this loader."""
        # Extension must be .pdf (case-insensitive).
        if not file_path.lower().endswith(".pdf"):
            return False

        # A supplied MIME type must agree as well.
        if mime_type and mime_type != "application/pdf":
            return False

        # Finally, the pypdf dependency must be installed.
        return self.validate_dependencies()

    async def load(self, file_path: str, strict: bool = False, **kwargs) -> LoaderResult:
        """
        Load PDF file and extract text content.

        Args:
            file_path: Path to the PDF file
            strict: Whether to use strict mode for PDF reading
            **kwargs: Additional arguments

        Returns:
            LoaderResult with extracted text content and metadata

        Raises:
            ImportError: If pypdf is not installed
            Exception: If PDF processing fails
        """
        try:
            from pypdf import PdfReader
        except ImportError as e:
            raise ImportError(
                "pypdf is required for PDF processing. Install with: pip install pypdf"
            ) from e

        try:
            with open(file_path, "rb") as file:
                self.logger.info(f"Reading PDF: {file_path}")
                reader = PdfReader(file, strict=strict)

                rendered_pages = []
                extracted_texts = []

                # Walk pages in order; a failure on one page is logged and
                # skipped rather than aborting the whole document.
                for page_num, page in enumerate(reader.pages, 1):
                    try:
                        page_text = page.extract_text()
                    except Exception as e:
                        self.logger.warning(f"Failed to extract text from page {page_num}: {e}")
                        continue
                    if page_text.strip():  # Only keep non-empty pages
                        extracted_texts.append(page_text)
                        rendered_pages.append(f"Page {page_num}:\n{page_text}\n")

                full_content = "\n".join(rendered_pages)

                metadata = {
                    "name": os.path.basename(file_path),
                    "size": os.path.getsize(file_path),
                    "extension": ".pdf",
                    "pages": len(reader.pages),
                    "pages_with_text": len(extracted_texts),
                    "loader": self.loader_name,
                }

                # Surface the document's own metadata when the PDF carries any.
                if reader.metadata:
                    metadata["pdf_metadata"] = {
                        "title": reader.metadata.get("/Title", ""),
                        "author": reader.metadata.get("/Author", ""),
                        "subject": reader.metadata.get("/Subject", ""),
                        "creator": reader.metadata.get("/Creator", ""),
                        "producer": reader.metadata.get("/Producer", ""),
                        "creation_date": str(reader.metadata.get("/CreationDate", "")),
                        "modification_date": str(reader.metadata.get("/ModDate", "")),
                    }

                return LoaderResult(
                    content=full_content,
                    metadata=metadata,
                    content_type=ContentType.TEXT,
                    chunks=extracted_texts,  # Pre-chunked by page
                    source_info={
                        "file_path": file_path,
                        "pages": len(reader.pages),
                        "strict_mode": strict,
                    },
                )

        except Exception as e:
            self.logger.error(f"Failed to process PDF {file_path}: {e}")
            raise Exception(f"PDF processing failed: {e}") from e
|
||||||
169
cognee/infrastructure/loaders/external/unstructured_loader.py
vendored
Normal file
169
cognee/infrastructure/loaders/external/unstructured_loader.py
vendored
Normal file
|
|
@ -0,0 +1,169 @@
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||||
|
from cognee.infrastructure.loaders.models.LoaderResult import LoaderResult, ContentType
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredLoader(LoaderInterface):
    """
    Document loader using the unstructured library.

    Handles various document formats including docx, pptx, xlsx, odt, etc.
    Uses the unstructured library's auto-partition functionality.
    """

    def __init__(self):
        self.logger = get_logger(__name__)

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions this loader accepts."""
        return [
            ".docx",
            ".doc",
            ".odt",  # Word documents
            ".xlsx",
            ".xls",
            ".ods",  # Spreadsheets
            ".pptx",
            ".ppt",
            ".odp",  # Presentations
            ".rtf",
            ".html",
            ".htm",  # Rich text and HTML
            ".eml",
            ".msg",  # Email formats
            ".epub",  # eBooks
        ]

    @property
    def supported_mime_types(self) -> List[str]:
        """MIME types this loader accepts."""
        return [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # docx
            "application/msword",  # doc
            "application/vnd.oasis.opendocument.text",  # odt
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # xlsx
            "application/vnd.ms-excel",  # xls
            "application/vnd.oasis.opendocument.spreadsheet",  # ods
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # pptx
            "application/vnd.ms-powerpoint",  # ppt
            "application/vnd.oasis.opendocument.presentation",  # odp
            "application/rtf",  # rtf
            "text/html",  # html
            "message/rfc822",  # eml
            "application/epub+zip",  # epub
        ]

    @property
    def loader_name(self) -> str:
        """Unique registry name for this loader."""
        return "unstructured_loader"

    def get_dependencies(self) -> List[str]:
        """Pip requirement strings needed at runtime."""
        return ["unstructured>=0.10.0"]

    def can_handle(self, file_path: str, mime_type: str = None) -> bool:
        """Check if file can be handled by this loader."""
        # Check file extension
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.supported_extensions:
            return False

        # Check MIME type if provided
        if mime_type and mime_type not in self.supported_mime_types:
            return False

        # Validate dependencies
        return self.validate_dependencies()

    async def load(self, file_path: str, strategy: str = "auto", **kwargs) -> LoaderResult:
        """
        Load document using unstructured library.

        Args:
            file_path: Path to the document file
            strategy: Partitioning strategy ("auto", "fast", "hi_res", "ocr_only")
            **kwargs: Additional arguments passed to unstructured partition

        Returns:
            LoaderResult with extracted text content and metadata

        Raises:
            ImportError: If unstructured is not installed
            Exception: If document processing fails
        """
        try:
            from unstructured.partition.auto import partition
        except ImportError as e:
            raise ImportError(
                "unstructured is required for document processing. "
                "Install with: pip install unstructured"
            ) from e

        try:
            self.logger.info(f"Processing document: {file_path}")

            # Determine content type from file extension
            file_ext = os.path.splitext(file_path)[1].lower()

            # Get file size and basic info
            file_size = os.path.getsize(file_path)
            file_name = os.path.basename(file_path)

            # Set partitioning parameters
            partition_kwargs = {"filename": file_path, "strategy": strategy, **kwargs}

            # Use partition to extract elements
            elements = partition(**partition_kwargs)

            # Process elements into text content
            text_parts = []
            element_info = []

            for element in elements:
                element_text = str(element).strip()
                if element_text:
                    text_parts.append(element_text)
                    element_info.append(
                        {
                            "type": type(element).__name__,
                            # Truncate long element text for debugging info only.
                            "text": element_text[:100] + "..."
                            if len(element_text) > 100
                            else element_text,
                        }
                    )

            # Combine all text content
            full_content = "\n\n".join(text_parts)

            # Determine content type based on structure
            content_type = ContentType.STRUCTURED if len(element_info) > 1 else ContentType.TEXT

            # Gather metadata
            metadata = {
                "name": file_name,
                "size": file_size,
                "extension": file_ext,
                "loader": self.loader_name,
                "elements_count": len(elements),
                "text_elements_count": len(text_parts),
                "strategy": strategy,
                "element_types": list(set(info["type"] for info in element_info)),
            }

            return LoaderResult(
                content=full_content,
                metadata=metadata,
                content_type=content_type,
                chunks=text_parts,  # Pre-chunked by elements
                source_info={
                    "file_path": file_path,
                    "strategy": strategy,
                    "elements": element_info[:10],  # First 10 elements for debugging
                    "total_elements": len(elements),
                },
            )

        except Exception as e:
            self.logger.error(f"Failed to process document {file_path}: {e}")
            raise Exception(f"Document processing failed: {e}") from e
|
||||||
20
cognee/infrastructure/loaders/get_loader_engine.py
Normal file
20
cognee/infrastructure/loaders/get_loader_engine.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
from functools import lru_cache
|
||||||
|
from .config import get_loader_config
|
||||||
|
from .LoaderEngine import LoaderEngine
|
||||||
|
from .create_loader_engine import create_loader_engine
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
def get_loader_engine() -> LoaderEngine:
    """
    Factory function to get loader engine.

    Follows cognee's pattern with @lru_cache for efficient reuse
    of engine instances. Configuration is loaded from environment
    variables and settings.

    Returns:
        Cached LoaderEngine instance configured with current settings
    """
    loader_config = get_loader_config()
    # Expand the config model into keyword arguments for the factory.
    return create_loader_engine(**loader_config.to_dict())
|
||||||
47
cognee/infrastructure/loaders/models/LoaderResult.py
Normal file
47
cognee/infrastructure/loaders/models/LoaderResult.py
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Optional, Dict, Any, List
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class ContentType(Enum):
    """Content type classification for loaded files"""

    TEXT = "text"  # plain extracted text
    STRUCTURED = "structured"  # parsed structured data (dict/list origin)
    BINARY = "binary"  # opaque binary content
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderResult(BaseModel):
    """
    Standardized output format for all file loaders.

    This model ensures consistent data structure across all loader implementations,
    following cognee's pattern of using Pydantic models for data validation.
    """

    content: str  # Primary text content extracted from file
    metadata: Dict[str, Any]  # File metadata (name, size, type, loader info, etc.)
    content_type: ContentType  # Content classification
    chunks: Optional[List[str]] = None  # Pre-chunked content if available
    source_info: Optional[Dict[str, Any]] = None  # Source-specific information

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the loader result to a dictionary format.

        Returns:
            Dict containing all loader result data with string-serialized content_type
        """
        # With Config.use_enum_values = True, pydantic stores content_type as
        # its string value, so accessing `.value` on it would raise
        # AttributeError. Handle both the enum and the plain-string form.
        content_type = self.content_type
        if isinstance(content_type, ContentType):
            content_type = content_type.value
        return {
            "content": self.content,
            "metadata": self.metadata,
            "content_type": content_type,
            "source_info": self.source_info or {},
            "chunks": self.chunks,
        }

    class Config:
        """Pydantic configuration following cognee patterns"""

        # Store enum fields by value so serialized output is plain strings.
        use_enum_values = True
        # Re-validate fields on assignment, not only at construction.
        validate_assignment = True
|
||||||
3
cognee/infrastructure/loaders/models/__init__.py
Normal file
3
cognee/infrastructure/loaders/models/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
from .LoaderResult import ContentType, LoaderResult

# Public API of the loader models package.
__all__ = ["LoaderResult", "ContentType"]
|
||||||
3
cognee/infrastructure/loaders/supported_loaders.py
Normal file
3
cognee/infrastructure/loaders/supported_loaders.py
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
# Registry mapping loader names to loader implementations, populated at
# runtime via use_loader(). Follows cognee's pattern used in
# databases/vector/supported_databases.py.
supported_loaders = {}
|
||||||
22
cognee/infrastructure/loaders/use_loader.py
Normal file
22
cognee/infrastructure/loaders/use_loader.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
from .supported_loaders import supported_loaders
|
||||||
|
|
||||||
|
|
||||||
|
def use_loader(loader_name: str, loader_class):
    """
    Register a loader at runtime.

    Follows cognee's pattern used in databases for adapter registration.
    This allows external packages and custom loaders to be registered
    into the loader system.

    Args:
        loader_name: Unique name for the loader
        loader_class: Loader class implementing LoaderInterface

    Example:
        from cognee.infrastructure.loaders import use_loader
        from my_package import MyCustomLoader

        use_loader("my_custom_loader", MyCustomLoader)
    """
    # Last registration wins: an existing entry under the same name is
    # silently replaced.
    supported_loaders[loader_name] = loader_class
|
||||||
|
|
@ -25,7 +25,7 @@ def _process_ontology_nodes(
|
||||||
ontology_nodes: list,
|
ontology_nodes: list,
|
||||||
data_chunk: DocumentChunk,
|
data_chunk: DocumentChunk,
|
||||||
added_nodes_map: dict,
|
added_nodes_map: dict,
|
||||||
added_ontology_nodes_map: dict
|
added_ontology_nodes_map: dict,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process and store ontology nodes"""
|
"""Process and store ontology nodes"""
|
||||||
for ontology_node in ontology_nodes:
|
for ontology_node in ontology_nodes:
|
||||||
|
|
@ -55,9 +55,7 @@ def _process_ontology_nodes(
|
||||||
|
|
||||||
|
|
||||||
def _process_ontology_edges(
|
def _process_ontology_edges(
|
||||||
ontology_edges: list,
|
ontology_edges: list, existing_edges_map: dict, ontology_relationships: list
|
||||||
existing_edges_map: dict,
|
|
||||||
ontology_relationships: list
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process ontology edges and add them if new"""
|
"""Process ontology edges and add them if new"""
|
||||||
for source, relation, target in ontology_edges:
|
for source, relation, target in ontology_edges:
|
||||||
|
|
@ -92,7 +90,7 @@ def _create_type_node(
|
||||||
key_mapping: dict,
|
key_mapping: dict,
|
||||||
data_chunk: DocumentChunk,
|
data_chunk: DocumentChunk,
|
||||||
existing_edges_map: dict,
|
existing_edges_map: dict,
|
||||||
ontology_relationships: list
|
ontology_relationships: list,
|
||||||
) -> EntityType:
|
) -> EntityType:
|
||||||
"""Create or retrieve a type node with ontology validation"""
|
"""Create or retrieve a type node with ontology validation"""
|
||||||
node_id = generate_node_id(node_type)
|
node_id = generate_node_id(node_type)
|
||||||
|
|
@ -100,7 +98,9 @@ def _create_type_node(
|
||||||
type_node_key = _create_node_key(node_id, "type")
|
type_node_key = _create_node_key(node_id, "type")
|
||||||
|
|
||||||
if type_node_key in added_nodes_map or type_node_key in key_mapping:
|
if type_node_key in added_nodes_map or type_node_key in key_mapping:
|
||||||
return added_nodes_map.get(type_node_key) or added_nodes_map.get(key_mapping.get(type_node_key))
|
return added_nodes_map.get(type_node_key) or added_nodes_map.get(
|
||||||
|
key_mapping.get(type_node_key)
|
||||||
|
)
|
||||||
|
|
||||||
# Get ontology validation
|
# Get ontology validation
|
||||||
ontology_nodes, ontology_edges, closest_class = ontology_resolver.get_subgraph(
|
ontology_nodes, ontology_edges, closest_class = ontology_resolver.get_subgraph(
|
||||||
|
|
@ -148,7 +148,7 @@ def _create_entity_node(
|
||||||
key_mapping: dict,
|
key_mapping: dict,
|
||||||
data_chunk: DocumentChunk,
|
data_chunk: DocumentChunk,
|
||||||
existing_edges_map: dict,
|
existing_edges_map: dict,
|
||||||
ontology_relationships: list
|
ontology_relationships: list,
|
||||||
) -> Entity:
|
) -> Entity:
|
||||||
"""Create or retrieve an entity node with ontology validation"""
|
"""Create or retrieve an entity node with ontology validation"""
|
||||||
generated_node_id = generate_node_id(node_id)
|
generated_node_id = generate_node_id(node_id)
|
||||||
|
|
@ -156,7 +156,9 @@ def _create_entity_node(
|
||||||
entity_node_key = _create_node_key(generated_node_id, "entity")
|
entity_node_key = _create_node_key(generated_node_id, "entity")
|
||||||
|
|
||||||
if entity_node_key in added_nodes_map or entity_node_key in key_mapping:
|
if entity_node_key in added_nodes_map or entity_node_key in key_mapping:
|
||||||
return added_nodes_map.get(entity_node_key) or added_nodes_map.get(key_mapping.get(entity_node_key))
|
return added_nodes_map.get(entity_node_key) or added_nodes_map.get(
|
||||||
|
key_mapping.get(entity_node_key)
|
||||||
|
)
|
||||||
|
|
||||||
# Get ontology validation
|
# Get ontology validation
|
||||||
ontology_nodes, ontology_edges, start_ent_ont = ontology_resolver.get_subgraph(
|
ontology_nodes, ontology_edges, start_ent_ont = ontology_resolver.get_subgraph(
|
||||||
|
|
@ -202,21 +204,37 @@ def _process_graph_nodes(
|
||||||
name_mapping: dict,
|
name_mapping: dict,
|
||||||
key_mapping: dict,
|
key_mapping: dict,
|
||||||
existing_edges_map: dict,
|
existing_edges_map: dict,
|
||||||
ontology_relationships: list
|
ontology_relationships: list,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process nodes in a knowledge graph"""
|
"""Process nodes in a knowledge graph"""
|
||||||
for node in graph.nodes:
|
for node in graph.nodes:
|
||||||
# Create type node
|
# Create type node
|
||||||
type_node = _create_type_node(
|
type_node = _create_type_node(
|
||||||
node.type, ontology_resolver, added_nodes_map, added_ontology_nodes_map,
|
node.type,
|
||||||
name_mapping, key_mapping, data_chunk, existing_edges_map, ontology_relationships
|
ontology_resolver,
|
||||||
|
added_nodes_map,
|
||||||
|
added_ontology_nodes_map,
|
||||||
|
name_mapping,
|
||||||
|
key_mapping,
|
||||||
|
data_chunk,
|
||||||
|
existing_edges_map,
|
||||||
|
ontology_relationships,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create entity node
|
# Create entity node
|
||||||
entity_node = _create_entity_node(
|
entity_node = _create_entity_node(
|
||||||
node.id, node.name, node.description, type_node, ontology_resolver,
|
node.id,
|
||||||
added_nodes_map, added_ontology_nodes_map, name_mapping, key_mapping,
|
node.name,
|
||||||
data_chunk, existing_edges_map, ontology_relationships
|
node.description,
|
||||||
|
type_node,
|
||||||
|
ontology_resolver,
|
||||||
|
added_nodes_map,
|
||||||
|
added_ontology_nodes_map,
|
||||||
|
name_mapping,
|
||||||
|
key_mapping,
|
||||||
|
data_chunk,
|
||||||
|
existing_edges_map,
|
||||||
|
ontology_relationships,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add entity to data chunk
|
# Add entity to data chunk
|
||||||
|
|
@ -226,10 +244,7 @@ def _process_graph_nodes(
|
||||||
|
|
||||||
|
|
||||||
def _process_graph_edges(
|
def _process_graph_edges(
|
||||||
graph: KnowledgeGraph,
|
graph: KnowledgeGraph, name_mapping: dict, existing_edges_map: dict, relationships: list
|
||||||
name_mapping: dict,
|
|
||||||
existing_edges_map: dict,
|
|
||||||
relationships: list
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process edges in a knowledge graph"""
|
"""Process edges in a knowledge graph"""
|
||||||
for edge in graph.edges:
|
for edge in graph.edges:
|
||||||
|
|
@ -288,8 +303,15 @@ def expand_with_nodes_and_edges(
|
||||||
|
|
||||||
# Process nodes first
|
# Process nodes first
|
||||||
_process_graph_nodes(
|
_process_graph_nodes(
|
||||||
data_chunk, graph, ontology_resolver, added_nodes_map, added_ontology_nodes_map,
|
data_chunk,
|
||||||
name_mapping, key_mapping, existing_edges_map, ontology_relationships
|
graph,
|
||||||
|
ontology_resolver,
|
||||||
|
added_nodes_map,
|
||||||
|
added_ontology_nodes_map,
|
||||||
|
name_mapping,
|
||||||
|
key_mapping,
|
||||||
|
existing_edges_map,
|
||||||
|
ontology_relationships,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Then process edges
|
# Then process edges
|
||||||
|
|
|
||||||
11
cognee/tasks/ingestion/adapters/__init__.py
Normal file
11
cognee/tasks/ingestion/adapters/__init__.py
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
"""
|
||||||
|
Adapters for bridging the new loader system with existing ingestion pipeline.
|
||||||
|
|
||||||
|
This module provides compatibility layers to integrate the plugin-based loader
|
||||||
|
system with cognee's existing data processing pipeline while maintaining
|
||||||
|
backward compatibility and preserving permission logic.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .loader_to_ingestion_adapter import LoaderToIngestionAdapter
|
||||||
|
|
||||||
|
__all__ = ["LoaderToIngestionAdapter"]
|
||||||
240
cognee/tasks/ingestion/adapters/loader_to_ingestion_adapter.py
Normal file
240
cognee/tasks/ingestion/adapters/loader_to_ingestion_adapter.py
Normal file
|
|
@ -0,0 +1,240 @@
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from typing import BinaryIO, Union, Optional, Any
|
||||||
|
from io import StringIO, BytesIO
|
||||||
|
|
||||||
|
from cognee.infrastructure.loaders.models.LoaderResult import LoaderResult, ContentType
|
||||||
|
from cognee.modules.ingestion.data_types import IngestionData, TextData, BinaryData
|
||||||
|
from cognee.infrastructure.files import get_file_metadata
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderResultToIngestionData(IngestionData):
    """
    Adapter that exposes a LoaderResult through the IngestionData interface.

    This keeps the existing cognee ingestion pipeline (which consumes
    IngestionData) working while content is produced by the new
    plugin-based loader system.
    """

    # Extension -> MIME type fallback table. Shared by all instances so it
    # is not rebuilt on every get_metadata() call.
    _MIME_TYPE_MAP = {
        ".txt": "text/plain",
        ".md": "text/markdown",
        ".csv": "text/csv",
        ".json": "application/json",
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    }

    def __init__(self, loader_result: LoaderResult, original_file_path: str = None):
        self.loader_result = loader_result
        self.original_file_path = original_file_path
        self._cached_metadata = None  # lazily populated by get_metadata()
        self.logger = get_logger(__name__)

    def get_identifier(self) -> str:
        """
        Return a stable content identifier used for deduplication.

        Prefers the loader-provided "content_hash" metadata entry; otherwise
        derives an identifier from an MD5 digest of the content, prefixed
        with the content type value.

        NOTE: MD5 is kept deliberately — identifiers must stay stable across
        releases so previously ingested data continues to deduplicate.
        """
        if "content_hash" in self.loader_result.metadata:
            return self.loader_result.metadata["content_hash"]

        import hashlib

        content_bytes = self.loader_result.content.encode("utf-8")
        content_hash = hashlib.md5(content_bytes).hexdigest()

        content_type = self.loader_result.content_type.value
        return f"{content_type}_{content_hash}"

    def get_metadata(self) -> dict:
        """
        Return file metadata in the shape expected by the existing pipeline.

        Fills in "name", "content_hash", "file_path" and "mime_type" when the
        loader did not provide them. The result is cached after the first
        call, so later mutations of the loader result are not reflected.
        """
        if self._cached_metadata is not None:
            return self._cached_metadata

        metadata = self.loader_result.metadata.copy()

        if "name" not in metadata:
            if self.original_file_path:
                metadata["name"] = os.path.basename(self.original_file_path)
            else:
                # Synthesize a name from the content hash so downstream code
                # always sees a non-empty, reasonably unique file name.
                content_hash = self.get_identifier().split("_")[-1][:8]
                ext = metadata.get("extension", ".txt")
                metadata["name"] = f"content_{content_hash}{ext}"

        if "content_hash" not in metadata:
            metadata["content_hash"] = self.get_identifier()

        if "file_path" not in metadata and self.original_file_path:
            metadata["file_path"] = self.original_file_path

        if "mime_type" not in metadata:
            ext = metadata.get("extension", "").lower()
            metadata["mime_type"] = self._MIME_TYPE_MAP.get(ext, "application/octet-stream")

        self._cached_metadata = metadata
        return metadata

    def get_data(self) -> Union[str, BinaryIO]:
        """
        Return the loaded content for downstream processing.

        The existing pipeline consumes text, so the loader result's content
        string is returned for every content type (TEXT, structured and
        binary alike). The previous per-type branch returned the identical
        expression in both arms and was collapsed.
        """
        return self.loader_result.content
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderToIngestionAdapter:
    """
    Adapter that bridges the new loader system with existing ingestion pipeline.

    This class provides methods to process files using the loader system
    while maintaining compatibility with the existing IngestionData interface.
    On any loader failure it falls back to the original ingestion.classify()
    path, so callers never observe loader-system errors directly.
    """

    def __init__(self):
        self.logger = get_logger(__name__)

    async def process_file_with_loaders(
        self,
        file_path: str,
        s3fs: Optional[Any] = None,
        preferred_loaders: Optional[list] = None,
        loader_config: Optional[dict] = None,
    ) -> IngestionData:
        """
        Process a file using the loader system and return IngestionData.

        Args:
            file_path: Path to the file to process (local, file:// or s3://)
            s3fs: S3 filesystem (for compatibility with existing code)
            preferred_loaders: List of preferred loader names
            loader_config: Per-loader configuration, keyed by loader name

        Returns:
            IngestionData compatible object

        Raises:
            Exception: Only if both the loader system and the fallback
                classification path fail.
        """
        # Imported lazily to avoid an import cycle with the loaders package.
        from cognee.infrastructure.loaders import get_loader_engine

        try:
            engine = get_loader_engine()

            # Best-effort MIME detection; selection still works without it.
            mime_type = None
            try:
                import mimetypes

                mime_type, _ = mimetypes.guess_type(file_path)
            except Exception:
                pass

            self.logger.info(f"Processing file with loaders: {file_path}")

            # Extract loader-specific kwargs for whichever loader the engine
            # would pick. NOTE(review): get_loader() is called here and again
            # inside load_file(); both calls should select the same loader —
            # confirm the engine's selection is deterministic.
            kwargs = {}
            if loader_config:
                loader = engine.get_loader(file_path, mime_type, preferred_loaders)
                if loader and loader.loader_name in loader_config:
                    kwargs = loader_config[loader.loader_name]

            loader_result = await engine.load_file(
                file_path, mime_type=mime_type, preferred_loaders=preferred_loaders, **kwargs
            )

            # Wrap the loader result so it satisfies the IngestionData API.
            return LoaderResultToIngestionData(loader_result, file_path)

        except Exception as e:
            # Any loader failure degrades to the original classify() path.
            self.logger.warning(f"Loader system failed for {file_path}: {e}")
            return await self._fallback_to_existing_system(file_path, s3fs)

    async def _fallback_to_existing_system(
        self, file_path: str, s3fs: Optional[Any] = None
    ) -> IngestionData:
        """
        Fallback to existing ingestion.classify() system for backward compatibility.

        This ensures that even if the loader system fails, we can still process
        files using the original classification method.

        NOTE(review): the file handle is closed when the ``with`` block exits,
        i.e. before the caller uses the returned IngestionData — this assumes
        classify() consumes the stream eagerly. Confirm against classify().
        """
        from cognee.modules.ingestion import classify

        self.logger.info(f"Falling back to existing classification system for: {file_path}")

        if file_path.startswith("s3://"):
            if s3fs:
                with s3fs.open(file_path, "rb") as file:
                    return classify(file, s3fs=s3fs)
            else:
                raise ValueError("S3 file path provided but no s3fs available")
        else:
            # Handle local files and file:// URLs alike.
            local_path = file_path.replace("file://", "")
            with open(local_path, "rb") as file:
                return classify(file)

    def is_text_content(self, data: Union[str, Any]) -> bool:
        """
        Check if the provided data is text content (not a file path).

        Heuristic: anything that looks like an absolute path, a file:// or
        s3:// URL, or a Windows drive path is treated as a path.

        NOTE(review): the ``data[1] == ":"`` check flags ANY string whose
        second character is a colon (e.g. "x: y") as a Windows path —
        confirm this false-positive is acceptable for expected inputs.

        Args:
            data: The data to check

        Returns:
            True if data is text content, False if it's a file path
        """
        if not isinstance(data, str):
            return False

        # Check if it's a file path
        if (
            data.startswith("/")
            or data.startswith("file://")
            or data.startswith("s3://")
            or (len(data) > 1 and data[1] == ":")
        ):  # Windows drive paths
            return False

        return True

    def create_text_ingestion_data(self, content: str) -> IngestionData:
        """
        Create IngestionData for raw text content.

        Args:
            content: Text content to wrap

        Returns:
            IngestionData compatible object (a TextData wrapper)
        """
        from cognee.modules.ingestion.data_types import TextData

        return TextData(content)
|
||||||
223
cognee/tasks/ingestion/plugin_ingest_data.py
Normal file
223
cognee/tasks/ingestion/plugin_ingest_data.py
Normal file
|
|
@ -0,0 +1,223 @@
|
||||||
|
import json
|
||||||
|
import inspect
|
||||||
|
from uuid import UUID
|
||||||
|
from typing import Union, BinaryIO, Any, List, Optional
|
||||||
|
|
||||||
|
import cognee.modules.ingestion as ingestion
|
||||||
|
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||||
|
from cognee.modules.data.models import Data
|
||||||
|
from cognee.modules.users.models import User
|
||||||
|
from cognee.modules.users.methods import get_default_user
|
||||||
|
from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
|
||||||
|
from cognee.modules.data.methods import (
|
||||||
|
get_authorized_existing_datasets,
|
||||||
|
get_dataset_data,
|
||||||
|
load_or_create_datasets,
|
||||||
|
)
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
from .save_data_item_to_storage import save_data_item_to_storage
|
||||||
|
from .adapters import LoaderToIngestionAdapter
|
||||||
|
from cognee.api.v1.add.config import get_s3_config
|
||||||
|
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def plugin_ingest_data(
    data: Any,
    dataset_name: str,
    user: User,
    node_set: Optional[List[str]] = None,
    dataset_id: UUID = None,
    preferred_loaders: Optional[List[str]] = None,
    loader_config: Optional[dict] = None,
):
    """
    Plugin-based data ingestion using the loader system.

    This function maintains full backward compatibility with the existing
    ingest_data function while adding support for the new loader system.
    Text items keep the original classify() path; file paths go through the
    loader adapter, which itself falls back to classify() on failure.

    Args:
        data: The data to ingest (iterable of text items and/or file paths)
        dataset_name: Name of the dataset
        user: User object for permissions; defaults to the default user
        node_set: Optional node set for organization
        dataset_id: Optional specific dataset ID
        preferred_loaders: List of preferred loader names to try first
        loader_config: Configuration for specific loaders

    Returns:
        List of Data objects that were ingested
    """
    if not user:
        user = await get_default_user()

    # Initialize S3 support (maintain existing behavior): only build the
    # filesystem when both credentials are configured.
    s3_config = get_s3_config()
    fs = None
    if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
        import s3fs

        fs = s3fs.S3FileSystem(
            key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
        )

    # Initialize the loader adapter used for non-text items.
    loader_adapter = LoaderToIngestionAdapter()

    def open_data_file(file_path: str):
        """Open file with S3 support (preserves existing behavior)."""
        if file_path.startswith("s3://"):
            # NOTE(review): assumes fs was configured when an s3:// path
            # arrives; otherwise this raises AttributeError — confirm.
            return fs.open(file_path, mode="rb")
        else:
            local_path = file_path.replace("file://", "")
            return open(local_path, mode="rb")

    def get_external_metadata_dict(data_item: Union[BinaryIO, str, Any]) -> dict[str, Any]:
        """Get external metadata (preserves existing behavior).

        Items exposing a bound ``.dict()`` method (e.g. pydantic models)
        contribute their serialized form; everything else yields {}.
        """
        if hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")):
            return {"metadata": data_item.dict(), "origin": str(type(data_item))}
        else:
            return {}

    async def store_data_to_dataset(
        data: Any,
        dataset_name: str,
        user: User,
        node_set: Optional[List[str]] = None,
        dataset_id: UUID = None,
    ):
        """
        Core data storage logic with plugin-based file processing.

        This function preserves all existing permission and database logic
        while using the new loader system for file processing. It closes
        over preferred_loaders, loader_config, fs and loader_adapter from
        the enclosing scope.
        """
        logger.info(f"Plugin-based ingestion starting for dataset: {dataset_name}")

        # Preserve existing dataset creation and permission logic: only
        # datasets the user can write to are considered.
        user_datasets = await get_specific_user_permission_datasets(user.id, ["write"])
        existing_datasets = await get_authorized_existing_datasets(user.id, dataset_name, ["write"])

        datasets = await load_or_create_datasets(
            user_datasets, existing_datasets, dataset_name, user, dataset_id
        )

        dataset = datasets[0]

        new_datapoints = []  # Data rows not yet in the relational store
        existing_data_points = []  # rows already present AND already in this dataset
        dataset_new_data_points = []  # rows present in store but new to this dataset

        # Get existing dataset data for deduplication (preserve existing logic).
        dataset_data: list[Data] = await get_dataset_data(dataset.id)
        dataset_data_map = {str(data.id): True for data in dataset_data}

        for data_item in data:
            file_path = await save_data_item_to_storage(data_item, dataset_name)

            # NEW: Use loader system or existing classification based on data type.
            try:
                if loader_adapter.is_text_content(data_item):
                    # Handle text content (preserve existing behavior).
                    logger.info("Processing text content with existing system")
                    classified_data = ingestion.classify(data_item)
                else:
                    # Use loader system for file paths.
                    logger.info(f"Processing file with loader system: {file_path}")
                    classified_data = await loader_adapter.process_file_with_loaders(
                        file_path,
                        s3fs=fs,
                        preferred_loaders=preferred_loaders,
                        loader_config=loader_config,
                    )

            except Exception as e:
                logger.warning(f"Plugin system failed for {file_path}, falling back: {e}")
                # Fallback to existing system for full backward compatibility.
                with open_data_file(file_path) as file:
                    classified_data = ingestion.classify(file, s3fs=fs)

            # Preserve all existing data processing logic.
            data_id = ingestion.identify(classified_data, user)
            file_metadata = classified_data.get_metadata()

            from sqlalchemy import select

            db_engine = get_relational_engine()

            # Check if data should be updated (preserve existing logic).
            async with db_engine.get_async_session() as session:
                data_point = (
                    await session.execute(select(Data).filter(Data.id == data_id))
                ).scalar_one_or_none()

            ext_metadata = get_external_metadata_dict(data_item)

            if node_set:
                ext_metadata["node_set"] = node_set

            # Preserve existing data point creation/update logic.
            if data_point is not None:
                data_point.name = file_metadata["name"]
                data_point.raw_data_location = file_metadata["file_path"]
                data_point.extension = file_metadata["extension"]
                data_point.mime_type = file_metadata["mime_type"]
                data_point.owner_id = user.id
                data_point.content_hash = file_metadata["content_hash"]
                data_point.external_metadata = ext_metadata
                data_point.node_set = json.dumps(node_set) if node_set else None

                if str(data_point.id) in dataset_data_map:
                    existing_data_points.append(data_point)
                else:
                    dataset_new_data_points.append(data_point)
                    dataset_data_map[str(data_point.id)] = True
            else:
                # Skip items already linked to this dataset in a previous run.
                if str(data_id) in dataset_data_map:
                    continue

                data_point = Data(
                    id=data_id,
                    name=file_metadata["name"],
                    raw_data_location=file_metadata["file_path"],
                    extension=file_metadata["extension"],
                    mime_type=file_metadata["mime_type"],
                    owner_id=user.id,
                    content_hash=file_metadata["content_hash"],
                    external_metadata=ext_metadata,
                    node_set=json.dumps(node_set) if node_set else None,
                    token_count=-1,  # sentinel: token count not yet computed
                )

                new_datapoints.append(data_point)
                dataset_data_map[str(data_point.id)] = True

        # Preserve existing database operations.
        # NOTE(review): db_engine is first bound inside the loop above; an
        # empty `data` iterable would leave it undefined here — confirm
        # callers never pass an empty collection.
        async with db_engine.get_async_session() as session:
            if dataset not in session:
                session.add(dataset)

            if len(new_datapoints) > 0:
                dataset.data.extend(new_datapoints)

            if len(existing_data_points) > 0:
                for data_point in existing_data_points:
                    await session.merge(data_point)

            if len(dataset_new_data_points) > 0:
                dataset.data.extend(dataset_new_data_points)

            await session.merge(dataset)
            await session.commit()

        logger.info(
            f"Plugin-based ingestion completed. New: {len(new_datapoints)}, "
            + f"Updated: {len(existing_data_points)}, Dataset new: {len(dataset_new_data_points)}"
        )

        return existing_data_points + dataset_new_data_points + new_datapoints

    return await store_data_to_dataset(data, dataset_name, user, node_set, dataset_id)
|
||||||
237
infrastructure/loaders/LoaderEngine.py
Normal file
237
infrastructure/loaders/LoaderEngine.py
Normal file
|
|
@ -0,0 +1,237 @@
|
||||||
|
import os
|
||||||
|
import importlib.util
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from .LoaderInterface import LoaderInterface
|
||||||
|
from .models.LoaderResult import LoaderResult
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderEngine:
    """
    Main loader engine for managing file loaders.

    Follows cognee's adapter pattern similar to database engines,
    providing a centralized system for file loading operations:
    registration, selection by extension/MIME type, dispatch, and
    directory-based auto-discovery of loader plugins.
    """

    def __init__(
        self,
        loader_directories: List[str],
        default_loader_priority: List[str],
        fallback_loader: str = "text_loader",
        enable_dependency_validation: bool = True,
    ):
        """
        Initialize the loader engine.

        Args:
            loader_directories: Directories to search for loader implementations.
            default_loader_priority: Priority order for loader selection.
            fallback_loader: Loader name used when no other loader matches.
            enable_dependency_validation: Whether to validate loader
                dependencies before registration.
        """
        self._loaders: Dict[str, LoaderInterface] = {}
        self._extension_map: Dict[str, List[LoaderInterface]] = {}
        self._mime_type_map: Dict[str, List[LoaderInterface]] = {}
        self.loader_directories = loader_directories
        self.default_loader_priority = default_loader_priority
        self.fallback_loader = fallback_loader
        self.enable_dependency_validation = enable_dependency_validation
        self.logger = get_logger(__name__)

    def register_loader(self, loader: LoaderInterface) -> bool:
        """
        Register a loader with the engine.

        Re-registering a loader under an existing name replaces the previous
        instance; its stale extension/MIME lookup entries are removed first
        so repeated registration cannot leave duplicates in the maps.

        Args:
            loader: LoaderInterface implementation to register.

        Returns:
            True if the loader was registered, False if it was skipped
            because a required dependency is missing.
        """
        # Validate dependencies if enabled.
        if self.enable_dependency_validation and not loader.validate_dependencies():
            self.logger.warning(
                f"Skipping loader '{loader.loader_name}' - missing dependencies: "
                f"{loader.get_dependencies()}"
            )
            return False

        # Purge lookup entries of a previously registered loader with the
        # same name (fixes duplicate map entries on re-registration).
        previous = self._loaders.get(loader.loader_name)
        if previous is not None:
            for registered in self._extension_map.values():
                if previous in registered:
                    registered.remove(previous)
            for registered in self._mime_type_map.values():
                if previous in registered:
                    registered.remove(previous)

        self._loaders[loader.loader_name] = loader

        # Map extensions (lower-cased) and MIME types to the loader.
        for ext in loader.supported_extensions:
            self._extension_map.setdefault(ext.lower(), []).append(loader)

        for mime_type in loader.supported_mime_types:
            self._mime_type_map.setdefault(mime_type, []).append(loader)

        self.logger.info(f"Registered loader: {loader.loader_name}")
        return True

    def get_loader(
        self, file_path: str, mime_type: str = None, preferred_loaders: List[str] = None
    ) -> Optional[LoaderInterface]:
        """
        Get an appropriate loader for a file.

        Selection order: explicit preferred loaders, then the configured
        default priority list, then the MIME-type map, then the extension
        map, and finally the configured fallback loader. In every step the
        candidate must also report can_handle() == True.

        Args:
            file_path: Path to the file to be processed.
            mime_type: Optional MIME type of the file.
            preferred_loaders: Loader names to try first, in order.

        Returns:
            LoaderInterface that can handle the file, or None if not found.
        """
        ext = os.path.splitext(file_path)[1].lower()

        # Try preferred loaders first.
        if preferred_loaders:
            for loader_name in preferred_loaders:
                if loader_name in self._loaders:
                    loader = self._loaders[loader_name]
                    if loader.can_handle(file_path, mime_type):
                        return loader

        # Try the configured priority order.
        for loader_name in self.default_loader_priority:
            if loader_name in self._loaders:
                loader = self._loaders[loader_name]
                if loader.can_handle(file_path, mime_type):
                    return loader

        # Try MIME-type mapping.
        if mime_type and mime_type in self._mime_type_map:
            for loader in self._mime_type_map[mime_type]:
                if loader.can_handle(file_path, mime_type):
                    return loader

        # Try extension mapping.
        if ext in self._extension_map:
            for loader in self._extension_map[ext]:
                if loader.can_handle(file_path, mime_type):
                    return loader

        # Last resort: the configured fallback loader.
        if self.fallback_loader in self._loaders:
            fallback = self._loaders[self.fallback_loader]
            if fallback.can_handle(file_path, mime_type):
                return fallback

        return None

    async def load_file(
        self, file_path: str, mime_type: str = None, preferred_loaders: List[str] = None, **kwargs
    ) -> LoaderResult:
        """
        Load a file using the most appropriate registered loader.

        Args:
            file_path: Path to the file to be processed.
            mime_type: Optional MIME type of the file.
            preferred_loaders: Loader names to try first, in order.
            **kwargs: Additional loader-specific configuration.

        Returns:
            LoaderResult containing processed content and metadata.

        Raises:
            ValueError: If no suitable loader is found.
            Exception: Propagated from the loader if processing fails.
        """
        loader = self.get_loader(file_path, mime_type, preferred_loaders)
        if not loader:
            raise ValueError(f"No loader found for file: {file_path}")

        self.logger.debug(f"Loading {file_path} with {loader.loader_name}")
        return await loader.load(file_path, **kwargs)

    def discover_loaders(self):
        """
        Auto-discover loaders from the configured directories.

        Scans each existing loader directory for Python modules containing
        LoaderInterface implementations and registers them; missing
        directories are silently skipped.
        """
        for directory in self.loader_directories:
            if os.path.exists(directory):
                self._discover_in_directory(directory)

    def _discover_in_directory(self, directory: str):
        """
        Discover and register loaders found in a single directory.

        Modules whose file name starts with "_" are skipped. Failures to
        import a module or instantiate a loader are logged and do not abort
        discovery of the remaining modules.

        Args:
            directory: Directory path to scan for loader implementations.
        """
        try:
            for file_name in os.listdir(directory):
                if file_name.endswith(".py") and not file_name.startswith("_"):
                    module_name = file_name[:-3]
                    file_path = os.path.join(directory, file_name)

                    try:
                        spec = importlib.util.spec_from_file_location(module_name, file_path)
                        if spec and spec.loader:
                            module = importlib.util.module_from_spec(spec)
                            spec.loader.exec_module(module)

                            # Register every concrete LoaderInterface
                            # subclass the module defines or re-exports.
                            for attr_name in dir(module):
                                attr = getattr(module, attr_name)
                                if (
                                    isinstance(attr, type)
                                    and issubclass(attr, LoaderInterface)
                                    and attr is not LoaderInterface
                                ):
                                    try:
                                        self.register_loader(attr())
                                    except Exception as e:
                                        self.logger.warning(
                                            f"Failed to instantiate loader {attr_name}: {e}"
                                        )

                    except Exception as e:
                        self.logger.warning(f"Failed to load module {module_name}: {e}")

        except OSError as e:
            self.logger.warning(f"Failed to scan directory {directory}: {e}")

    def get_available_loaders(self) -> List[str]:
        """
        Return the names of all currently registered loaders.
        """
        return list(self._loaders.keys())

    def get_loader_info(self, loader_name: str) -> Dict[str, object]:
        """
        Get descriptive information about a registered loader.

        The value annotation was corrected from ``Dict[str, any]`` (the
        builtin function ``any``, not a type) to ``Dict[str, object]``.

        Args:
            loader_name: Name of the loader to inspect.

        Returns:
            Dictionary with name, supported extensions/MIME types,
            dependencies and availability; empty dict for unknown names.
        """
        if loader_name not in self._loaders:
            return {}

        loader = self._loaders[loader_name]
        return {
            "name": loader.loader_name,
            "extensions": loader.supported_extensions,
            "mime_types": loader.supported_mime_types,
            "dependencies": loader.get_dependencies(),
            "available": loader.validate_dependencies(),
        }
|
||||||
101
infrastructure/loaders/LoaderInterface.py
Normal file
101
infrastructure/loaders/LoaderInterface.py
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import List
|
||||||
|
from .models.LoaderResult import LoaderResult
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderInterface(ABC):
    """
    Base interface for all file loaders in cognee.

    This interface follows cognee's established pattern for database adapters,
    ensuring consistent behavior across all loader implementations.
    """

    @property
    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """
        List of file extensions this loader supports.

        Returns:
            List of extensions including the dot (e.g. ['.txt', '.md'])
        """

    @property
    @abstractmethod
    def supported_mime_types(self) -> List[str]:
        """
        List of MIME types this loader supports.

        Returns:
            List of MIME type strings (e.g. ['text/plain', 'application/pdf'])
        """

    @property
    @abstractmethod
    def loader_name(self) -> str:
        """
        Unique name identifier for this loader.

        Returns:
            String identifier used for registration and configuration
        """

    @abstractmethod
    def can_handle(self, file_path: str, mime_type: str = None) -> bool:
        """
        Check if this loader can handle the given file.

        Args:
            file_path: Path to the file to be processed
            mime_type: Optional MIME type of the file

        Returns:
            True if this loader can process the file, False otherwise
        """

    @abstractmethod
    async def load(self, file_path: str, **kwargs) -> "LoaderResult":
        """
        Load and process the file, returning standardized result.

        Args:
            file_path: Path to the file to be processed
            **kwargs: Additional loader-specific configuration

        Returns:
            LoaderResult containing processed content and metadata

        Raises:
            Exception: If file cannot be processed
        """

    def get_dependencies(self) -> List[str]:
        """
        Optional: Return list of required dependencies for this loader.

        Returns:
            List of package names, optionally with version specifiers
            (e.g. ["pypdf>=3.0"]) or extras (e.g. "unstructured[md]").
        """
        return []

    def validate_dependencies(self) -> bool:
        """
        Check if all required dependencies are importable.

        Dependency strings may carry version specifiers or extras; the
        leading package-name token is extracted before any of
        ``[ < > = ! ~ ;`` or whitespace (the previous implementation only
        stripped ``>=``/``==``/``<`` and therefore failed on ``>``, ``~=``,
        ``!=`` and extras). importlib.util.find_spec is used instead of
        __import__ so modules are located without executing their
        import-time side effects.

        NOTE: this assumes the distribution name matches the importable
        module name; packages where they differ (e.g. "pillow" -> PIL)
        will be reported as missing.

        Returns:
            True if all dependencies are installed, False otherwise
        """
        import re
        import importlib.util

        for dep in self.get_dependencies():
            # Keep only the leading package-name token.
            package_name = re.split(r"[\[<>=!~;\s]", dep, maxsplit=1)[0]
            if not package_name:
                return False
            try:
                if importlib.util.find_spec(package_name) is None:
                    return False
            except (ImportError, ValueError):
                return False
        return True
|
||||||
19
infrastructure/loaders/__init__.py
Normal file
19
infrastructure/loaders/__init__.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""
|
||||||
|
File loader infrastructure for cognee.
|
||||||
|
|
||||||
|
This package provides a plugin-based system for loading different file formats
|
||||||
|
into cognee, following the same patterns as database adapters.
|
||||||
|
|
||||||
|
Main exports:
|
||||||
|
- get_loader_engine(): Factory function to get configured loader engine
|
||||||
|
- use_loader(): Register custom loaders at runtime
|
||||||
|
- LoaderInterface: Base interface for implementing loaders
|
||||||
|
- LoaderResult, ContentType: Data models for loader results
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .get_loader_engine import get_loader_engine
|
||||||
|
from .use_loader import use_loader
|
||||||
|
from .LoaderInterface import LoaderInterface
|
||||||
|
from .models.LoaderResult import LoaderResult, ContentType
|
||||||
|
|
||||||
|
__all__ = ["get_loader_engine", "use_loader", "LoaderInterface", "LoaderResult", "ContentType"]
|
||||||
57
infrastructure/loaders/config.py
Normal file
57
infrastructure/loaders/config.py
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import List, Optional, Dict, Any
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
from cognee.root_dir import get_absolute_path
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderConfig(BaseSettings):
    """
    Configuration for file loader system.

    Follows cognee's pattern using pydantic_settings.BaseSettings for
    environment variable support and validation.

    Each field can be overridden through an environment variable carrying
    the ``LOADER_`` prefix (per ``model_config`` below), e.g.
    ``LOADER_AUTO_DISCOVER`` or ``LOADER_FALLBACK_LOADER``.
    """

    # Directories scanned for loader implementations; "core" ships with
    # cognee, "external" holds optional dependency-backed loaders.
    loader_directories: List[str] = [
        get_absolute_path("cognee/infrastructure/loaders/core"),
        get_absolute_path("cognee/infrastructure/loaders/external"),
    ]
    # Order in which loaders are preferred when several can handle a file.
    default_loader_priority: List[str] = [
        "text_loader",
        "pypdf_loader",
        "unstructured_loader",
        "dlt_loader",
    ]
    # Whether the engine scans loader_directories for loaders at creation time.
    auto_discover: bool = True
    # Loader used when no other registered loader matches a file.
    fallback_loader: str = "text_loader"
    # When True, loaders whose dependencies are missing are skipped at registration.
    enable_dependency_validation: bool = True

    # extra="allow" tolerates unrelated LOADER_-prefixed variables in .env.
    model_config = SettingsConfigDict(env_file=".env", extra="allow", env_prefix="LOADER_")

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert configuration to dictionary format.

        The keys deliberately mirror the keyword parameters of
        create_loader_engine so the result can be splatted into it.

        Returns:
            Dict containing all loader configuration settings
        """
        return {
            "loader_directories": self.loader_directories,
            "default_loader_priority": self.default_loader_priority,
            "auto_discover": self.auto_discover,
            "fallback_loader": self.fallback_loader,
            "enable_dependency_validation": self.enable_dependency_validation,
        }
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
def get_loader_config() -> LoaderConfig:
    """
    Return the process-wide cached loader configuration.

    The @lru_cache decorator guarantees that a single LoaderConfig is
    constructed per process (reading environment variables and .env once),
    matching cognee's pattern for configuration objects.

    Returns:
        LoaderConfig instance with current settings
    """
    config = LoaderConfig()
    return config
|
||||||
5
infrastructure/loaders/core/__init__.py
Normal file
5
infrastructure/loaders/core/__init__.py
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
"""Core loader implementations that are always available."""
|
||||||
|
|
||||||
|
from .text_loader import TextLoader
|
||||||
|
|
||||||
|
__all__ = ["TextLoader"]
|
||||||
128
infrastructure/loaders/core/text_loader.py
Normal file
128
infrastructure/loaders/core/text_loader.py
Normal file
|
|
@ -0,0 +1,128 @@
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from ..LoaderInterface import LoaderInterface
|
||||||
|
from ..models.LoaderResult import LoaderResult, ContentType
|
||||||
|
|
||||||
|
|
||||||
|
class TextLoader(LoaderInterface):
    """
    Core text file loader that handles basic text file formats.

    This loader is always available and serves as the fallback for
    text-based files when no specialized loader is available.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """Supported text file extensions."""
        return [".txt", ".md", ".csv", ".json", ".xml", ".yaml", ".yml", ".log"]

    @property
    def supported_mime_types(self) -> List[str]:
        """Supported MIME types for text content."""
        return [
            "text/plain",
            "text/markdown",
            "text/csv",
            "application/json",
            "text/xml",
            "application/xml",
            "text/yaml",
            "application/yaml",
        ]

    @property
    def loader_name(self) -> str:
        """Unique identifier for this loader."""
        return "text_loader"

    def can_handle(self, file_path: str, mime_type: str = None) -> bool:
        """
        Check if this loader can handle the given file.

        Matches by extension first, then MIME type, then falls back to a
        content sniff of the first 512 bytes.

        Args:
            file_path: Path to the file
            mime_type: Optional MIME type

        Returns:
            True if file can be handled, False otherwise
        """
        # Check by extension
        ext = os.path.splitext(file_path)[1].lower()
        if ext in self.supported_extensions:
            return True

        # Check by MIME type
        if mime_type and mime_type in self.supported_mime_types:
            return True

        # As fallback loader, attempt a content heuristic on the first bytes.
        try:
            with open(file_path, "rb") as f:
                sample = f.read(512)
            if not sample:
                return False
            # BUGFIX: a bare decode check misclassified binary data as text —
            # control bytes such as b"\x00\x01\x02" are valid UTF-8, and
            # latin-1 decodes *any* byte sequence, so the old fallback
            # accepted essentially every file. NUL bytes are a strong binary
            # indicator; reject them outright.
            if b"\x00" in sample:
                return False
            try:
                sample.decode("utf-8")
                return True
            except UnicodeDecodeError:
                # Not UTF-8: accept only if the sample is mostly printable
                # ASCII/whitespace (a cheap "looks like legacy text" check).
                text_bytes = bytes(range(0x20, 0x7F)) + b"\t\n\r\f\b"
                printable = sum(byte in text_bytes for byte in sample)
                return printable / len(sample) > 0.8
        except (OSError, IOError):
            # Unreadable/missing file: this loader cannot handle it.
            pass

        return False

    async def load(self, file_path: str, encoding: str = "utf-8", **kwargs) -> LoaderResult:
        """
        Load and process the text file.

        Args:
            file_path: Path to the file to load
            encoding: Text encoding to use (default: utf-8)
            **kwargs: Additional configuration (unused)

        Returns:
            LoaderResult containing the file content and metadata

        Raises:
            FileNotFoundError: If file doesn't exist
            UnicodeDecodeError: If file cannot be decoded with specified encoding
            OSError: If file cannot be read
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, "r", encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError:
            # Retry once with latin-1 (decodes any byte sequence) so that
            # legacy-encoded text still loads; re-raise for other encodings.
            if encoding == "utf-8":
                return await self.load(file_path, encoding="latin-1", **kwargs)
            else:
                raise

        # Extract basic metadata
        file_stat = os.stat(file_path)
        metadata = {
            "name": os.path.basename(file_path),
            "size": file_stat.st_size,
            "extension": os.path.splitext(file_path)[1],
            "encoding": encoding,
            "loader": self.loader_name,
            "lines": len(content.splitlines()) if content else 0,
            "characters": len(content),
        }

        return LoaderResult(
            content=content,
            metadata=metadata,
            content_type=ContentType.TEXT,
            source_info={"file_path": file_path, "encoding": encoding},
        )
|
||||||
49
infrastructure/loaders/create_loader_engine.py
Normal file
49
infrastructure/loaders/create_loader_engine.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
from typing import List
|
||||||
|
from .LoaderEngine import LoaderEngine
|
||||||
|
from .supported_loaders import supported_loaders
|
||||||
|
|
||||||
|
|
||||||
|
def create_loader_engine(
    loader_directories: List[str],
    default_loader_priority: List[str],
    auto_discover: bool = True,
    fallback_loader: str = "text_loader",
    enable_dependency_validation: bool = True,
) -> LoaderEngine:
    """
    Build a LoaderEngine from explicit configuration values.

    Mirrors the engine-creation helpers used by cognee's database adapters:
    construct the engine, register every loader from the shared registry,
    then optionally run directory-based discovery.

    Args:
        loader_directories: Directories to search for loader implementations
        default_loader_priority: Priority order for loader selection
        auto_discover: Whether to auto-discover loaders from directories
        fallback_loader: Default loader to use when no other matches
        enable_dependency_validation: Whether to validate loader dependencies

    Returns:
        Configured LoaderEngine instance
    """
    engine = LoaderEngine(
        loader_directories=loader_directories,
        default_loader_priority=default_loader_priority,
        fallback_loader=fallback_loader,
        enable_dependency_validation=enable_dependency_validation,
    )

    # Instantiate and register every loader from the shared registry; a bad
    # loader is logged and skipped so the engine keeps the working ones.
    for loader_name, loader_class in supported_loaders.items():
        try:
            engine.register_loader(loader_class())
        except Exception as e:
            engine.logger.warning(f"Failed to register loader {loader_name}: {e}")

    if auto_discover:
        engine.discover_loaders()

    return engine
|
||||||
20
infrastructure/loaders/get_loader_engine.py
Normal file
20
infrastructure/loaders/get_loader_engine.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
from functools import lru_cache
|
||||||
|
from .config import get_loader_config
|
||||||
|
from .LoaderEngine import LoaderEngine
|
||||||
|
from .create_loader_engine import create_loader_engine
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
def get_loader_engine() -> LoaderEngine:
    """
    Return the process-wide loader engine.

    The @lru_cache decorator means the engine is built once from the
    current environment-driven configuration and then reused, following
    cognee's factory pattern for adapter engines.

    Returns:
        Cached LoaderEngine instance configured with current settings
    """
    return create_loader_engine(**get_loader_config().to_dict())
|
||||||
47
infrastructure/loaders/models/LoaderResult.py
Normal file
47
infrastructure/loaders/models/LoaderResult.py
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Optional, Dict, Any, List
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class ContentType(Enum):
    """Content type classification for loaded files"""

    # Plain textual content (produced by the core text loader).
    TEXT = "text"
    # Structured content — presumably tabular/record data from ingestion
    # loaders; no core loader in view emits it, confirm with external loaders.
    STRUCTURED = "structured"
    # Content that could not be treated as text.
    BINARY = "binary"
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderResult(BaseModel):
    """
    Standardized output format for all file loaders.

    This model ensures consistent data structure across all loader implementations,
    following cognee's pattern of using Pydantic models for data validation.
    """

    content: str  # Primary text content extracted from file
    metadata: Dict[str, Any]  # File metadata (name, size, type, loader info, etc.)
    content_type: ContentType  # Content classification
    chunks: Optional[List[str]] = None  # Pre-chunked content if available
    source_info: Optional[Dict[str, Any]] = None  # Source-specific information

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the loader result to a dictionary format.

        Returns:
            Dict containing all loader result data with string-serialized content_type
        """
        # BUGFIX: with ``use_enum_values = True`` (see Config below) pydantic
        # stores ``content_type`` as the enum's *value* — a plain str — so the
        # previous unconditional ``self.content_type.value`` raised
        # AttributeError. Handle both representations.
        content_type = self.content_type
        if isinstance(content_type, ContentType):
            content_type = content_type.value
        return {
            "content": self.content,
            "metadata": self.metadata,
            "content_type": content_type,
            "source_info": self.source_info or {},
            "chunks": self.chunks,
        }

    class Config:
        """Pydantic configuration following cognee patterns"""

        use_enum_values = True
        validate_assignment = True
|
||||||
3
infrastructure/loaders/models/__init__.py
Normal file
3
infrastructure/loaders/models/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
from .LoaderResult import LoaderResult, ContentType
|
||||||
|
|
||||||
|
__all__ = ["LoaderResult", "ContentType"]
|
||||||
3
infrastructure/loaders/supported_loaders.py
Normal file
3
infrastructure/loaders/supported_loaders.py
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
# Registry for loader implementations
# Follows cognee's pattern used in databases/vector/supported_databases.py
# Maps loader name (str) -> loader class implementing LoaderInterface;
# populated at runtime via use_loader() and consumed by create_loader_engine().
supported_loaders: dict = {}
|
||||||
22
infrastructure/loaders/use_loader.py
Normal file
22
infrastructure/loaders/use_loader.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
from .supported_loaders import supported_loaders
|
||||||
|
|
||||||
|
|
||||||
|
def use_loader(loader_name: str, loader_class):
    """
    Register a custom loader class under a unique name at runtime.

    This mirrors the adapter-registration pattern cognee uses for its
    databases: third-party packages and applications can plug their own
    loaders into the shared registry, from which create_loader_engine
    instantiates them. Registering an existing name replaces that entry.

    Args:
        loader_name: Unique name for the loader
        loader_class: Loader class implementing LoaderInterface

    Example:
        from cognee.infrastructure.loaders import use_loader
        from my_package import MyCustomLoader

        use_loader("my_custom_loader", MyCustomLoader)
    """
    supported_loaders[loader_name] = loader_class
|
||||||
1
tests/__init__.py
Normal file
1
tests/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Tests package
|
||||||
1
tests/unit/__init__.py
Normal file
1
tests/unit/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Unit tests package
|
||||||
1
tests/unit/infrastructure/__init__.py
Normal file
1
tests/unit/infrastructure/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Infrastructure tests package
|
||||||
1
tests/unit/infrastructure/loaders/__init__.py
Normal file
1
tests/unit/infrastructure/loaders/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Loaders tests package
|
||||||
252
tests/unit/infrastructure/loaders/test_loader_engine.py
Normal file
252
tests/unit/infrastructure/loaders/test_loader_engine.py
Normal file
|
|
@ -0,0 +1,252 @@
|
||||||
|
import pytest
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
from unittest.mock import Mock, AsyncMock
|
||||||
|
|
||||||
|
from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
|
||||||
|
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||||
|
from cognee.infrastructure.loaders.models.LoaderResult import LoaderResult, ContentType
|
||||||
|
|
||||||
|
|
||||||
|
class MockLoader(LoaderInterface):
    """Configurable stand-in loader used by the LoaderEngine tests."""

    def __init__(self, name="mock_loader", extensions=None, mime_types=None, fail_deps=False):
        self._name = name
        # ``or`` fallbacks: falsy arguments behave exactly like omitted ones.
        self._extensions = extensions or [".mock"]
        self._mime_types = mime_types or ["application/mock"]
        self._fail_deps = fail_deps

    @property
    def supported_extensions(self):
        return self._extensions

    @property
    def supported_mime_types(self):
        return self._mime_types

    @property
    def loader_name(self):
        return self._name

    def can_handle(self, file_path: str, mime_type: str = None) -> bool:
        # Match either by lower-cased extension or by exact MIME type.
        extension = os.path.splitext(file_path)[1].lower()
        if extension in self._extensions:
            return True
        return mime_type in self._mime_types

    async def load(self, file_path: str, **kwargs) -> LoaderResult:
        base_name = os.path.basename(file_path)
        return LoaderResult(
            content=f"Mock content from {self._name}",
            metadata={"loader": self._name, "name": base_name},
            content_type=ContentType.TEXT,
        )

    def validate_dependencies(self) -> bool:
        # Simulate a missing-dependency loader when requested via fail_deps.
        return not self._fail_deps
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoaderEngine:
    """Test the LoaderEngine class."""

    @pytest.fixture
    def engine(self):
        """Create a LoaderEngine instance for testing."""
        return LoaderEngine(
            loader_directories=[],
            default_loader_priority=["loader1", "loader2"],
            fallback_loader="fallback",
            enable_dependency_validation=True,
        )

    def test_engine_initialization(self, engine):
        """Test LoaderEngine initialization."""
        assert engine.loader_directories == []
        assert engine.default_loader_priority == ["loader1", "loader2"]
        assert engine.fallback_loader == "fallback"
        assert engine.enable_dependency_validation is True
        assert len(engine.get_available_loaders()) == 0

    def test_register_loader_success(self, engine):
        """Test successful loader registration."""
        loader = MockLoader("test_loader", [".test"])

        success = engine.register_loader(loader)

        assert success is True
        assert "test_loader" in engine.get_available_loaders()
        # NOTE(review): reaches into private engine state (_loaders,
        # _extension_map, _mime_type_map) — couples the test to internals.
        assert engine._loaders["test_loader"] == loader
        assert ".test" in engine._extension_map
        assert "application/mock" in engine._mime_type_map

    def test_register_loader_with_failed_dependencies(self, engine):
        """Test loader registration with failed dependency validation."""
        loader = MockLoader("test_loader", [".test"], fail_deps=True)

        success = engine.register_loader(loader)

        assert success is False
        assert "test_loader" not in engine.get_available_loaders()

    def test_register_loader_without_dependency_validation(self):
        """Test loader registration without dependency validation."""
        # Builds its own engine (not the fixture) to flip the validation flag.
        engine = LoaderEngine(
            loader_directories=[], default_loader_priority=[], enable_dependency_validation=False
        )
        loader = MockLoader("test_loader", [".test"], fail_deps=True)

        success = engine.register_loader(loader)

        assert success is True
        assert "test_loader" in engine.get_available_loaders()

    def test_get_loader_by_extension(self, engine):
        """Test getting loader by file extension."""
        loader1 = MockLoader("loader1", [".txt"])
        loader2 = MockLoader("loader2", [".pdf"])

        engine.register_loader(loader1)
        engine.register_loader(loader2)

        result = engine.get_loader("test.txt")
        assert result == loader1

        result = engine.get_loader("test.pdf")
        assert result == loader2

        result = engine.get_loader("test.unknown")
        assert result is None

    def test_get_loader_by_mime_type(self, engine):
        """Test getting loader by MIME type."""
        loader = MockLoader("loader", [".txt"], ["text/plain"])
        engine.register_loader(loader)

        result = engine.get_loader("test.unknown", mime_type="text/plain")
        assert result == loader

        result = engine.get_loader("test.unknown", mime_type="application/pdf")
        assert result is None

    def test_get_loader_with_preferences(self, engine):
        """Test getting loader with preferred loaders."""
        loader1 = MockLoader("loader1", [".txt"])
        loader2 = MockLoader("loader2", [".txt"])

        engine.register_loader(loader1)
        engine.register_loader(loader2)

        # Should get preferred loader
        result = engine.get_loader("test.txt", preferred_loaders=["loader2"])
        assert result == loader2

        # Should fallback to first available if preferred not found
        result = engine.get_loader("test.txt", preferred_loaders=["nonexistent"])
        assert result in [loader1, loader2]  # One of them should be returned

    def test_get_loader_with_priority(self, engine):
        """Test loader selection with priority order."""
        engine.default_loader_priority = ["priority_loader", "other_loader"]

        priority_loader = MockLoader("priority_loader", [".txt"])
        other_loader = MockLoader("other_loader", [".txt"])

        # Register in reverse order
        engine.register_loader(other_loader)
        engine.register_loader(priority_loader)

        # Should get priority loader even though other was registered first
        result = engine.get_loader("test.txt")
        assert result == priority_loader

    def test_get_loader_fallback(self, engine):
        """Test fallback loader selection."""
        fallback_loader = MockLoader("fallback", [".txt"])
        other_loader = MockLoader("other", [".pdf"])

        engine.register_loader(fallback_loader)
        engine.register_loader(other_loader)
        engine.fallback_loader = "fallback"

        # For .txt file, fallback should be considered
        result = engine.get_loader("test.txt")
        assert result == fallback_loader

        # For unknown extension, should still get fallback if it can handle
        result = engine.get_loader("test.unknown")
        assert result == fallback_loader

    @pytest.mark.asyncio
    async def test_load_file_success(self, engine):
        """Test successful file loading."""
        loader = MockLoader("test_loader", [".txt"])
        engine.register_loader(loader)

        # delete=False so the path survives the with-block; cleaned up below.
        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
            f.write(b"test content")
            temp_path = f.name

        try:
            result = await engine.load_file(temp_path)
            assert result.content == "Mock content from test_loader"
            assert result.metadata["loader"] == "test_loader"
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    @pytest.mark.asyncio
    async def test_load_file_no_loader(self, engine):
        """Test file loading when no suitable loader is found."""
        with pytest.raises(ValueError, match="No loader found for file"):
            await engine.load_file("test.unknown")

    @pytest.mark.asyncio
    async def test_load_file_with_preferences(self, engine):
        """Test file loading with preferred loaders."""
        loader1 = MockLoader("loader1", [".txt"])
        loader2 = MockLoader("loader2", [".txt"])

        engine.register_loader(loader1)
        engine.register_loader(loader2)

        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
            f.write(b"test content")
            temp_path = f.name

        try:
            result = await engine.load_file(temp_path, preferred_loaders=["loader2"])
            assert result.metadata["loader"] == "loader2"
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    def test_get_loader_info(self, engine):
        """Test getting loader information."""
        loader = MockLoader("test_loader", [".txt"], ["text/plain"])
        engine.register_loader(loader)

        info = engine.get_loader_info("test_loader")

        assert info["name"] == "test_loader"
        assert info["extensions"] == [".txt"]
        assert info["mime_types"] == ["text/plain"]
        assert info["available"] is True

        # Test non-existent loader
        info = engine.get_loader_info("nonexistent")
        assert info == {}

    def test_discover_loaders_empty_directory(self, engine):
        """Test loader discovery with empty directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            engine.loader_directories = [temp_dir]
            engine.discover_loaders()

            # Should not find any loaders in empty directory
            assert len(engine.get_available_loaders()) == 0

    def test_discover_loaders_nonexistent_directory(self, engine):
        """Test loader discovery with non-existent directory."""
        engine.loader_directories = ["/nonexistent/directory"]

        # Should not raise exception, just log warning
        engine.discover_loaders()
        assert len(engine.get_available_loaders()) == 0
|
||||||
99
tests/unit/infrastructure/loaders/test_loader_interface.py
Normal file
99
tests/unit/infrastructure/loaders/test_loader_interface.py
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
import pytest
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
from unittest.mock import AsyncMock
|
||||||
|
|
||||||
|
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||||
|
from cognee.infrastructure.loaders.models.LoaderResult import LoaderResult, ContentType
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoaderInterface:
    """Test the LoaderInterface abstract base class."""

    def test_loader_interface_is_abstract(self):
        """Test that LoaderInterface cannot be instantiated directly."""
        with pytest.raises(TypeError):
            LoaderInterface()

    def test_dependency_validation_with_no_dependencies(self):
        """Test dependency validation when no dependencies are required."""

        # Minimal concrete subclass relying on the interface's default
        # get_dependencies() (which returns an empty list).
        class MockLoader(LoaderInterface):
            @property
            def supported_extensions(self):
                return [".txt"]

            @property
            def supported_mime_types(self):
                return ["text/plain"]

            @property
            def loader_name(self):
                return "mock_loader"

            def can_handle(self, file_path: str, mime_type: str = None) -> bool:
                return True

            async def load(self, file_path: str, **kwargs) -> LoaderResult:
                return LoaderResult(content="test", metadata={}, content_type=ContentType.TEXT)

        loader = MockLoader()
        assert loader.validate_dependencies() is True
        assert loader.get_dependencies() == []

    def test_dependency_validation_with_missing_dependencies(self):
        """Test dependency validation with missing dependencies."""

        # Declares a dependency that cannot be installed, so validation
        # must report False.
        class MockLoaderWithDeps(LoaderInterface):
            @property
            def supported_extensions(self):
                return [".txt"]

            @property
            def supported_mime_types(self):
                return ["text/plain"]

            @property
            def loader_name(self):
                return "mock_loader_deps"

            def get_dependencies(self):
                return ["non_existent_package>=1.0.0"]

            def can_handle(self, file_path: str, mime_type: str = None) -> bool:
                return True

            async def load(self, file_path: str, **kwargs) -> LoaderResult:
                return LoaderResult(content="test", metadata={}, content_type=ContentType.TEXT)

        loader = MockLoaderWithDeps()
        assert loader.validate_dependencies() is False
        assert "non_existent_package>=1.0.0" in loader.get_dependencies()

    def test_dependency_validation_with_existing_dependencies(self):
        """Test dependency validation with existing dependencies."""

        class MockLoaderWithExistingDeps(LoaderInterface):
            @property
            def supported_extensions(self):
                return [".txt"]

            @property
            def supported_mime_types(self):
                return ["text/plain"]

            @property
            def loader_name(self):
                return "mock_loader_existing"

            def get_dependencies(self):
                return ["os"]  # Built-in module that always exists

            def can_handle(self, file_path: str, mime_type: str = None) -> bool:
                return True

            async def load(self, file_path: str, **kwargs) -> LoaderResult:
                return LoaderResult(content="test", metadata={}, content_type=ContentType.TEXT)

        loader = MockLoaderWithExistingDeps()
        assert loader.validate_dependencies() is True
|
||||||
157
tests/unit/infrastructure/loaders/test_text_loader.py
Normal file
157
tests/unit/infrastructure/loaders/test_text_loader.py
Normal file
|
|
@ -0,0 +1,157 @@
|
||||||
|
import pytest
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from cognee.infrastructure.loaders.core.text_loader import TextLoader
|
||||||
|
from cognee.infrastructure.loaders.models.LoaderResult import ContentType
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextLoader:
|
||||||
|
"""Test the TextLoader implementation."""
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def text_loader(self):
|
||||||
|
"""Create a TextLoader instance for testing."""
|
||||||
|
return TextLoader()
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def temp_text_file(self):
|
||||||
|
"""Create a temporary text file for testing."""
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||||
|
f.write("This is a test file.\nIt has multiple lines.\n")
|
||||||
|
temp_path = f.name
|
||||||
|
|
||||||
|
yield temp_path
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
if os.path.exists(temp_path):
|
||||||
|
os.unlink(temp_path)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def temp_binary_file(self):
|
||||||
|
"""Create a temporary binary file for testing."""
|
||||||
|
with tempfile.NamedTemporaryFile(mode="wb", suffix=".bin", delete=False) as f:
|
||||||
|
f.write(b"\x00\x01\x02\x03\x04\x05")
|
||||||
|
temp_path = f.name
|
||||||
|
|
||||||
|
yield temp_path
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
if os.path.exists(temp_path):
|
||||||
|
os.unlink(temp_path)
|
||||||
|
|
||||||
|
def test_loader_properties(self, text_loader):
|
||||||
|
"""Test basic loader properties."""
|
||||||
|
assert text_loader.loader_name == "text_loader"
|
||||||
|
assert ".txt" in text_loader.supported_extensions
|
||||||
|
assert ".md" in text_loader.supported_extensions
|
||||||
|
assert "text/plain" in text_loader.supported_mime_types
|
||||||
|
assert "application/json" in text_loader.supported_mime_types
|
||||||
|
|
||||||
|
def test_can_handle_by_extension(self, text_loader):
|
||||||
|
"""Test file handling by extension."""
|
||||||
|
assert text_loader.can_handle("test.txt")
|
||||||
|
assert text_loader.can_handle("test.md")
|
||||||
|
assert text_loader.can_handle("test.json")
|
||||||
|
assert text_loader.can_handle("test.TXT") # Case insensitive
|
||||||
|
assert not text_loader.can_handle("test.pdf")
|
||||||
|
|
||||||
|
def test_can_handle_by_mime_type(self, text_loader):
|
||||||
|
"""Test file handling by MIME type."""
|
||||||
|
assert text_loader.can_handle("test.unknown", mime_type="text/plain")
|
||||||
|
assert text_loader.can_handle("test.unknown", mime_type="application/json")
|
||||||
|
assert not text_loader.can_handle("test.unknown", mime_type="application/pdf")
|
||||||
|
|
||||||
|
def test_can_handle_text_file_heuristic(self, text_loader, temp_text_file):
    """Test handling of text files by the content heuristic.

    Renames the fixture file to strip its extension so the loader must
    fall back to sniffing the file contents instead of the name.
    """
    # os.path.splitext strips only the trailing extension; the previous
    # str.replace(".txt", "") would also mangle any ".txt" occurring
    # earlier in the path (e.g. a parent directory name).
    no_ext_path = os.path.splitext(temp_text_file)[0]
    os.rename(temp_text_file, no_ext_path)

    try:
        assert text_loader.can_handle(no_ext_path)
    finally:
        # The fixture only cleans up the original path, so remove the
        # renamed file ourselves.
        if os.path.exists(no_ext_path):
            os.unlink(no_ext_path)
|
||||||
|
def test_cannot_handle_binary_file(self, text_loader, temp_binary_file):
    """A file full of non-text bytes is rejected by the loader."""
    can_load = text_loader.can_handle(temp_binary_file)
    assert not can_load
||||||
|
@pytest.mark.asyncio
async def test_load_text_file(self, text_loader, temp_text_file):
    """Loading a plain-text file yields its contents plus loader metadata."""
    result = await text_loader.load(temp_text_file)
    metadata = result.metadata

    # Content checks.
    assert isinstance(result.content, str)
    assert "This is a test file." in result.content
    assert result.content_type == ContentType.TEXT

    # Metadata recorded by the loader.
    assert metadata["loader"] == "text_loader"
    assert metadata["name"] == os.path.basename(temp_text_file)
    assert metadata["lines"] == 2
    assert metadata["encoding"] == "utf-8"

    # Provenance points back at the original file.
    assert result.source_info["file_path"] == temp_text_file
||||||
|
@pytest.mark.asyncio
async def test_load_with_custom_encoding(self, text_loader):
    """An explicit encoding argument is honored and echoed in metadata."""
    # Write a file in latin-1 so a utf-8 read would be wrong.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="latin-1"
    ) as f:
        f.write("Test with åéîøü characters")
        temp_path = f.name

    try:
        result = await text_loader.load(temp_path, encoding="latin-1")
        assert "åéîøü" in result.content
        assert result.metadata["encoding"] == "latin-1"
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
||||||
|
@pytest.mark.asyncio
async def test_load_with_fallback_encoding(self, text_loader):
    """Bytes invalid as UTF-8 trigger an automatic latin-1 fallback."""
    # These bytes decode under latin-1 but are not valid UTF-8.
    latin1_payload = b"Test with \xe5\xe9\xee\xf8\xfc characters"
    with tempfile.NamedTemporaryFile(mode="wb", suffix=".txt", delete=False) as f:
        f.write(latin1_payload)
        temp_path = f.name

    try:
        # No encoding given: the loader should fall back on its own.
        result = await text_loader.load(temp_path)
        assert result.metadata["encoding"] == "latin-1"
        assert len(result.content) > 0
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
||||||
|
@pytest.mark.asyncio
async def test_load_nonexistent_file(self, text_loader):
    """A missing path surfaces as FileNotFoundError."""
    missing_path = "/nonexistent/file.txt"
    with pytest.raises(FileNotFoundError):
        await text_loader.load(missing_path)
||||||
|
@pytest.mark.asyncio
async def test_load_empty_file(self, text_loader):
    """An empty file loads to empty content with zeroed counters."""
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
        # The file is created but nothing is written to it.
        temp_path = f.name

    try:
        result = await text_loader.load(temp_path)
        assert result.content == ""
        for counter in ("lines", "characters"):
            assert result.metadata[counter] == 0
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
||||||
|
def test_no_dependencies(self, text_loader):
    """The text loader relies on nothing outside the standard library."""
    dependencies = text_loader.get_dependencies()
    assert dependencies == []
    assert text_loader.validate_dependencies() is True
||||||
Loading…
Add table
Reference in a new issue