import os from typing import List, Union from pathlib import Path from ..LoaderInterface import LoaderInterface from ..models.LoaderResult import LoaderResult, ContentType class TextLoader(LoaderInterface): """ Core text file loader that handles basic text file formats. This loader is always available and serves as the fallback for text-based files when no specialized loader is available. """ @property def supported_extensions(self) -> List[str]: """Supported text file extensions.""" return [".txt", ".md", ".csv", ".json", ".xml", ".yaml", ".yml", ".log"] @property def supported_mime_types(self) -> List[str]: """Supported MIME types for text content.""" return [ "text/plain", "text/markdown", "text/csv", "application/json", "text/xml", "application/xml", "text/yaml", "application/yaml", ] @property def loader_name(self) -> str: """Unique identifier for this loader.""" return "text_loader" def can_handle(self, file_path: Union[str, Path], mime_type: str = None) -> bool: """ Check if this loader can handle the given file. Args: file_path: Path to the file (Path type recommended for explicit file path handling) mime_type: Optional MIME type Returns: True if file can be handled, False otherwise """ # Convert to Path for consistent handling path_obj = Path(file_path) if isinstance(file_path, str) else file_path # Check by extension ext = path_obj.suffix.lower() if ext in self.supported_extensions: return True # Check by MIME type if mime_type and mime_type in self.supported_mime_types: return True # As fallback loader, can attempt to handle any text-like file # This is useful when other loaders fail try: # Quick check if file appears to be text with open(path_obj, "rb") as f: sample = f.read(512) # Simple heuristic: if most bytes are printable, consider it text if sample: try: sample.decode("utf-8") return True except UnicodeDecodeError: try: sample.decode("latin-1") return True except UnicodeDecodeError: pass except (OSError, IOError): pass return False async def load( self, file_path: Union[str, Path], encoding: str = "utf-8", **kwargs ) -> LoaderResult: """ Load and process the text file. Args: file_path: Path to the file to load (Path type recommended for explicit file path handling) encoding: Text encoding to use (default: utf-8) **kwargs: Additional configuration (unused) Returns: LoaderResult containing the file content and metadata Raises: FileNotFoundError: If file doesn't exist UnicodeDecodeError: If file cannot be decoded with specified encoding OSError: If file cannot be read """ # Convert to Path for consistent handling path_obj = Path(file_path) if isinstance(file_path, str) else file_path if not path_obj.exists(): raise FileNotFoundError(f"File not found: {path_obj}") try: with open(path_obj, "r", encoding=encoding) as f: content = f.read() except UnicodeDecodeError: # Try with fallback encoding if encoding == "utf-8": return await self.load(path_obj, encoding="latin-1", **kwargs) else: raise # Extract basic metadata file_stat = path_obj.stat() metadata = { "name": path_obj.name, "size": file_stat.st_size, "extension": path_obj.suffix, "encoding": encoding, "loader": self.loader_name, "lines": len(content.splitlines()) if content else 0, "characters": len(content), } return LoaderResult( content=content, metadata=metadata, content_type=ContentType.TEXT, source_info={"file_path": str(path_obj), "encoding": encoding}, )