extend LoaderInterface to support web_url_loader, implement load()

This commit is contained in:
Daulet Amirkhanov 2025-10-16 12:40:07 +01:00
parent 305969c61b
commit d884867d2c
5 changed files with 71 additions and 9 deletions

View file

@@ -64,7 +64,7 @@ class LoaderEngine:
return True return True
def get_loader( def get_loader(
self, file_path: str, preferred_loaders: List[str] = None self, data_item_path: str, preferred_loaders: List[str] = None
) -> Optional[LoaderInterface]: ) -> Optional[LoaderInterface]:
""" """
Get appropriate loader for a file. Get appropriate loader for a file.
@@ -77,20 +77,26 @@ class LoaderEngine:
LoaderInterface that can handle the file, or None if not found LoaderInterface that can handle the file, or None if not found
""" """
file_info = filetype.guess(file_path) file_info = filetype.guess(data_item_path)
# Try preferred loaders first # Try preferred loaders first
if preferred_loaders: if preferred_loaders:
for loader_name in preferred_loaders: for loader_name in preferred_loaders:
if loader_name in self._loaders: if loader_name in self._loaders:
loader = self._loaders[loader_name] loader = self._loaders[loader_name]
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): if loader.can_handle(
extension=file_info.extension,
mime_type=file_info.mime,
data_item_path=data_item_path,
): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time
return loader return loader
else: else:
logger.info(f"Skipping {loader_name}: Preferred Loader not registered") logger.info(f"Skipping {loader_name}: Preferred Loader not registered")
# Try default priority order # Try default priority order
for loader_name in self.default_loader_priority: for loader_name in (
self.default_loader_priority
): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review
if loader_name in self._loaders: if loader_name in self._loaders:
loader = self._loaders[loader_name] loader = self._loaders[loader_name]
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):

View file

@@ -44,7 +44,9 @@ class LoaderInterface(ABC):
pass pass
@abstractmethod @abstractmethod
def can_handle(self, extension: str, mime_type: str) -> bool: def can_handle(
self, extension: str, mime_type: str, data_item_path: str = None
) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
""" """
Check if this loader can handle the given file. Check if this loader can handle the given file.

View file

@@ -10,6 +10,7 @@ These loaders are optional and only available if their dependencies are installe
""" """
from .pypdf_loader import PyPdfLoader from .pypdf_loader import PyPdfLoader
from .web_url_loader import WebUrlLoader
__all__ = ["PyPdfLoader"] __all__ = ["PyPdfLoader"]
@@ -27,3 +28,10 @@ try:
__all__.append("AdvancedPdfLoader") __all__.append("AdvancedPdfLoader")
except ImportError: except ImportError:
pass pass
try:
from .web_url_loader import WebUrlLoader
__all__.append("WebUrlLoader")
except ImportError:
pass

View file

@@ -1,8 +1,11 @@
from cognee.infrastructure.loaders import LoaderInterface from cognee.infrastructure.loaders import LoaderInterface
from typing import List from typing import List
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.modules.ingestion import save_data_to_file
class WebLoader(LoaderInterface):
class WebUrlLoader(LoaderInterface):
@property @property
def supported_extensions(self) -> List[str]: def supported_extensions(self) -> List[str]:
""" """
@@ -31,9 +34,9 @@ class WebLoader(LoaderInterface):
Returns: Returns:
String identifier used for registration and configuration String identifier used for registration and configuration
""" """
raise NotImplementedError return "web_url_loader"
def can_handle(self, extension: str, mime_type: str) -> bool: def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
""" """
Check if this loader can handle the given file. Check if this loader can handle the given file.
@@ -44,7 +47,9 @@ class WebLoader(LoaderInterface):
Returns: Returns:
True if this loader can process the file, False otherwise True if this loader can process the file, False otherwise
""" """
raise NotImplementedError if data_item_path is None:
raise # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
return data_item_path.startswith(("http://", "https://"))
async def load(self, file_path: str, **kwargs): async def load(self, file_path: str, **kwargs):
""" """
@@ -58,4 +63,38 @@ class WebLoader(LoaderInterface):
Raises: Raises:
Exception: If file cannot be processed Exception: If file cannot be processed
""" """
try:
from cognee.context_global_variables import tavily_config, soup_crawler_config
from cognee.tasks.web_scraper import fetch_page_content
tavily = tavily_config.get()
soup_crawler = soup_crawler_config.get()
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
if preferred_tool == "tavily" and tavily is None:
raise IngestionError(
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
)
if preferred_tool == "beautifulsoup" and soup_crawler is None:
raise IngestionError(
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
)
data = await fetch_page_content(
file_path,
preferred_tool=preferred_tool,
tavily_config=tavily,
soup_crawler_config=soup_crawler,
)
content = ""
for key, value in data.items():
content += f"{key}:\n{value}\n\n"
await save_data_to_file(content)
return content
except IngestionError:
raise
except Exception as e:
raise IngestionError(
message=f"Error ingesting webpage results of url {file_path}: {str(e)}"
)
raise NotImplementedError raise NotImplementedError

View file

@@ -23,3 +23,10 @@ try:
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
except ImportError: except ImportError:
pass pass
try:
from cognee.infrastructure.loaders.external import WebUrlLoader
supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader
except ImportError:
pass