diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py
index 6b62f7641..af6b53e93 100644
--- a/cognee/infrastructure/loaders/LoaderEngine.py
+++ b/cognee/infrastructure/loaders/LoaderEngine.py
@@ -64,7 +64,7 @@ class LoaderEngine:
         return True
 
     def get_loader(
-        self, file_path: str, preferred_loaders: List[str] = None
+        self, data_item_path: str, preferred_loaders: List[str] = None
     ) -> Optional[LoaderInterface]:
         """
         Get appropriate loader for a file.
@@ -77,20 +77,26 @@ class LoaderEngine:
             LoaderInterface that can handle the file, or None if not found
         """
 
-        file_info = filetype.guess(file_path)
+        file_info = filetype.guess(data_item_path)
 
         # Try preferred loaders first
         if preferred_loaders:
             for loader_name in preferred_loaders:
                 if loader_name in self._loaders:
                     loader = self._loaders[loader_name]
-                    if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
+                    if loader.can_handle(
+                        extension=file_info.extension,
+                        mime_type=file_info.mime,
+                        data_item_path=data_item_path,
+                    ):  # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time
                         return loader
                 else:
                     logger.info(f"Skipping {loader_name}: Preferred Loader not registered")
 
         # Try default priority order
-        for loader_name in self.default_loader_priority:
+        for loader_name in (
+            self.default_loader_priority
+        ):  # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review
             if loader_name in self._loaders:
                 loader = self._loaders[loader_name]
                 if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
diff --git a/cognee/infrastructure/loaders/LoaderInterface.py b/cognee/infrastructure/loaders/LoaderInterface.py
index 3a1c9bf3e..fb309304b 100644
--- a/cognee/infrastructure/loaders/LoaderInterface.py
+++ b/cognee/infrastructure/loaders/LoaderInterface.py
@@ -44,7 +44,9 @@ class LoaderInterface(ABC):
         pass
 
     @abstractmethod
-    def can_handle(self, extension: str, mime_type: str) -> bool:
+    def can_handle(
+        self, extension: str, mime_type: str, data_item_path: str = None
+    ) -> bool:  # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
         """
         Check if this loader can handle the given file.
 
diff --git a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py
index 6bf9f9200..b92d9e7f0 100644
--- a/cognee/infrastructure/loaders/external/__init__.py
+++ b/cognee/infrastructure/loaders/external/__init__.py
@@ -27,3 +27,10 @@ try:
     __all__.append("AdvancedPdfLoader")
 except ImportError:
     pass
+
+try:
+    from .web_url_loader import WebUrlLoader
+
+    __all__.append("WebUrlLoader")
+except ImportError:
+    pass
diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py
index db24c86e6..4d519d443 100644
--- a/cognee/infrastructure/loaders/external/web_url_loader.py
+++ b/cognee/infrastructure/loaders/external/web_url_loader.py
@@ -1,8 +1,11 @@
 from cognee.infrastructure.loaders import LoaderInterface
 from typing import List
 
+from cognee.modules.ingestion.exceptions.exceptions import IngestionError
+from cognee.modules.ingestion import save_data_to_file
 
-class WebLoader(LoaderInterface):
+
+class WebUrlLoader(LoaderInterface):
     @property
     def supported_extensions(self) -> List[str]:
         """
@@ -31,9 +34,9 @@ class WebLoader(LoaderInterface):
         Returns:
             String identifier used for registration and configuration
         """
-        raise NotImplementedError
+        return "web_url_loader"
 
-    def can_handle(self, extension: str, mime_type: str) -> bool:
+    def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
         """
         Check if this loader can handle the given file.
 
@@ -44,7 +47,11 @@ class WebLoader(LoaderInterface):
         Returns:
             True if this loader can process the file, False otherwise
         """
-        raise NotImplementedError
+        # TODO: data_item_path temporarily defaults to None so that other loaders need no changes yet, see TODO in LoaderEngine.py
+        if data_item_path is None:
+            # No path to inspect, so this loader cannot claim the item.
+            return False
+        return data_item_path.startswith(("http://", "https://"))
 
     async def load(self, file_path: str, **kwargs):
         """
@@ -58,4 +65,32 @@ class WebLoader(LoaderInterface):
         Raises:
             Exception: If file cannot be processed
         """
+        try:
+            from cognee.context_global_variables import tavily_config, soup_crawler_config
+            from cognee.tasks.web_scraper import fetch_page_content
+
+            tavily = tavily_config.get()
+            soup_crawler = soup_crawler_config.get()
+            # BeautifulSoup is used whenever a crawler config is present; Tavily is the fallback.
+            preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
+            if preferred_tool == "tavily" and tavily is None:
+                raise IngestionError(
+                    message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
+                )
+
+            data = await fetch_page_content(
+                file_path,
+                preferred_tool=preferred_tool,
+                tavily_config=tavily,
+                soup_crawler_config=soup_crawler,
+            )
+            content = "".join(f"{key}:\n{value}\n\n" for key, value in data.items())
+            await save_data_to_file(content)
+
+            return content
+        except IngestionError:
+            raise
+        except Exception as e:
+            raise IngestionError(
+                message=f"Error ingesting webpage results of url {file_path}: {str(e)}"
+            ) from e
-        raise NotImplementedError
diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py
index d103babe3..7f92aa36a 100644
--- a/cognee/infrastructure/loaders/supported_loaders.py
+++ b/cognee/infrastructure/loaders/supported_loaders.py
@@ -23,3 +23,10 @@ try:
     supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
 except ImportError:
     pass
+
+try:
+    from cognee.infrastructure.loaders.external import WebUrlLoader
+
+    supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader
+except ImportError:
+    pass