extend LoaderInterface to support web_url_loader, implement load()

Daulet Amirkhanov 2025-10-16 12:40:07 +01:00
parent 305969c61b
commit d884867d2c
5 changed files with 71 additions and 9 deletions


@@ -64,7 +64,7 @@ class LoaderEngine:
return True
def get_loader(
self, file_path: str, preferred_loaders: List[str] = None
self, data_item_path: str, preferred_loaders: List[str] = None
) -> Optional[LoaderInterface]:
"""
Get appropriate loader for a file.
@@ -77,20 +77,26 @@ class LoaderEngine:
LoaderInterface that can handle the file, or None if not found
"""
file_info = filetype.guess(file_path)
file_info = filetype.guess(data_item_path)
# Try preferred loaders first
if preferred_loaders:
for loader_name in preferred_loaders:
if loader_name in self._loaders:
loader = self._loaders[loader_name]
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
if loader.can_handle(
extension=file_info.extension,
mime_type=file_info.mime,
data_item_path=data_item_path,
): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time
return loader
else:
logger.info(f"Skipping {loader_name}: Preferred Loader not registered")
# Try default priority order
for loader_name in self.default_loader_priority:
for loader_name in (
self.default_loader_priority
): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review
if loader_name in self._loaders:
loader = self._loaders[loader_name]
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
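For reference, a minimal sketch of what the extended dispatch looks like from a loader's point of view. Only the keyword names come from this diff; the metadata values and the direct instantiation of WebUrlLoader are illustrative:

from cognee.infrastructure.loaders.external import WebUrlLoader

loader = WebUrlLoader()

# The engine now forwards the original data item path alongside the
# filetype-derived metadata, so path-aware loaders can match on it.
matches = loader.can_handle(
    extension="html",        # illustrative value, as returned by filetype.guess
    mime_type="text/html",   # illustrative value, as returned by filetype.guess
    data_item_path="https://example.com/docs/intro",
)
# matches is True: WebUrlLoader matches purely on the http(s) prefix.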


@@ -44,7 +44,9 @@ class LoaderInterface(ABC):
pass
@abstractmethod
def can_handle(self, extension: str, mime_type: str) -> bool:
def can_handle(
self, extension: str, mime_type: str, data_item_path: str = None
) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
"""
Check if this loader can handle the given file.
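Because the new parameter defaults to None, loaders that do not care about the original path only need to accept the extra keyword; a hypothetical loader method, shown for illustration and not part of this commit:

def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
    # Path-agnostic loaders can ignore data_item_path and keep their existing checks.
    return extension in self.supported_extensions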


@@ -10,6 +10,7 @@ These loaders are optional and only available if their dependencies are installe
"""
from .pypdf_loader import PyPdfLoader
from .web_url_loader import WebUrlLoader
__all__ = ["PyPdfLoader"]
@@ -27,3 +28,10 @@ try:
__all__.append("AdvancedPdfLoader")
except ImportError:
pass
try:
from .web_url_loader import WebUrlLoader
__all__.append("WebUrlLoader")
except ImportError:
pass
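As with the other external loaders, the import is wrapped so the package still loads when the web-scraping extras are missing; a small consumer-side sketch of the same guarded-import pattern (the None fallback is illustrative):

try:
    from cognee.infrastructure.loaders.external import WebUrlLoader
except ImportError:
    WebUrlLoader = None  # optional web-scraping dependencies are not installed

url_loading_available = WebUrlLoader is not None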


@@ -1,8 +1,11 @@
from cognee.infrastructure.loaders import LoaderInterface
from typing import List
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.modules.ingestion import save_data_to_file
class WebLoader(LoaderInterface):
class WebUrlLoader(LoaderInterface):
@property
def supported_extensions(self) -> List[str]:
"""
@@ -31,9 +34,9 @@ class WebLoader(LoaderInterface):
Returns:
String identifier used for registration and configuration
"""
raise NotImplementedError
return "web_url_loader"
def can_handle(self, extension: str, mime_type: str) -> bool:
def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
"""
Check if this loader can handle the given file.
@@ -44,7 +47,9 @@ class WebLoader(LoaderInterface):
Returns:
True if this loader can process the file, False otherwise
"""
raise NotImplementedError
if data_item_path is None:
raise ValueError("WebUrlLoader.can_handle requires data_item_path") # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
return data_item_path.startswith(("http://", "https://"))
async def load(self, file_path: str, **kwargs):
"""
@@ -58,4 +63,38 @@ class WebLoader(LoaderInterface):
Raises:
Exception: If file cannot be processed
"""
try:
from cognee.context_global_variables import tavily_config, soup_crawler_config
from cognee.tasks.web_scraper import fetch_page_content
tavily = tavily_config.get()
soup_crawler = soup_crawler_config.get()
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
if preferred_tool == "tavily" and tavily is None:
raise IngestionError(
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
)
if preferred_tool == "beautifulsoup" and soup_crawler is None:
raise IngestionError(
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
)
data = await fetch_page_content(
file_path,
preferred_tool=preferred_tool,
tavily_config=tavily,
soup_crawler_config=soup_crawler,
)
content = ""
for key, value in data.items():
content += f"{key}:\n{value}\n\n"
await save_data_to_file(content)
return content
except IngestionError:
raise
except Exception as e:
raise IngestionError(
message=f"Error ingesting webpage results of url {file_path}: {str(e)}"
)
raise NotImplementedError
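A minimal sketch of driving the new load() path from an async context; the URL is illustrative, and a TavilyConfig or SoupCrawlerConfig must already be set on the ingestion context (otherwise load() raises IngestionError as shown above):

import asyncio
from cognee.infrastructure.loaders.external import WebUrlLoader

async def ingest_url(url: str) -> str:
    loader = WebUrlLoader()
    # Delegates to fetch_page_content and flattens the result into
    # "key:\nvalue\n\n" blocks before saving it via save_data_to_file.
    return await loader.load(url)

# asyncio.run(ingest_url("https://example.com/article"))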


@@ -23,3 +23,10 @@ try:
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
except ImportError:
pass
try:
from cognee.infrastructure.loaders.external import WebUrlLoader
supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader
except ImportError:
pass
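Once registered, the loader can be resolved from the supported_loaders mapping under the same key used at registration time; an illustrative lookup (the registry's module path is assumed here, only the registration itself appears in this diff):

from cognee.infrastructure.loaders.external import WebUrlLoader
from cognee.infrastructure.loaders.supported_loaders import supported_loaders  # assumed module path

# Look the class up by the same key it was registered under.
loader_cls = supported_loaders.get(WebUrlLoader.loader_name)
assert loader_cls is WebUrlLoader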