fix: ensure web urls correctly go through ingest_data and reach loaders
parent 9395539868
commit 95106d5914
4 changed files with 11 additions and 36 deletions
@@ -38,6 +38,9 @@ def get_data_file_path(file_path: str):
        return normalized_url

    elif file_path.startswith(("http://", "https://")):
        return file_path

    else:
        # Regular file path - normalize separators
        normalized_path = os.path.normpath(file_path)
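Taken on its own, this hunk makes get_data_file_path hand web URLs back untouched instead of running them through path normalization. A minimal standalone sketch of that branching, assuming nothing beyond what the hunk shows (the normalized_url branch above it is omitted, and the helper name here is illustrative only):

import os


def get_data_file_path_sketch(file_path: str) -> str:
    # Web URLs are returned as-is so normalization cannot collapse the
    # "//" after the scheme or rewrite the separators.
    if file_path.startswith(("http://", "https://")):
        return file_path
    # Regular file path - normalize separators, as in the hunk above.
    return os.path.normpath(file_path)


print(get_data_file_path_sketch("https://example.com/docs/page"))  # returned unchanged
print(get_data_file_path_sketch("data//files/../notes.txt"))       # normalized local path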
@@ -11,7 +11,7 @@ class WebLoader(LoaderInterface):
        Returns:
            List of extensions including the dot (e.g., ['.txt', '.md'])
        """
        raise NotImplementedError
        return []  # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality

    @property
    def supported_mime_types(self) -> List[str]:
@@ -21,7 +21,7 @@ class WebLoader(LoaderInterface):
        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
        """
        raise NotImplementedError
        return []  # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality

    @property
    def loader_name(self) -> str:
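The two WebLoader hunks each swap a single line (the -11,7 +11,7 and -21,7 +21,7 counts): the raise NotImplementedError stubs appear to give way to return [], so the loader can be registered even though file extensions and MIME types are meaningless for URLs. A hedged, self-contained sketch of that shape (the first hunk's docstring suggests a supported_extensions-style property; LoaderInterface itself is not reproduced here):

from typing import List


class WebLoaderSketch:
    # Illustrative stand-in for the real WebLoader; only the two properties
    # touched by this commit are shown.

    @property
    def supported_extensions(self) -> List[str]:
        # Empty on purpose: the value is only read by register() /
        # get_loader_info() and does not affect how URLs are loaded.
        return []

    @property
    def supported_mime_types(self) -> List[str]:
        # Same reasoning as above.
        return []


loader = WebLoaderSketch()
print(loader.supported_extensions, loader.supported_mime_types)  # [] []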
@@ -75,5 +75,10 @@ async def data_item_to_text_file(
        else:
            raise IngestionError(message="Local files are not accepted.")

    elif data_item_path.startswith(("http://", "https://")):
        loader = get_loader_engine()
        return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
            data_item_path, preferred_loaders
        )
    # data is not a supported type
    raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
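The hunk above adds an http(s) branch to data_item_to_text_file, so web URLs are handed to the loader engine rather than falling through to the unsupported-type error. A runnable sketch of that dispatch with a stubbed engine (the IngestionError class, the engine, and its return values here are stand-ins, not cognee's actual API):

import asyncio


class IngestionError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class StubLoaderEngine:
    # Stand-in for whatever get_loader_engine() returns; a real loader
    # would fetch the URL and extract text from it.
    async def load_file(self, path, preferred_loaders=None):
        return f"<text extracted from {path}>"

    def get_loader(self, path, preferred_loaders=None):
        return "WebLoader"


async def data_item_to_text_file_sketch(data_item_path: str, preferred_loaders=None):
    if data_item_path.startswith(("http://", "https://")):
        loader = StubLoaderEngine()
        # Mirrors the added branch: return both the loaded content and the
        # loader that was selected for this URL.
        return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
            data_item_path, preferred_loaders
        )
    # data is not a supported type
    raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")


print(asyncio.run(data_item_to_text_file_sketch("https://example.com")))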
@@ -63,40 +63,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
    if parsed_url.scheme == "s3":
        return data_item
    elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
        # Validate URL by sending a HEAD request
        try:
            from cognee.context_global_variables import tavily_config, soup_crawler_config
            from cognee.tasks.web_scraper import fetch_page_content

            tavily = tavily_config.get()
            soup_crawler = soup_crawler_config.get()
            preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
            if preferred_tool == "tavily" and tavily is None:
                raise IngestionError(
                    message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
                )
            if preferred_tool == "beautifulsoup" and soup_crawler is None:
                raise IngestionError(
                    message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
                )

            data = await fetch_page_content(
                data_item,
                preferred_tool=preferred_tool,
                tavily_config=tavily,
                soup_crawler_config=soup_crawler,
            )
            content = ""
            for key, value in data.items():
                content += f"{key}:\n{value}\n\n"
            return await save_data_to_file(content)
        except IngestionError:
            raise
        except Exception as e:
            raise IngestionError(
                message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
            )

        return data_item
    # data is local file path
    elif parsed_url.scheme == "file":
        if settings.accept_local_file_path:
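With the Tavily/BeautifulSoup fetch removed, the hunk counts (-63,40 +63,7) suggest the http(s) branch of save_data_item_to_storage now simply returns the URL, leaving the actual page fetch to the loaders later in the pipeline. A hedged sketch of the surviving control flow (the real function is async and handles local files via settings.accept_local_file_path; this synchronous version simplifies both):

from urllib.parse import urlparse


def save_data_item_to_storage_sketch(data_item: str) -> str:
    parsed_url = urlparse(data_item)
    if parsed_url.scheme == "s3":
        return data_item
    elif parsed_url.scheme in ("http", "https"):
        # The in-place scraping that used to live here was removed; the URL
        # itself is now the stored reference and loaders fetch it later.
        return data_item
    elif parsed_url.scheme == "file":
        # assumption: local-file handling follows here in the real function
        raise NotImplementedError("local file handling not shown in this sketch")
    raise ValueError(f"Unsupported scheme: {parsed_url.scheme!r}")


print(save_data_item_to_storage_sketch("https://example.com/article"))  # returns the URL
print(save_data_item_to_storage_sketch("s3://bucket/key"))              # returns the URI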