fix: ensure web urls correctly go through ingest_data and reach loaders

2025-10-16 12:16:37 +01:00 · 2025-10-16 12:16:37 +01:00 · 95106d5914
commit 95106d5914
parent 9395539868
4 changed files with 11 additions and 36 deletions
--- a/cognee/infrastructure/files/utils/get_data_file_path.py
+++ b/cognee/infrastructure/files/utils/get_data_file_path.py
@@ -38,6 +38,9 @@ def get_data_file_path(file_path: str):
        return normalized_url
+    elif file_path.startswith(("http://", "https://")):
+        return file_path
    else:
        # Regular file path - normalize separators
        normalized_path = os.path.normpath(file_path)
--- a/cognee/infrastructure/loaders/external/WebLoader.py
+++ b/cognee/infrastructure/loaders/external/WebLoader.py
@@ -11,7 +11,7 @@ class WebLoader(LoaderInterface):
        Returns:
            List of extensions including the dot (e.g., ['.txt', '.md'])
        """
-        raise NotImplementedError
+        return []  # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality
    @property
    def supported_mime_types(self) -> List[str]:
@@ -21,7 +21,7 @@ class WebLoader(LoaderInterface):
        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
        """
-        raise NotImplementedError
+        return []  # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality
    @property
    def loader_name(self) -> str:
--- a/cognee/tasks/ingestion/data_item_to_text_file.py
+++ b/cognee/tasks/ingestion/data_item_to_text_file.py
@@ -75,5 +75,10 @@ async def data_item_to_text_file(
            else:
                raise IngestionError(message="Local files are not accepted.")
+        elif data_item_path.startswith(("http://", "https://")):
+            loader = get_loader_engine()
+            return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
+                data_item_path, preferred_loaders
+            )
    # data is not a supported type
    raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -63,40 +63,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
        if parsed_url.scheme == "s3":
            return data_item
        elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-            # Validate URL by sending a HEAD request
+            return data_item
-            try:
-                from cognee.context_global_variables import tavily_config, soup_crawler_config
-                from cognee.tasks.web_scraper import fetch_page_content
-                tavily = tavily_config.get()
-                soup_crawler = soup_crawler_config.get()
-                preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
-                if preferred_tool == "tavily" and tavily is None:
-                    raise IngestionError(
-                        message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
-                    )
-                if preferred_tool == "beautifulsoup" and soup_crawler is None:
-                    raise IngestionError(
-                        message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
-                    )
-                data = await fetch_page_content(
-                    data_item,
-                    preferred_tool=preferred_tool,
-                    tavily_config=tavily,
-                    soup_crawler_config=soup_crawler,
-                )
-                content = ""
-                for key, value in data.items():
-                    content += f"{key}:\n{value}\n\n"
-                return await save_data_to_file(content)
-            except IngestionError:
-                raise
-            except Exception as e:
-                raise IngestionError(
-                    message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
-                )
        # data is local file path
        elif parsed_url.scheme == "file":
            if settings.accept_local_file_path: