From fc660b46bb13fcaf901a830636a3aed73c4c9065 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 21:50:10 +0100 Subject: [PATCH] remove web_url_loader since there is no logic post fetching for loader --- .../loaders/external/__init__.py | 7 -- .../loaders/external/web_url_loader.py | 73 ------------------- .../tasks/ingestion/data_item_to_text_file.py | 11 --- .../web_url_crawler/test_loader_engine.py | 20 ----- ..._example.py => web_url_fetcher_example.py} | 7 +- 5 files changed, 3 insertions(+), 115 deletions(-) delete mode 100644 cognee/infrastructure/loaders/external/web_url_loader.py delete mode 100644 cognee/tests/integration/web_url_crawler/test_loader_engine.py rename examples/python/{web_url_loader_example.py => web_url_fetcher_example.py} (80%) diff --git a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index 2790a7ea0..6bf9f9200 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -27,10 +27,3 @@ try: __all__.append("AdvancedPdfLoader") except ImportError: pass - -try: - from .web_url_loader import WebUrlLoader - - __all__.append("WebUrlLoader") -except ImportError: - pass diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py deleted file mode 100644 index 996f7dae6..000000000 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ /dev/null @@ -1,73 +0,0 @@ -from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface -from typing import List - -from cognee.modules.ingestion.exceptions.exceptions import IngestionError -from cognee.shared.logging_utils import get_logger - -logger = get_logger() - - -class WebUrlLoader(LoaderInterface): - @property - def supported_extensions(self) -> List[str]: - """ - List of file extensions this loader supports. - - Returns: - List of extensions including the dot (e.g., ['.txt', '.md']) - """ - return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality - - @property - def supported_mime_types(self) -> List[str]: - """ - List of MIME types this loader supports. - - Returns: - List of MIME type strings (e.g., ['text/plain', 'application/pdf']) - """ - return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality - - @property - def loader_name(self) -> str: - """ - Unique name identifier for this loader. - - Returns: - String identifier used for registration and configuration - """ - return "web_url_loader" - - def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool: - """ - Check if this loader can handle the given file. - - Args: - extension: File extension - mime_type: MIME type of the file - - Returns: - True if this loader can process the file, False otherwise - """ - if data_item_path is None: - raise IngestionError( - "data_item_path should not be None" - ) # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py - return data_item_path.startswith(("http://", "https://")) - - async def load(self, file_path: str, **kwargs): - """ - Load and process the file, returning standardized result. - - Args: - file_path: Path to the file to be processed (already saved by fetcher) - file_stream: If file stream is provided it will be used to process file instead - **kwargs: Additional loader-specific configuration - - Returns: - file path to the stored file - Raises: - Exception: If file cannot be processed - """ - - return file_path diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 211b918ae..8d2e915b0 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -76,16 +76,5 @@ async def data_item_to_text_file( ) else: raise IngestionError(message="Local files are not accepted.") - - elif data_item_path.startswith(("http://", "https://")): - loader = get_loader_engine() - return ( - await loader.load_file( - data_item_path, - preferred_loaders, - loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal - ), - loader.get_loader(data_item_path, preferred_loaders), - ) # data is not a supported type raise IngestionError(message=f"Data type not supported: {type(data_item_path)}") diff --git a/cognee/tests/integration/web_url_crawler/test_loader_engine.py b/cognee/tests/integration/web_url_crawler/test_loader_engine.py deleted file mode 100644 index 018c034e1..000000000 --- a/cognee/tests/integration/web_url_crawler/test_loader_engine.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from cognee.infrastructure.loaders import get_loader_engine -from cognee.infrastructure.loaders.external.web_url_loader import WebUrlLoader - - -def test_get_loader_returns_none_by_default_for_web_urls(): - loader_engine = get_loader_engine() - urls = ["https://cognee.ai", "http://cognee.ai"] - for url in urls: - loader = loader_engine.get_loader(url) - assert loader is None - - -def test_get_loader_returns_valid_loader_when_preferred_loaders_specified(): - loader_engine = get_loader_engine() - urls = ["https://cognee.ai", "http://cognee.ai"] - for url in urls: - loader = loader_engine.get_loader(url, preferred_loaders=["web_url_loader"]) - assert isinstance(loader, WebUrlLoader) diff --git a/examples/python/web_url_loader_example.py b/examples/python/web_url_fetcher_example.py similarity index 80% rename from examples/python/web_url_loader_example.py rename to examples/python/web_url_fetcher_example.py index 37dd2258c..9ac099e16 100644 --- a/examples/python/web_url_loader_example.py +++ b/examples/python/web_url_fetcher_example.py @@ -20,7 +20,7 @@ async def main(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { + fetchers_config = { "web_url_loader": { "soup_config": { "max_depth": 1, @@ -32,9 +32,8 @@ async def main(): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], - incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix - loaders_config=loaders_config, + incremental_loading=False, + fetchers_config=fetchers_config, ) await cognee.cognify()