From 9395539868155cbb1402f1cd77a8f5f7c5f478cd Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 11:52:16 +0100 Subject: [PATCH 01/44] feat: interface for WebLoader --- .../loaders/external/WebLoader.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 cognee/infrastructure/loaders/external/WebLoader.py diff --git a/cognee/infrastructure/loaders/external/WebLoader.py b/cognee/infrastructure/loaders/external/WebLoader.py new file mode 100644 index 000000000..609ade2e0 --- /dev/null +++ b/cognee/infrastructure/loaders/external/WebLoader.py @@ -0,0 +1,61 @@ +from cognee.infrastructure.loaders import LoaderInterface +from typing import List + + +class WebLoader(LoaderInterface): + @property + def supported_extensions(self) -> List[str]: + """ + List of file extensions this loader supports. + + Returns: + List of extensions including the dot (e.g., ['.txt', '.md']) + """ + raise NotImplementedError + + @property + def supported_mime_types(self) -> List[str]: + """ + List of MIME types this loader supports. + + Returns: + List of MIME type strings (e.g., ['text/plain', 'application/pdf']) + """ + raise NotImplementedError + + @property + def loader_name(self) -> str: + """ + Unique name identifier for this loader. + + Returns: + String identifier used for registration and configuration + """ + raise NotImplementedError + + def can_handle(self, extension: str, mime_type: str) -> bool: + """ + Check if this loader can handle the given file. + + Args: + extension: File extension + mime_type: MIME type of the file + + Returns: + True if this loader can process the file, False otherwise + """ + raise NotImplementedError + + async def load(self, file_path: str, **kwargs): + """ + Load and process the file, returning standardized result. 
+ + Args: + file_path: Path to the file to be processed + file_stream: If file stream is provided it will be used to process file instead + **kwargs: Additional loader-specific configuration + + Raises: + Exception: If file cannot be processed + """ + raise NotImplementedError From 95106d5914a2fe47f93b67896e404e6a41f39430 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 12:16:37 +0100 Subject: [PATCH 02/44] fix: ensure web urls correctly go through ingest_data and reach loaders --- .../files/utils/get_data_file_path.py | 3 ++ .../loaders/external/WebLoader.py | 4 +-- .../tasks/ingestion/data_item_to_text_file.py | 5 +++ .../ingestion/save_data_item_to_storage.py | 35 +------------------ 4 files changed, 11 insertions(+), 36 deletions(-) diff --git a/cognee/infrastructure/files/utils/get_data_file_path.py b/cognee/infrastructure/files/utils/get_data_file_path.py index 7ffda79bd..242d130a9 100644 --- a/cognee/infrastructure/files/utils/get_data_file_path.py +++ b/cognee/infrastructure/files/utils/get_data_file_path.py @@ -38,6 +38,9 @@ def get_data_file_path(file_path: str): return normalized_url + elif file_path.startswith(("http://", "https://")): + return file_path + else: # Regular file path - normalize separators normalized_path = os.path.normpath(file_path) diff --git a/cognee/infrastructure/loaders/external/WebLoader.py b/cognee/infrastructure/loaders/external/WebLoader.py index 609ade2e0..db24c86e6 100644 --- a/cognee/infrastructure/loaders/external/WebLoader.py +++ b/cognee/infrastructure/loaders/external/WebLoader.py @@ -11,7 +11,7 @@ class WebLoader(LoaderInterface): Returns: List of extensions including the dot (e.g., ['.txt', '.md']) """ - raise NotImplementedError + return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality @property def supported_mime_types(self) -> List[str]: @@ -21,7 +21,7 @@ class WebLoader(LoaderInterface): Returns: List of MIME type strings 
(e.g., ['text/plain', 'application/pdf']) """ - raise NotImplementedError + return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality @property def loader_name(self) -> str: diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 9fcafca57..cd722bd76 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -75,5 +75,10 @@ async def data_item_to_text_file( else: raise IngestionError(message="Local files are not accepted.") + elif data_item_path.startswith(("http://", "https://")): + loader = get_loader_engine() + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) # data is not a supported type raise IngestionError(message=f"Data type not supported: {type(data_item_path)}") diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index b6e1f7d00..d9f1beae7 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -63,40 +63,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - # Validate URL by sending a HEAD request - try: - from cognee.context_global_variables import tavily_config, soup_crawler_config - from cognee.tasks.web_scraper import fetch_page_content - - tavily = tavily_config.get() - soup_crawler = soup_crawler_config.get() - preferred_tool = "beautifulsoup" if soup_crawler else "tavily" - if preferred_tool == "tavily" and tavily is None: - raise IngestionError( - message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." 
- ) - if preferred_tool == "beautifulsoup" and soup_crawler is None: - raise IngestionError( - message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." - ) - - data = await fetch_page_content( - data_item, - preferred_tool=preferred_tool, - tavily_config=tavily, - soup_crawler_config=soup_crawler, - ) - content = "" - for key, value in data.items(): - content += f"{key}:\n{value}\n\n" - return await save_data_to_file(content) - except IngestionError: - raise - except Exception as e: - raise IngestionError( - message=f"Error ingesting webpage results of url {data_item}: {str(e)}" - ) - + return data_item # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: From 305969c61b08bdaabb4ed554c57833f89b61a005 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 12:17:43 +0100 Subject: [PATCH 03/44] refactor web_url_loader filename --- .../loaders/external/{WebLoader.py => web_url_loader.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cognee/infrastructure/loaders/external/{WebLoader.py => web_url_loader.py} (100%) diff --git a/cognee/infrastructure/loaders/external/WebLoader.py b/cognee/infrastructure/loaders/external/web_url_loader.py similarity index 100% rename from cognee/infrastructure/loaders/external/WebLoader.py rename to cognee/infrastructure/loaders/external/web_url_loader.py From d884867d2c3953d92d55da5132f29c4893c0b176 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 12:40:07 +0100 Subject: [PATCH 04/44] extend LoaderInterface to support web_url_loader, implement `load()` --- cognee/infrastructure/loaders/LoaderEngine.py | 14 ++++-- .../infrastructure/loaders/LoaderInterface.py | 4 +- .../loaders/external/__init__.py | 8 ++++ .../loaders/external/web_url_loader.py | 47 +++++++++++++++++-- .../loaders/supported_loaders.py | 7 +++ 5 files changed, 71 insertions(+), 9 deletions(-) diff --git 
a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 6b62f7641..af6b53e93 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -64,7 +64,7 @@ class LoaderEngine: return True def get_loader( - self, file_path: str, preferred_loaders: List[str] = None + self, data_item_path: str, preferred_loaders: List[str] = None ) -> Optional[LoaderInterface]: """ Get appropriate loader for a file. @@ -77,20 +77,26 @@ class LoaderEngine: LoaderInterface that can handle the file, or None if not found """ - file_info = filetype.guess(file_path) + file_info = filetype.guess(data_item_path) # Try preferred loaders first if preferred_loaders: for loader_name in preferred_loaders: if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): + if loader.can_handle( + extension=file_info.extension, + mime_type=file_info.mime, + data_item_path=data_item_path, + ): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time return loader else: logger.info(f"Skipping {loader_name}: Preferred Loader not registered") # Try default priority order - for loader_name in self.default_loader_priority: + for loader_name in ( + self.default_loader_priority + ): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review if loader_name in self._loaders: loader = self._loaders[loader_name] if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): diff --git a/cognee/infrastructure/loaders/LoaderInterface.py b/cognee/infrastructure/loaders/LoaderInterface.py index 3a1c9bf3e..fb309304b 100644 --- a/cognee/infrastructure/loaders/LoaderInterface.py +++ b/cognee/infrastructure/loaders/LoaderInterface.py @@ -44,7 +44,9 @@ class LoaderInterface(ABC): pass @abstractmethod - def can_handle(self, 
extension: str, mime_type: str) -> bool: + def can_handle( + self, extension: str, mime_type: str, data_item_path: str = None + ) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py """ Check if this loader can handle the given file. diff --git a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index 6bf9f9200..b92d9e7f0 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -10,6 +10,7 @@ These loaders are optional and only available if their dependencies are installe """ from .pypdf_loader import PyPdfLoader +from .web_url_loader import WebUrlLoader __all__ = ["PyPdfLoader"] @@ -27,3 +28,10 @@ try: __all__.append("AdvancedPdfLoader") except ImportError: pass + +try: + from .web_url_loader import WebUrlLoader + + __all__.append("WebUrlLoader") +except ImportError: + pass diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index db24c86e6..4d519d443 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -1,8 +1,11 @@ from cognee.infrastructure.loaders import LoaderInterface from typing import List +from cognee.modules.ingestion.exceptions.exceptions import IngestionError +from cognee.modules.ingestion import save_data_to_file -class WebLoader(LoaderInterface): + +class WebUrlLoader(LoaderInterface): @property def supported_extensions(self) -> List[str]: """ @@ -31,9 +34,9 @@ class WebLoader(LoaderInterface): Returns: String identifier used for registration and configuration """ - raise NotImplementedError + return "web_url_loader" - def can_handle(self, extension: str, mime_type: str) -> bool: + def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool: """ Check if this 
loader can handle the given file. @@ -44,7 +47,9 @@ class WebLoader(LoaderInterface): Returns: True if this loader can process the file, False otherwise """ - raise NotImplementedError + if data_item_path is None: + raise # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py + return data_item_path.startswith(("http://", "https://")) async def load(self, file_path: str, **kwargs): """ @@ -58,4 +63,38 @@ class WebLoader(LoaderInterface): Raises: Exception: If file cannot be processed """ + try: + from cognee.context_global_variables import tavily_config, soup_crawler_config + from cognee.tasks.web_scraper import fetch_page_content + + tavily = tavily_config.get() + soup_crawler = soup_crawler_config.get() + preferred_tool = "beautifulsoup" if soup_crawler else "tavily" + if preferred_tool == "tavily" and tavily is None: + raise IngestionError( + message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." + ) + if preferred_tool == "beautifulsoup" and soup_crawler is None: + raise IngestionError( + message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." 
+ ) + + data = await fetch_page_content( + file_path, + preferred_tool=preferred_tool, + tavily_config=tavily, + soup_crawler_config=soup_crawler, + ) + content = "" + for key, value in data.items(): + content += f"{key}:\n{value}\n\n" + await save_data_to_file(content) + + return content + except IngestionError: + raise + except Exception as e: + raise IngestionError( + message=f"Error ingesting webpage results of url {file_path}: {str(e)}" + ) raise NotImplementedError diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py index d103babe3..7f92aa36a 100644 --- a/cognee/infrastructure/loaders/supported_loaders.py +++ b/cognee/infrastructure/loaders/supported_loaders.py @@ -23,3 +23,10 @@ try: supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader except ImportError: pass + +try: + from cognee.infrastructure.loaders.external import WebUrlLoader + + supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader +except ImportError: + pass From 185600fe177783e5816a98798746dd1d658a82dc Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 13:58:02 +0100 Subject: [PATCH 05/44] revert url_crawler changes to `cognee.add()`, and update `web_url_loader.load()` --- cognee/api/v1/add/add.py | 39 ++----------------- .../loaders/external/web_url_loader.py | 36 +++++++++++------ .../tasks/ingestion/data_item_to_text_file.py | 13 +++++-- cognee/tasks/ingestion/ingest_data.py | 6 ++- 4 files changed, 42 insertions(+), 52 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 0f14683f9..3c4d7b696 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -3,6 +3,7 @@ import os from typing import Union, BinaryIO, List, Optional, Dict, Any from pydantic import BaseModel from urllib.parse import urlparse +from cognee.infrastructure.loaders import LoaderInterface from cognee.modules.users.models import User from cognee.modules.pipelines import Task, 
run_pipeline from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import ( @@ -17,16 +18,6 @@ from cognee.shared.logging_utils import get_logger logger = get_logger() -try: - from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig - from cognee.context_global_variables import ( - tavily_config as tavily, - soup_crawler_config as soup_crawler, - ) -except ImportError: - logger.debug(f"Unable to import {str(ImportError)}") - pass - async def add( data: Union[BinaryIO, list[BinaryIO], str, list[str]], @@ -38,10 +29,8 @@ async def add( dataset_id: Optional[UUID] = None, preferred_loaders: List[str] = None, incremental_loading: bool = True, - extraction_rules: Optional[Dict[str, Any]] = None, - tavily_config: Optional[BaseModel] = None, - soup_crawler_config: Optional[BaseModel] = None, data_per_batch: Optional[int] = 20, + loaders_config: dict[LoaderInterface, dict] = {}, ): """ Add data to Cognee for knowledge graph processing. @@ -180,29 +169,6 @@ async def add( - TAVILY_API_KEY: YOUR_TAVILY_API_KEY """ - - try: - if not soup_crawler_config and extraction_rules: - soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules) - if not tavily_config and os.getenv("TAVILY_API_KEY"): - tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY")) - - soup_crawler.set(soup_crawler_config) - tavily.set(tavily_config) - - http_schemes = {"http", "https"} - - def _is_http_url(item: Union[str, BinaryIO]) -> bool: - return isinstance(item, str) and urlparse(item).scheme in http_schemes - - if _is_http_url(data): - node_set = ["web_content"] if not node_set else node_set + ["web_content"] - elif isinstance(data, list) and any(_is_http_url(item) for item in data): - node_set = ["web_content"] if not node_set else node_set + ["web_content"] - except NameError: - logger.debug(f"Unable to import {str(ImportError)}") - pass - tasks = [ Task(resolve_data_directories, include_subdirectories=True), Task( @@ -212,6 +178,7 @@ async def 
add( node_set, dataset_id, preferred_loaders, + loaders_config, ), ] diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 4d519d443..5e0cf07f1 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -48,7 +48,9 @@ class WebUrlLoader(LoaderInterface): True if this loader can process the file, False otherwise """ if data_item_path is None: - raise # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py + raise IngestionError( + "data_item_path should not be None" + ) # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py return data_item_path.startswith(("http://", "https://")) async def load(self, file_path: str, **kwargs): @@ -63,18 +65,31 @@ class WebUrlLoader(LoaderInterface): Raises: Exception: If file cannot be processed """ + loaders_config = kwargs.get("loaders_config") + if not isinstance(loaders_config, dict): + raise IngestionError("loaders_config must be a valid dictionary") + + web_url_loader_config = loaders_config.get(self.loader_name) + if not isinstance(web_url_loader_config, dict): + raise IngestionError(f"{self.loader_name} configuration must be a valid dictionary") + try: from cognee.context_global_variables import tavily_config, soup_crawler_config from cognee.tasks.web_scraper import fetch_page_content - tavily = tavily_config.get() - soup_crawler = soup_crawler_config.get() - preferred_tool = "beautifulsoup" if soup_crawler else "tavily" - if preferred_tool == "tavily" and tavily is None: + _tavily_config = web_url_loader_config.get("tavily_config") + _soup_config = web_url_loader_config.get("soup_config") + + # Set global configs for downstream access + tavily_config.set(_tavily_config) + soup_crawler_config.set(_soup_config) + + 
preferred_tool = "beautifulsoup" if _soup_config else "tavily" + if preferred_tool == "tavily" and _tavily_config is None: raise IngestionError( message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." ) - if preferred_tool == "beautifulsoup" and soup_crawler is None: + if preferred_tool == "beautifulsoup" and _soup_config is None: raise IngestionError( message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." ) @@ -82,8 +97,8 @@ class WebUrlLoader(LoaderInterface): data = await fetch_page_content( file_path, preferred_tool=preferred_tool, - tavily_config=tavily, - soup_crawler_config=soup_crawler, + tavily_config=_tavily_config, + soup_crawler_config=_soup_config, ) content = "" for key, value in data.items(): @@ -94,7 +109,4 @@ class WebUrlLoader(LoaderInterface): except IngestionError: raise except Exception as e: - raise IngestionError( - message=f"Error ingesting webpage results of url {file_path}: {str(e)}" - ) - raise NotImplementedError + raise IngestionError(message=f"Error ingesting webpage from URL {file_path}: {str(e)}") diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index cd722bd76..91d09059a 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -34,7 +34,9 @@ async def pull_from_s3(file_path, destination_file) -> None: async def data_item_to_text_file( - data_item_path: str, preferred_loaders: List[str] + data_item_path: str, + preferred_loaders: List[str], + loaders_config: dict[LoaderInterface, dict], ) -> Tuple[str, LoaderInterface]: if isinstance(data_item_path, str): parsed_url = urlparse(data_item_path) @@ -77,8 +79,13 @@ async def data_item_to_text_file( elif data_item_path.startswith(("http://", "https://")): loader = get_loader_engine() - return await loader.load_file(data_item_path, preferred_loaders), 
loader.get_loader( - data_item_path, preferred_loaders + return ( + await loader.load_file( + data_item_path, + preferred_loaders, + loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal + ), + loader.get_loader(data_item_path, preferred_loaders), ) # data is not a supported type raise IngestionError(message=f"Data type not supported: {type(data_item_path)}") diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 3c20a2b13..3fb161181 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -3,6 +3,7 @@ import inspect from uuid import UUID from typing import Union, BinaryIO, Any, List, Optional +from cognee.infrastructure.loaders import LoaderInterface import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -28,6 +29,7 @@ async def ingest_data( node_set: Optional[List[str]] = None, dataset_id: UUID = None, preferred_loaders: List[str] = None, + loaders_config: dict[LoaderInterface, dict] = {}, ): if not user: user = await get_default_user() @@ -85,7 +87,9 @@ async def ingest_data( # Store all input data as text files in Cognee data storage cognee_storage_file_path, loader_engine = await data_item_to_text_file( - actual_file_path, preferred_loaders + actual_file_path, + preferred_loaders, + loaders_config, ) # Find metadata from original file From 9a9f9f6836859db629024ffedce34cefba8700a9 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 13:58:38 +0100 Subject: [PATCH 06/44] tests: add some tests to assert behaviour is as expected --- .../integration/web_url_crawler/test_add.py | 26 +++++++++++++++++++ .../web_url_crawler/test_loader_engine.py | 20 ++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 cognee/tests/integration/web_url_crawler/test_add.py create mode 100644 
cognee/tests/integration/web_url_crawler/test_loader_engine.py diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py new file mode 100644 index 000000000..b9840df3d --- /dev/null +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -0,0 +1,26 @@ +import pytest +import cognee + + +@pytest.mark.asyncio +async def test_add_fails_when_preferred_loader_not_specified(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + with pytest.raises: + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + ) + + +@pytest.mark.asyncio +async def test_add_succesfully_adds_url_when_preferred_loader_specified(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + try: + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + ) + except Exception as e: + pytest.fail(f"Failed to add url: {e}") diff --git a/cognee/tests/integration/web_url_crawler/test_loader_engine.py b/cognee/tests/integration/web_url_crawler/test_loader_engine.py new file mode 100644 index 000000000..018c034e1 --- /dev/null +++ b/cognee/tests/integration/web_url_crawler/test_loader_engine.py @@ -0,0 +1,20 @@ +import pytest + +from cognee.infrastructure.loaders import get_loader_engine +from cognee.infrastructure.loaders.external.web_url_loader import WebUrlLoader + + +def test_get_loader_returns_none_by_default_for_web_urls(): + loader_engine = get_loader_engine() + urls = ["https://cognee.ai", "http://cognee.ai"] + for url in urls: + loader = loader_engine.get_loader(url) + assert loader is None + + +def test_get_loader_returns_valid_loader_when_preferred_loaders_specified(): + loader_engine = get_loader_engine() + urls = ["https://cognee.ai", "http://cognee.ai"] + for url in urls: + loader = loader_engine.get_loader(url, preferred_loaders=["web_url_loader"]) 
+ assert isinstance(loader, WebUrlLoader) From 36364285b27d2305c4e88ed6d323a6e934d6c59d Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 15:53:49 +0100 Subject: [PATCH 07/44] tests: fix failing tests --- cognee/infrastructure/loaders/LoaderEngine.py | 15 +++++++++---- .../loaders/external/web_url_loader.py | 2 +- .../tasks/ingestion/data_item_to_text_file.py | 21 ++++++++++--------- .../integration/web_url_crawler/test_add.py | 16 ++++++++++++-- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index af6b53e93..d6c4d4d8c 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -76,8 +76,15 @@ class LoaderEngine: Returns: LoaderInterface that can handle the file, or None if not found """ + is_url = data_item_path.startswith(("http://", "https://")) - file_info = filetype.guess(data_item_path) + if is_url: + extension = None + mime_type = None + else: + file_info = filetype.guess(data_item_path) + extension = file_info.extension if file_info else None + mime_type = file_info.mime if file_info else None # Try preferred loaders first if preferred_loaders: @@ -85,8 +92,8 @@ class LoaderEngine: if loader_name in self._loaders: loader = self._loaders[loader_name] if loader.can_handle( - extension=file_info.extension, - mime_type=file_info.mime, + extension=extension, + mime_type=mime_type, data_item_path=data_item_path, ): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time return loader @@ -99,7 +106,7 @@ class LoaderEngine: ): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): + if loader.can_handle(extension=extension, 
mime_type=mime_type): return loader else: logger.info( diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 5e0cf07f1..38bca2523 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -1,4 +1,4 @@ -from cognee.infrastructure.loaders import LoaderInterface +from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 91d09059a..f82d9a0dc 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -50,17 +50,17 @@ async def data_item_to_text_file( await pull_from_s3(data_item_path, temp_file) temp_file.flush() # Data needs to be saved to local storage loader = get_loader_engine() - return await loader.load_file(temp_file.name, preferred_loaders), loader.get_loader( - temp_file.name, preferred_loaders - ) + return await loader.load_file( + temp_file.name, None, preferred_loaders + ), loader.get_loader(temp_file.name, preferred_loaders) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( - data_item_path, preferred_loaders - ) + return await loader.load_file( + data_item_path, None, preferred_loaders + ), loader.get_loader(data_item_path, preferred_loaders) else: raise IngestionError(message="Local files are not accepted.") @@ -71,9 +71,9 @@ async def data_item_to_text_file( # Handle both Unix absolute paths (/path) and Windows absolute paths (C:\path) if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file(data_item_path, 
preferred_loaders), loader.get_loader( - data_item_path, preferred_loaders - ) + return await loader.load_file( + data_item_path, None, preferred_loaders + ), loader.get_loader(data_item_path, preferred_loaders) else: raise IngestionError(message="Local files are not accepted.") @@ -82,8 +82,9 @@ async def data_item_to_text_file( return ( await loader.load_file( data_item_path, + None, preferred_loaders, - loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal + loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal ), loader.get_loader(data_item_path, preferred_loaders), ) diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index b9840df3d..0c4332c6d 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -6,10 +6,10 @@ import cognee async def test_add_fails_when_preferred_loader_not_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - with pytest.raises: + with pytest.raises(ValueError): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], + incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. 
Will fix ) @@ -17,10 +17,22 @@ async def test_add_fails_when_preferred_loader_not_specified(): async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) + + loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + } + } + } + try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", preferred_loaders=["web_url_loader"], + incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix + loaders_config=loaders_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") From 572c8ebce745d4cd675eb522c6e500cf04102591 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 18:26:50 +0100 Subject: [PATCH 08/44] refactor: use pydantic models for tavily and beautifulsoup configs instead of dicts --- .../loaders/external/web_url_loader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 38bca2523..f9fce47a9 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -3,6 +3,7 @@ from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError from cognee.modules.ingestion import save_data_to_file +from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig class WebUrlLoader(LoaderInterface): @@ -77,8 +78,11 @@ class WebUrlLoader(LoaderInterface): from cognee.context_global_variables import tavily_config, soup_crawler_config from cognee.tasks.web_scraper import fetch_page_content - _tavily_config = web_url_loader_config.get("tavily_config") - _soup_config = web_url_loader_config.get("soup_config") + tavily_dict = web_url_loader_config.get("tavily_config") + 
_tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None + + soup_dict = web_url_loader_config.get("soup_config") + _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None # Set global configs for downstream access tavily_config.set(_tavily_config) @@ -109,4 +113,6 @@ class WebUrlLoader(LoaderInterface): except IngestionError: raise except Exception as e: - raise IngestionError(message=f"Error ingesting webpage from URL {file_path}: {str(e)}") + raise IngestionError( + message=f"Error ingesting webpage from URL {file_path}: {str(e)}" + ) from e From c0d450b165a0d19ff28bc3e7be7ef30c66926795 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 18:27:03 +0100 Subject: [PATCH 09/44] tests: fix test_add - add missing required parameter --- cognee/tests/integration/web_url_crawler/test_add.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index 0c4332c6d..e0dda94a9 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -18,11 +18,19 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) + extraction_rules = { + "title": {"selector": "title", "attr": "text"}, + "headings": {"selector": "h1, h2, h3", "attr": "text", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "attr": "text", "all": True}, + } + loaders_config = { "web_url_loader": { "soup_config": { "max_depth": 1, "follow_links": False, + "extraction_rules": extraction_rules, } } } From 2e7ff0b01ba3e25e4c50b4ea9bb02fdd3adf9c8d Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 18:28:51 +0100 Subject: [PATCH 10/44] remove reduntant HtmlContent class in save_data_item_to_storage --- 
cognee/tasks/ingestion/save_data_item_to_storage.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index d9f1beae7..5761b19ba 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -18,13 +18,6 @@ class SaveDataSettings(BaseSettings): model_config = SettingsConfigDict(env_file=".env", extra="allow") -class HTMLContent(str): - def __new__(cls, value: str): - if not ("<" in value and ">" in value): - raise ValueError("Not valid HTML-like content") - return super().__new__(cls, value) - - settings = SaveDataSettings() From d0f3e224cb07c958658082cd0dfaf6012c82b397 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 11:12:09 +0100 Subject: [PATCH 11/44] refactor ingest_data to accomodate non-FS data items --- cognee/infrastructure/files/exceptions.py | 12 +++++ .../files/utils/get_data_file_path.py | 6 ++- cognee/tasks/ingestion/ingest_data.py | 45 ++++++++++++------- .../ingestion/save_data_item_to_storage.py | 5 ++- 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/cognee/infrastructure/files/exceptions.py b/cognee/infrastructure/files/exceptions.py index 351eaee9c..eb6efdbce 100644 --- a/cognee/infrastructure/files/exceptions.py +++ b/cognee/infrastructure/files/exceptions.py @@ -11,3 +11,15 @@ class FileContentHashingError(Exception): status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, ): super().__init__(message, name, status_code) + + +class UnsupportedPathSchemeError(Exception): + """Raised when a non-filesystem path scheme (like http://, https://) is passed to a function expecting filesystem paths.""" + + def __init__( + self, + message: str = "This function only supports filesystem paths (file:// or local paths), not HTTP/HTTPS URLs.", + name: str = "UnsupportedPathSchemeError", + status_code=status.HTTP_400_BAD_REQUEST, + ): + 
super().__init__(message, name, status_code) diff --git a/cognee/infrastructure/files/utils/get_data_file_path.py b/cognee/infrastructure/files/utils/get_data_file_path.py index 242d130a9..d67fc95a0 100644 --- a/cognee/infrastructure/files/utils/get_data_file_path.py +++ b/cognee/infrastructure/files/utils/get_data_file_path.py @@ -1,6 +1,8 @@ import os from urllib.parse import urlparse +from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError + def get_data_file_path(file_path: str): # Check if this is a file URI BEFORE normalizing (which corrupts URIs) @@ -39,7 +41,9 @@ def get_data_file_path(file_path: str): return normalized_url elif file_path.startswith(("http://", "https://")): - return file_path + raise UnsupportedPathSchemeError( + message=f"HTTP/HTTPS URLs are not supported by get_data_file_path(). Received: {file_path}" + ) else: # Regular file path - normalize separators diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 3fb161181..b742e474e 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -3,6 +3,7 @@ import inspect from uuid import UUID from typing import Union, BinaryIO, Any, List, Optional +from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError from cognee.infrastructure.loaders import LoaderInterface import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine @@ -79,11 +80,16 @@ async def ingest_data( dataset_data_map = {str(data.id): True for data in dataset_data} for data_item in data: - # Get file path of data item or create a file it doesn't exist - original_file_path = await save_data_item_to_storage(data_item) - - # Transform file path to be OS usable - actual_file_path = get_data_file_path(original_file_path) + try: + # Get file path of data item or create a file if it doesn't exist + original_file_path = await 
save_data_item_to_storage(data_item) + # Transform file path to be OS usable + actual_file_path = get_data_file_path(original_file_path) + except UnsupportedPathSchemeError: + # This data_item (e.g., HTTP/HTTPS URL) should be passed directly to the loader + # skip save_data_item_to_storage and get_data_file_path + actual_file_path = data_item + original_file_path = None # we don't have an original file path # Store all input data as text files in Cognee data storage cognee_storage_file_path, loader_engine = await data_item_to_text_file( @@ -93,17 +99,26 @@ async def ingest_data( ) # Find metadata from original file - async with open_data_file(original_file_path) as file: - classified_data = ingestion.classify(file) + if original_file_path is not None: + # Standard flow: extract metadata from both original and stored files + async with open_data_file(original_file_path) as file: + classified_data = ingestion.classify(file) + data_id = ingestion.identify(classified_data, user) + original_file_metadata = classified_data.get_metadata() - # data_id is the hash of original file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) - original_file_metadata = classified_data.get_metadata() - - # Find metadata from Cognee data storage text file - async with open_data_file(cognee_storage_file_path) as file: - classified_data = ingestion.classify(file) - storage_file_metadata = classified_data.get_metadata() + async with open_data_file(cognee_storage_file_path) as file: + classified_data = ingestion.classify(file) + storage_file_metadata = classified_data.get_metadata() + else: + # Alternative flow (e.g., URLs): extract metadata once from stored file + async with open_data_file(cognee_storage_file_path) as file: + classified_data = ingestion.classify(file) + data_id = ingestion.identify(classified_data, user) + original_file_metadata = classified_data.get_metadata() + # Override file_path to be the actual data_item (e.g., URL) ? 
+ # original_file_metadata["file_path"] = actual_file_path + # Storage metadata is the same as original + # storage_file_metadata = original_file_metadata.copy() from sqlalchemy import select diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index 5761b19ba..cf32477cb 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -3,6 +3,7 @@ from pathlib import Path from urllib.parse import urlparse from typing import Union, BinaryIO, Any +from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError from cognee.modules.ingestion.exceptions import IngestionError from cognee.modules.ingestion import save_data_to_file from cognee.shared.logging_utils import get_logger @@ -56,7 +57,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - return data_item + raise UnsupportedPathSchemeError( + message=f"HTTP/HTTPS URLs should be handled by loader, not by save_data_item_to_storage. 
Received: {data_item}" + ) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: From 9b802f651bc642f318ecec07af5fb3e1f46a5146 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 12:34:50 +0100 Subject: [PATCH 12/44] fix: web_url_loader load_data should yield stored_path --- cognee/infrastructure/loaders/external/web_url_loader.py | 6 ++++-- cognee/tasks/ingestion/ingest_data.py | 4 ++-- cognee/tests/integration/web_url_crawler/test_add.py | 3 +++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index f9fce47a9..491428c82 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -63,6 +63,8 @@ class WebUrlLoader(LoaderInterface): file_stream: If file stream is provided it will be used to process file instead **kwargs: Additional loader-specific configuration + Returns: + file path to the stored file Raises: Exception: If file cannot be processed """ @@ -107,9 +109,9 @@ class WebUrlLoader(LoaderInterface): content = "" for key, value in data.items(): content += f"{key}:\n{value}\n\n" - await save_data_to_file(content) + stored_path = await save_data_to_file(content) - return content + return stored_path except IngestionError: raise except Exception as e: diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index b742e474e..233bb5f1c 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -116,9 +116,9 @@ async def ingest_data( data_id = ingestion.identify(classified_data, user) original_file_metadata = classified_data.get_metadata() # Override file_path to be the actual data_item (e.g., URL) ? 
- # original_file_metadata["file_path"] = actual_file_path + original_file_metadata["file_path"] = actual_file_path # Storage metadata is the same as original - # storage_file_metadata = original_file_metadata.copy() + storage_file_metadata = original_file_metadata.copy() from sqlalchemy import select diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index e0dda94a9..2a75b5054 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -4,6 +4,9 @@ import cognee @pytest.mark.asyncio async def test_add_fails_when_preferred_loader_not_specified(): + from cognee.shared.logging_utils import setup_logging, ERROR + + setup_logging(log_level=ERROR) await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) with pytest.raises(ValueError): From b9877f9e876de87e1cea051b764669922290e87e Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 13:16:30 +0100 Subject: [PATCH 13/44] create web_url_loader_example.py --- examples/python/web_url_loader_example.py | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 examples/python/web_url_loader_example.py diff --git a/examples/python/web_url_loader_example.py b/examples/python/web_url_loader_example.py new file mode 100644 index 000000000..7845d4001 --- /dev/null +++ b/examples/python/web_url_loader_example.py @@ -0,0 +1,46 @@ +import asyncio + +import cognee +from cognee.shared.logging_utils import setup_logging, ERROR + + +async def main(): + await cognee.prune.prune_data() + print("Data pruned.") + + await cognee.prune.prune_system(metadata=True) + + extraction_rules = { + "title": {"selector": "title", "attr": "text"}, + "headings": {"selector": "h1, h2, h3", "attr": "text", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "attr": "text", "all": True}, + } + + 
loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + "extraction_rules": extraction_rules, + } + } + } + + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix + loaders_config=loaders_config, + ) + + await cognee.cognify() + print("Knowledge graph created.") + + await cognee.visualize_graph() + print("Data visualized") + + +if __name__ == "__main__": + logger = setup_logging(log_level=ERROR) + asyncio.run(main()) From b5190c90f1efc8256ae7405728e766c06b97e963 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 16:51:36 +0100 Subject: [PATCH 14/44] add logging for crawling status; add cap to the crawl_delay from robots.txt - Not advising to use the cap, but giving an option to be able to configure it --- .../loaders/external/web_url_loader.py | 13 ++++ cognee/tasks/web_scraper/bs4_crawler.py | 66 ++++++++++++++++++- cognee/tasks/web_scraper/config.py | 3 + cognee/tasks/web_scraper/utils.py | 41 ++++++++++-- 4 files changed, 116 insertions(+), 7 deletions(-) diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 491428c82..1ecf82171 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -4,6 +4,9 @@ from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError from cognee.modules.ingestion import save_data_to_file from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig +from cognee.shared.logging_utils import get_logger + +logger = get_logger() class WebUrlLoader(LoaderInterface): @@ -100,16 +103,26 @@ class WebUrlLoader(LoaderInterface): message="SoupCrawlerConfig must be set on the ingestion context when using 
the BeautifulSoup scraper." ) + logger.info(f"Starting web URL crawling for: {file_path}") + logger.info(f"Using scraping tool: {preferred_tool}") + data = await fetch_page_content( file_path, preferred_tool=preferred_tool, tavily_config=_tavily_config, soup_crawler_config=_soup_config, ) + + logger.info(f"Successfully fetched content from {len(data)} URL(s)") + logger.info("Processing and concatenating fetched content") + content = "" for key, value in data.items(): content += f"{key}:\n{value}\n\n" + + logger.info(f"Saving content to file (total size: {len(content)} characters)") stored_path = await save_data_to_file(content) + logger.info(f"Successfully saved content to: {stored_path}") return stored_path except IngestionError: diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py index 0fbff4808..400287e08 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/tasks/web_scraper/bs4_crawler.py @@ -75,6 +75,7 @@ class BeautifulSoupCrawler: Attributes: concurrency: Number of concurrent requests allowed. crawl_delay: Minimum seconds between requests to the same domain. + max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). timeout: Per-request timeout in seconds. max_retries: Number of retries for failed requests. retry_delay_factor: Multiplier for exponential backoff on retries. @@ -87,6 +88,7 @@ class BeautifulSoupCrawler: *, concurrency: int = 5, crawl_delay: float = 0.5, + max_crawl_delay: Optional[float] = 10.0, timeout: float = 15.0, max_retries: int = 2, retry_delay_factor: float = 0.5, @@ -98,6 +100,7 @@ class BeautifulSoupCrawler: Args: concurrency: Number of concurrent requests allowed. crawl_delay: Minimum seconds between requests to the same domain. + max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). timeout: Per-request timeout in seconds. max_retries: Number of retries for failed requests. 
retry_delay_factor: Multiplier for exponential backoff on retries. @@ -107,6 +110,7 @@ class BeautifulSoupCrawler: self.concurrency = concurrency self._sem = asyncio.Semaphore(concurrency) self.crawl_delay = crawl_delay + self.max_crawl_delay = max_crawl_delay self.timeout = timeout self.max_retries = max_retries self.retry_delay_factor = retry_delay_factor @@ -183,7 +187,11 @@ class BeautifulSoupCrawler: elapsed = time.time() - last wait_for = delay - elapsed if wait_for > 0: + logger.info( + f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)" + ) await asyncio.sleep(wait_for) + logger.info(f"Rate limit wait completed for {url}") self._last_request_time_per_domain[domain] = time.time() async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]: @@ -236,7 +244,16 @@ class BeautifulSoupCrawler: crawl_delay = self.crawl_delay if protego: delay = protego.crawl_delay(agent) or protego.crawl_delay("*") - crawl_delay = delay if delay else self.crawl_delay + if delay: + # Apply max_crawl_delay cap if configured + if self.max_crawl_delay is not None and delay > self.max_crawl_delay: + logger.warning( + f"robots.txt specifies crawl_delay={delay}s for {domain_root}, " + f"capping to max_crawl_delay={self.max_crawl_delay}s" + ) + crawl_delay = self.max_crawl_delay + else: + crawl_delay = delay cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay) self._robots_cache[domain_root] = cache_entry @@ -307,12 +324,16 @@ class BeautifulSoupCrawler: attempt = 0 crawl_delay = await self._get_crawl_delay(url) + logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}") while True: try: await self._respect_rate_limit(url, crawl_delay) resp = await self._client.get(url) resp.raise_for_status() + logger.info( + f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)" + ) return resp.text except Exception as exc: attempt += 1 @@ -347,22 +368,35 @@ class 
BeautifulSoupCrawler: raise RuntimeError( "Playwright is not installed. Install with `pip install playwright` and run `playwright install`." ) + + timeout_val = timeout or self.timeout + logger.info( + f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}" + ) + attempt = 0 while True: try: async with async_playwright() as p: + logger.info(f"Launching headless Chromium browser for {url}") browser = await p.chromium.launch(headless=True) try: context = await browser.new_context() page = await context.new_page() + logger.info(f"Navigating to {url} and waiting for network idle") await page.goto( url, wait_until="networkidle", - timeout=int((timeout or self.timeout) * 1000), + timeout=int(timeout_val * 1000), ) if js_wait: + logger.info(f"Waiting {js_wait}s for JavaScript to execute") await asyncio.sleep(js_wait) - return await page.content() + content = await page.content() + logger.info( + f"Successfully rendered {url} with Playwright (size={len(content)} bytes)" + ) + return content finally: await browser.close() except Exception as exc: @@ -498,6 +532,10 @@ class BeautifulSoupCrawler: else: raise ValueError(f"Invalid urls type: {type(urls)}") + logger.info( + f"Preparing to fetch {len(url_rules_map)} URL(s) with {len(extraction_rules) if extraction_rules else 0} extraction rule(s)" + ) + normalized_url_rules: Dict[str, List[ExtractionRule]] = {} for url, rules in url_rules_map.items(): normalized_rules = [] @@ -508,21 +546,36 @@ class BeautifulSoupCrawler: normalized_rules.append(r) normalized_url_rules[url] = normalized_rules + logger.info(f"Normalized extraction rules for {len(normalized_url_rules)} URL(s)") + async def _task(url: str): async with self._sem: try: + logger.info(f"Processing URL: {url}") + + # Check robots.txt allowed = await self._is_url_allowed(url) if not allowed: logger.warning(f"URL disallowed by robots.txt: {url}") return url, "" + logger.info(f"Robots.txt check passed for {url}") + + # Fetch HTML if 
use_playwright: + logger.info( + f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)" + ) html = await self._render_with_playwright( url, js_wait=playwright_js_wait, timeout=self.timeout ) else: + logger.info(f"Fetching {url} with httpx") html = await self._fetch_httpx(url) + logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") + + # Extract content pieces = [] for rule in normalized_url_rules[url]: text = self._extract_with_bs4(html, rule) @@ -530,17 +583,24 @@ class BeautifulSoupCrawler: pieces.append(text) concatenated = " ".join(pieces).strip() + logger.info(f"Extracted {len(concatenated)} characters from {url}") return url, concatenated except Exception as e: logger.error(f"Error processing {url}: {e}") return url, "" + logger.info(f"Creating {len(url_rules_map)} async tasks for concurrent fetching") tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()] results = {} + completed = 0 + total = len(tasks) for coro in asyncio.as_completed(tasks): url, text = await coro results[url] = text + completed += 1 + logger.info(f"Progress: {completed}/{total} URLs processed") + logger.info(f"Completed fetching all {len(results)} URL(s)") return results diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index 2ee43ed32..ac470daa9 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -13,6 +13,9 @@ class TavilyConfig(BaseModel): class SoupCrawlerConfig(BaseModel): concurrency: int = 5 crawl_delay: float = 0.5 + max_crawl_delay: Optional[float] = ( + 10.0 # Maximum crawl delay to respect from robots.txt (None = no limit) + ) timeout: float = 15.0 max_retries: int = 2 retry_delay_factor: float = 0.5 diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 6d094f423..a32b6848c 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -45,9 +45,13 @@ async def fetch_page_content( ImportError: If 
required dependencies (beautifulsoup4 or tavily-python) are not installed. """ + url_list = [urls] if isinstance(urls, str) else urls + logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}") + if preferred_tool == "tavily": if not tavily_config or tavily_config.api_key is None: raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily") + logger.info("Using Tavily API for content extraction") return await fetch_with_tavily(urls, tavily_config) if preferred_tool == "beautifulsoup": @@ -60,10 +64,17 @@ async def fetch_page_content( raise ImportError if not soup_crawler_config or soup_crawler_config.extraction_rules is None: raise ValueError("extraction_rules must be provided when not using Tavily") + + logger.info("Using BeautifulSoup for content extraction") extraction_rules = soup_crawler_config.extraction_rules + logger.info( + f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s" + ) + crawler = BeautifulSoupCrawler( concurrency=soup_crawler_config.concurrency, crawl_delay=soup_crawler_config.crawl_delay, + max_crawl_delay=soup_crawler_config.max_crawl_delay, timeout=soup_crawler_config.timeout, max_retries=soup_crawler_config.max_retries, retry_delay_factor=soup_crawler_config.retry_delay_factor, @@ -71,6 +82,9 @@ async def fetch_page_content( robots_cache_ttl=soup_crawler_config.robots_cache_ttl, ) try: + logger.info( + f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})" + ) results = await crawler.fetch_with_bs4( urls, extraction_rules, @@ -78,11 +92,13 @@ async def fetch_page_content( playwright_js_wait=soup_crawler_config.playwright_js_wait, join_all_matches=soup_crawler_config.join_all_matches, ) + logger.info(f"Successfully fetched content from {len(results)} URL(s)") return results except Exception as e: 
logger.error(f"Error fetching page content: {str(e)}") raise finally: + logger.info("Closing BeautifulSoup crawler") await crawler.close() @@ -108,19 +124,36 @@ async def fetch_with_tavily( "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0" ) raise + + url_list = [urls] if isinstance(urls, str) else urls + extract_depth = tavily_config.extract_depth if tavily_config else "basic" + timeout = tavily_config.timeout if tavily_config else 10 + + logger.info( + f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s" + ) client = AsyncTavilyClient( api_key=tavily_config.api_key if tavily_config else None, proxies=tavily_config.proxies if tavily_config else None, ) + + logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)") results = await client.extract( urls, format="text", - extract_depth=tavily_config.extract_depth if tavily_config else "basic", - timeout=tavily_config.timeout if tavily_config else 10, + extract_depth=extract_depth, + timeout=timeout, ) - for failed_result in results.get("failed_results", []): - logger.warning(f"Failed to fetch {failed_result}") + + failed_count = len(results.get("failed_results", [])) + if failed_count > 0: + logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)") + for failed_result in results.get("failed_results", []): + logger.warning(f"Failed to fetch {failed_result}") + return_results = {} for result in results.get("results", []): return_results[result["url"]] = result["raw_content"] + + logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily") return return_results From a69a7e5fc46b5c42c29abd6cbf0f21c911dacde5 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 16:52:37 +0100 Subject: [PATCH 15/44] tests: remove redundant bs4 configs from tests --- .../tests/integration/web_url_crawler/test_add.py | 6 +++--- examples/python/web_url_loader_example.py | 14 ++++++++------ 2 files 
changed, 11 insertions(+), 9 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index 2a75b5054..926c25a94 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -22,10 +22,10 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.prune.prune_system(metadata=True) extraction_rules = { - "title": {"selector": "title", "attr": "text"}, - "headings": {"selector": "h1, h2, h3", "attr": "text", "all": True}, + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, "links": {"selector": "a", "attr": "href", "all": True}, - "paragraphs": {"selector": "p", "attr": "text", "all": True}, + "paragraphs": {"selector": "p", "all": True}, } loaders_config = { diff --git a/examples/python/web_url_loader_example.py b/examples/python/web_url_loader_example.py index 7845d4001..37dd2258c 100644 --- a/examples/python/web_url_loader_example.py +++ b/examples/python/web_url_loader_example.py @@ -1,7 +1,6 @@ import asyncio import cognee -from cognee.shared.logging_utils import setup_logging, ERROR async def main(): @@ -11,10 +10,14 @@ async def main(): await cognee.prune.prune_system(metadata=True) extraction_rules = { - "title": {"selector": "title", "attr": "text"}, - "headings": {"selector": "h1, h2, h3", "attr": "text", "all": True}, - "links": {"selector": "a", "attr": "href", "all": True}, - "paragraphs": {"selector": "p", "attr": "text", "all": True}, + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": { + "selector": "a", + "attr": "href", + "all": True, + }, + "paragraphs": {"selector": "p", "all": True}, } loaders_config = { @@ -42,5 +45,4 @@ async def main(): if __name__ == "__main__": - logger = setup_logging(log_level=ERROR) asyncio.run(main()) From a0f760a3d101bae4446a62e5fec2cfeba4c73b50 Mon Sep 17 
00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 16:54:27 +0100 Subject: [PATCH 16/44] refactor: remove redundant `filestream` arg from `LoaderEngine.load_file(...)` --- .../tasks/ingestion/data_item_to_text_file.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index f82d9a0dc..211b918ae 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -50,17 +50,17 @@ async def data_item_to_text_file( await pull_from_s3(data_item_path, temp_file) temp_file.flush() # Data needs to be saved to local storage loader = get_loader_engine() - return await loader.load_file( - temp_file.name, None, preferred_loaders - ), loader.get_loader(temp_file.name, preferred_loaders) + return await loader.load_file(temp_file.name, preferred_loaders), loader.get_loader( + temp_file.name, preferred_loaders + ) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file( - data_item_path, None, preferred_loaders - ), loader.get_loader(data_item_path, preferred_loaders) + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) else: raise IngestionError(message="Local files are not accepted.") @@ -71,9 +71,9 @@ async def data_item_to_text_file( # Handle both Unix absolute paths (/path) and Windows absolute paths (C:\path) if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file( - data_item_path, None, preferred_loaders - ), loader.get_loader(data_item_path, preferred_loaders) + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) else: raise IngestionError(message="Local files are not accepted.") @@ -82,7 +82,6 
@@ async def data_item_to_text_file( return ( await loader.load_file( data_item_path, - None, preferred_loaders, loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal ), From 1a0978fb3764fb47619a0cf5a6881b7c1c70ae7e Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 22:38:36 +0100 Subject: [PATCH 17/44] incremental loading - fallback to regular, update test cases --- .../operations/run_tasks_data_item.py | 74 +++++++++++-------- .../integration/web_url_crawler/test_add.py | 66 ++++++++++++++++- 2 files changed, 109 insertions(+), 31 deletions(-) diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 152e72d7f..0118e7976 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -9,6 +9,7 @@ import os from typing import Any, Dict, AsyncGenerator, Optional from sqlalchemy import select +from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.files.utils.open_data_file import open_data_file @@ -63,36 +64,51 @@ async def run_tasks_data_item_incremental( # If incremental_loading of data is set to True don't process documents already processed by pipeline # If data is being added to Cognee for the first time calculate the id of the data - if not isinstance(data_item, Data): - file_path = await save_data_item_to_storage(data_item) - # Ingest data and add metadata - async with open_data_file(file_path) as file: - classified_data = ingestion.classify(file) - # data_id is the hash of file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) - else: - # If data was already processed by Cognee get data id - 
data_id = data_item.id + try: + if not isinstance(data_item, Data): + file_path = await save_data_item_to_storage(data_item) + # Ingest data and add metadata + async with open_data_file(file_path) as file: + classified_data = ingestion.classify(file) + # data_id is the hash of file contents + owner id to avoid duplicate data + data_id = ingestion.identify(classified_data, user) + else: + # If data was already processed by Cognee get data id + data_id = data_item.id - # Check pipeline status, if Data already processed for pipeline before skip current processing - async with db_engine.get_async_session() as session: - data_point = ( - await session.execute(select(Data).filter(Data.id == data_id)) - ).scalar_one_or_none() - if data_point: - if ( - data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id)) - == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED - ): - yield { - "run_info": PipelineRunAlreadyCompleted( - pipeline_run_id=pipeline_run_id, - dataset_id=dataset.id, - dataset_name=dataset.name, - ), - "data_id": data_id, - } - return + # Check pipeline status, if Data already processed for pipeline before skip current processing + async with db_engine.get_async_session() as session: + data_point = ( + await session.execute(select(Data).filter(Data.id == data_id)) + ).scalar_one_or_none() + if data_point: + if ( + data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id)) + == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED + ): + yield { + "run_info": PipelineRunAlreadyCompleted( + pipeline_run_id=pipeline_run_id, + dataset_id=dataset.id, + dataset_name=dataset.name, + ), + "data_id": data_id, + } + return + except UnsupportedPathSchemeError as e: + logger.warning(f"data_item does not support incremental loading: {str(e)}") + # Fall back to regular processing since incremental loading is not supported + async for result in run_tasks_data_item_regular( + data_item=data_item, + dataset=dataset, + tasks=tasks, + pipeline_id=pipeline_id, + 
pipeline_run_id=pipeline_run_id, + context=context, + user=user, + ): + yield result + return try: # Process data based on data_item and list of tasks diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index 926c25a94..abd0d77ba 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -12,7 +12,6 @@ async def test_add_fails_when_preferred_loader_not_specified(): with pytest.raises(ValueError): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix ) @@ -42,7 +41,70 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", preferred_loaders=["web_url_loader"], - incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. 
Will fix + loaders_config=loaders_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") + + +@pytest.mark.asyncio +async def test_add_with_incremental_loading_works(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + "extraction_rules": extraction_rules, + } + } + } + try: + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + incremental_loading=True, + loaders_config=loaders_config, + ) + except Exception as e: + pytest.fail(f"Failed to add url: {e}") + + +@pytest.mark.asyncio +async def test_add_without_incremental_loading_works(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + "extraction_rules": extraction_rules, + } + } + } + try: + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + incremental_loading=False, loaders_config=loaders_config, ) except Exception as e: From 8fe789ee9627cc8dbf6c707c3dde9c15e0fac893 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 19:24:40 +0100 Subject: [PATCH 18/44] nit: remove unnecessary import --- cognee/infrastructure/loaders/external/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index b92d9e7f0..2790a7ea0 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -10,7 +10,6 @@ These loaders are optional and only available if their dependencies are installe """ from .pypdf_loader import PyPdfLoader -from .web_url_loader import WebUrlLoader __all__ = ["PyPdfLoader"] From 17b33ab443f7d2beb407438cdd2cfe2a959d531d Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 20:54:39 +0100 Subject: [PATCH 19/44] feat: web_url_fetcher --- .../tasks/ingestion/data_fetchers/__init__.py | 8 +++ .../data_fetchers/data_fetcher_interface.py | 15 ++++ .../data_fetchers/web_url_fetcher.py | 70 +++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 cognee/tasks/ingestion/data_fetchers/__init__.py create mode 100644 cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py create mode 100644 cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py diff --git a/cognee/tasks/ingestion/data_fetchers/__init__.py b/cognee/tasks/ingestion/data_fetchers/__init__.py new file mode 100644 index 000000000..63530b427 --- /dev/null +++ b/cognee/tasks/ingestion/data_fetchers/__init__.py @@ -0,0 +1,8 @@ +__all__ = [] + +try: + from .web_url_fetcher import WebUrlFetcher + + __all__.append("WebUrlFetcher") +except ImportError: + pass diff --git a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py new file mode 100644 index 000000000..db8b8963b --- /dev/null +++ b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py @@ -0,0 +1,15 @@ +from abc import ABC, abstractmethod +from typing import Any + + +class DataFetcherInterface(ABC): + @abstractmethod + def fetcher_name(self) -> str: + pass + + @abstractmethod + async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]) -> 
str: + """ + args: data_item_path - path to the data item + """ + pass diff --git a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py new file mode 100644 index 000000000..f1e5dac91 --- /dev/null +++ b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py @@ -0,0 +1,70 @@ +from cognee.modules.ingestion import save_data_to_file +from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface +from typing import Any +from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig +from cognee.modules.ingestion.exceptions.exceptions import IngestionError +from cognee.shared.logging_utils import get_logger + +logger = get_logger() + + +class WebUrlFetcher(DataFetcherInterface): + def __init__(self): ... + + def fetcher_name(self): + return "web_url_fetcher" + + async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]): + from cognee.context_global_variables import tavily_config, soup_crawler_config + from cognee.tasks.web_scraper import fetch_page_content + + web_url_fetcher_config = fetchers_config.get(self.fetcher_name()) + if not isinstance(web_url_fetcher_config, dict): + raise IngestionError(f"{self.fetcher_name()} configuration must be a valid dictionary") + + tavily_dict = web_url_fetcher_config.get("tavily_config") + _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None + + soup_dict = web_url_fetcher_config.get("soup_config") + _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None + + # Set global configs for downstream access + tavily_config.set(_tavily_config) + soup_crawler_config.set(_soup_config) + + preferred_tool = "beautifulsoup" if _soup_config else "tavily" + if preferred_tool == "tavily" and _tavily_config is None: + raise IngestionError( + message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." 
+ ) + if preferred_tool == "beautifulsoup" and _soup_config is None: + raise IngestionError( + message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." + ) + + logger.info(f"Starting web URL crawling for: {data_item_path}") + logger.info(f"Using scraping tool: {preferred_tool}") + + data = await fetch_page_content( + data_item_path, + preferred_tool=preferred_tool, + soup_crawler_config=_soup_config, + tavily_config=_tavily_config, + ) + + logger.info(f"Successfully fetched content from URL {data_item_path}") + + # fetch_page_content returns a dict like {url: content} + # Extract the content string before saving + if isinstance(data, dict): + # Concatenate all URL contents (usually just one URL) + content = "" + for url, text in data.items(): + content += f"{url}:\n{text}\n\n" + logger.info( + f"Extracted content from {len(data)} URL(s), total size: {len(content)} characters" + ) + else: + content = data + + return await save_data_to_file(content) From d7417d9b06af7f912ff2c5f73a971a95466c2f9a Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 21:47:11 +0100 Subject: [PATCH 20/44] refactor: move url data fetching logic into `save_data_item_to_storage` --- cognee/api/v1/add/add.py | 3 + .../loaders/external/web_url_loader.py | 64 +--------------- .../modules/pipelines/operations/pipeline.py | 14 +++- .../modules/pipelines/operations/run_tasks.py | 2 + .../operations/run_tasks_data_item.py | 76 ++++++++----------- cognee/tasks/ingestion/ingest_data.py | 42 ++++------ .../ingestion/save_data_item_to_storage.py | 11 ++- .../integration/web_url_crawler/test_add.py | 34 +++++---- 8 files changed, 91 insertions(+), 155 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 3c4d7b696..1c76f7a52 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -31,6 +31,7 @@ async def add( incremental_loading: bool = True, data_per_batch: Optional[int] = 20, loaders_config: 
dict[LoaderInterface, dict] = {}, + fetchers_config: dict[str, Any] = {}, ): """ Add data to Cognee for knowledge graph processing. @@ -179,6 +180,7 @@ async def add( dataset_id, preferred_loaders, loaders_config, + fetchers_config, ), ] @@ -204,6 +206,7 @@ async def add( graph_db_config=graph_db_config, incremental_loading=incremental_loading, data_per_batch=data_per_batch, + fetchers_config=fetchers_config, ): pipeline_run_info = run_info diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 1ecf82171..996f7dae6 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -2,8 +2,6 @@ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError -from cognee.modules.ingestion import save_data_to_file -from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig from cognee.shared.logging_utils import get_logger logger = get_logger() @@ -62,7 +60,7 @@ class WebUrlLoader(LoaderInterface): Load and process the file, returning standardized result. 
Args: - file_path: Path to the file to be processed + file_path: Path to the file to be processed (already saved by fetcher) file_stream: If file stream is provided it will be used to process file instead **kwargs: Additional loader-specific configuration @@ -71,63 +69,5 @@ class WebUrlLoader(LoaderInterface): Raises: Exception: If file cannot be processed """ - loaders_config = kwargs.get("loaders_config") - if not isinstance(loaders_config, dict): - raise IngestionError("loaders_config must be a valid dictionary") - web_url_loader_config = loaders_config.get(self.loader_name) - if not isinstance(web_url_loader_config, dict): - raise IngestionError(f"{self.loader_name} configuration must be a valid dictionary") - - try: - from cognee.context_global_variables import tavily_config, soup_crawler_config - from cognee.tasks.web_scraper import fetch_page_content - - tavily_dict = web_url_loader_config.get("tavily_config") - _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None - - soup_dict = web_url_loader_config.get("soup_config") - _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None - - # Set global configs for downstream access - tavily_config.set(_tavily_config) - soup_crawler_config.set(_soup_config) - - preferred_tool = "beautifulsoup" if _soup_config else "tavily" - if preferred_tool == "tavily" and _tavily_config is None: - raise IngestionError( - message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." - ) - if preferred_tool == "beautifulsoup" and _soup_config is None: - raise IngestionError( - message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." 
- ) - - logger.info(f"Starting web URL crawling for: {file_path}") - logger.info(f"Using scraping tool: {preferred_tool}") - - data = await fetch_page_content( - file_path, - preferred_tool=preferred_tool, - tavily_config=_tavily_config, - soup_crawler_config=_soup_config, - ) - - logger.info(f"Successfully fetched content from {len(data)} URL(s)") - logger.info("Processing and concatenating fetched content") - - content = "" - for key, value in data.items(): - content += f"{key}:\n{value}\n\n" - - logger.info(f"Saving content to file (total size: {len(content)} characters)") - stored_path = await save_data_to_file(content) - logger.info(f"Successfully saved content to: {stored_path}") - - return stored_path - except IngestionError: - raise - except Exception as e: - raise IngestionError( - message=f"Error ingesting webpage from URL {file_path}: {str(e)}" - ) from e + return file_path diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py index e15e9e505..1e2b3aca5 100644 --- a/cognee/modules/pipelines/operations/pipeline.py +++ b/cognee/modules/pipelines/operations/pipeline.py @@ -20,6 +20,7 @@ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import ( from cognee.modules.pipelines.layers.check_pipeline_run_qualification import ( check_pipeline_run_qualification, ) +from typing import Any logger = get_logger("cognee.pipeline") @@ -36,6 +37,7 @@ async def run_pipeline( graph_db_config: dict = None, incremental_loading: bool = False, data_per_batch: int = 20, + fetchers_config: dict[str, Any] = {}, ): validate_pipeline_tasks(tasks) await setup_and_check_environment(vector_db_config, graph_db_config) @@ -52,6 +54,7 @@ async def run_pipeline( context={"dataset": dataset}, incremental_loading=incremental_loading, data_per_batch=data_per_batch, + fetchers_config=fetchers_config, ): yield run_info @@ -65,6 +68,7 @@ async def run_pipeline_per_dataset( context: dict = None, 
incremental_loading=False, data_per_batch: int = 20, + fetchers_config: dict[str, Any] = {}, ): # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True await set_database_global_context_variables(dataset.id, dataset.owner_id) @@ -80,7 +84,15 @@ async def run_pipeline_per_dataset( return pipeline_run = run_tasks( - tasks, dataset.id, data, user, pipeline_name, context, incremental_loading, data_per_batch + tasks, + dataset.id, + data, + user, + pipeline_name, + context, + incremental_loading, + data_per_batch, + fetchers_config, ) async for pipeline_run_info in pipeline_run: diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index ecc2f647b..d11d87ddf 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -60,6 +60,7 @@ async def run_tasks( context: dict = None, incremental_loading: bool = False, data_per_batch: int = 20, + fetchers_config: dict[str, Any] = {}, ): if not user: user = await get_default_user() @@ -106,6 +107,7 @@ async def run_tasks( context, user, incremental_loading, + fetchers_config, ) ) for data_item in data_batch diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 0118e7976..9ddadd855 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -39,6 +39,7 @@ async def run_tasks_data_item_incremental( pipeline_run_id: str, context: Optional[Dict[str, Any]], user: User, + fetchers_config: dict[str, Any], ) -> AsyncGenerator[Dict[str, Any], None]: """ Process a single data item with incremental loading support. 
@@ -64,51 +65,36 @@ async def run_tasks_data_item_incremental( # If incremental_loading of data is set to True don't process documents already processed by pipeline # If data is being added to Cognee for the first time calculate the id of the data - try: - if not isinstance(data_item, Data): - file_path = await save_data_item_to_storage(data_item) - # Ingest data and add metadata - async with open_data_file(file_path) as file: - classified_data = ingestion.classify(file) - # data_id is the hash of file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) - else: - # If data was already processed by Cognee get data id - data_id = data_item.id + if not isinstance(data_item, Data): + file_path = await save_data_item_to_storage(data_item, fetchers_config) + # Ingest data and add metadata + async with open_data_file(file_path) as file: + classified_data = ingestion.classify(file) + # data_id is the hash of file contents + owner id to avoid duplicate data + data_id = ingestion.identify(classified_data, user) + else: + # If data was already processed by Cognee get data id + data_id = data_item.id - # Check pipeline status, if Data already processed for pipeline before skip current processing - async with db_engine.get_async_session() as session: - data_point = ( - await session.execute(select(Data).filter(Data.id == data_id)) - ).scalar_one_or_none() - if data_point: - if ( - data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id)) - == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED - ): - yield { - "run_info": PipelineRunAlreadyCompleted( - pipeline_run_id=pipeline_run_id, - dataset_id=dataset.id, - dataset_name=dataset.name, - ), - "data_id": data_id, - } - return - except UnsupportedPathSchemeError as e: - logger.warning(f"data_item does not support incremental loading: {str(e)}") - # Fall back to regular processing since incremental loading is not supported - async for result in run_tasks_data_item_regular( - 
data_item=data_item, - dataset=dataset, - tasks=tasks, - pipeline_id=pipeline_id, - pipeline_run_id=pipeline_run_id, - context=context, - user=user, - ): - yield result - return + # Check pipeline status, if Data already processed for pipeline before skip current processing + async with db_engine.get_async_session() as session: + data_point = ( + await session.execute(select(Data).filter(Data.id == data_id)) + ).scalar_one_or_none() + if data_point: + if ( + data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id)) + == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED + ): + yield { + "run_info": PipelineRunAlreadyCompleted( + pipeline_run_id=pipeline_run_id, + dataset_id=dataset.id, + dataset_name=dataset.name, + ), + "data_id": data_id, + } + return try: # Process data based on data_item and list of tasks @@ -225,6 +211,7 @@ async def run_tasks_data_item( context: Optional[Dict[str, Any]], user: User, incremental_loading: bool, + fetchers_config: dict[str, Any] = {}, ) -> Optional[Dict[str, Any]]: """ Process a single data item, choosing between incremental and regular processing. 
@@ -259,6 +246,7 @@ async def run_tasks_data_item( pipeline_run_id=pipeline_run_id, context=context, user=user, + fetchers_config=fetchers_config, ): pass else: diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 233bb5f1c..84cd1f38b 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -31,6 +31,7 @@ async def ingest_data( dataset_id: UUID = None, preferred_loaders: List[str] = None, loaders_config: dict[LoaderInterface, dict] = {}, + fetchers_config: dict[str, Any] = {}, ): if not user: user = await get_default_user() @@ -80,16 +81,10 @@ async def ingest_data( dataset_data_map = {str(data.id): True for data in dataset_data} for data_item in data: - try: - # Get file path of data item or create a file if it doesn't exist - original_file_path = await save_data_item_to_storage(data_item) - # Transform file path to be OS usable - actual_file_path = get_data_file_path(original_file_path) - except UnsupportedPathSchemeError: - # This data_item (e.g., HTTP/HTTPS URL) should be passed directly to the loader - # skip save_data_item_to_storage and get_data_file_path - actual_file_path = data_item - original_file_path = None # we don't have an original file path + # Get file path of data item or create a file if it doesn't exist + original_file_path = await save_data_item_to_storage(data_item, fetchers_config) + # Transform file path to be OS usable + actual_file_path = get_data_file_path(original_file_path) # Store all input data as text files in Cognee data storage cognee_storage_file_path, loader_engine = await data_item_to_text_file( @@ -99,26 +94,15 @@ async def ingest_data( ) # Find metadata from original file - if original_file_path is not None: - # Standard flow: extract metadata from both original and stored files - async with open_data_file(original_file_path) as file: - classified_data = ingestion.classify(file) - data_id = ingestion.identify(classified_data, user) - 
original_file_metadata = classified_data.get_metadata() + # Standard flow: extract metadata from both original and stored files + async with open_data_file(original_file_path) as file: + classified_data = ingestion.classify(file) + data_id = ingestion.identify(classified_data, user) + original_file_metadata = classified_data.get_metadata() - async with open_data_file(cognee_storage_file_path) as file: - classified_data = ingestion.classify(file) - storage_file_metadata = classified_data.get_metadata() - else: - # Alternative flow (e.g., URLs): extract metadata once from stored file - async with open_data_file(cognee_storage_file_path) as file: - classified_data = ingestion.classify(file) - data_id = ingestion.identify(classified_data, user) - original_file_metadata = classified_data.get_metadata() - # Override file_path to be the actual data_item (e.g., URL) ? - original_file_metadata["file_path"] = actual_file_path - # Storage metadata is the same as original - storage_file_metadata = original_file_metadata.copy() + async with open_data_file(cognee_storage_file_path) as file: + classified_data = ingestion.classify(file) + storage_file_metadata = classified_data.get_metadata() from sqlalchemy import select diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index cf32477cb..d9b98268d 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -9,6 +9,8 @@ from cognee.modules.ingestion import save_data_to_file from cognee.shared.logging_utils import get_logger from pydantic_settings import BaseSettings, SettingsConfigDict +from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher + logger = get_logger() @@ -22,7 +24,9 @@ class SaveDataSettings(BaseSettings): settings = SaveDataSettings() -async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str: +async def save_data_item_to_storage( + 
data_item: Union[BinaryIO, str, Any], fetchers_config: dict[str, Any] = {} +) -> str: if "llama_index" in str(type(data_item)): # Dynamic import is used because the llama_index module is optional. from .transform_data import get_data_from_llama_index @@ -57,9 +61,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - raise UnsupportedPathSchemeError( - message=f"HTTP/HTTPS URLs should be handled by loader, not by save_data_item_to_storage. Received: {data_item}" - ) + fetcher = WebUrlFetcher() + return await fetcher.fetch(data_item, fetchers_config) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index abd0d77ba..b45ed9139 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -1,22 +1,28 @@ +from sys import exc_info import pytest import cognee +from cognee.modules.ingestion.exceptions.exceptions import IngestionError @pytest.mark.asyncio -async def test_add_fails_when_preferred_loader_not_specified(): +async def test_add_fails_when_web_url_fetcher_config_not_specified(): from cognee.shared.logging_utils import setup_logging, ERROR setup_logging(log_level=ERROR) await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - with pytest.raises(ValueError): + with pytest.raises(IngestionError) as excinfo: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", + incremental_loading=False, ) + assert excinfo.value.message.startswith( + "web_url_fetcher configuration must be a valid dictionary" + ) @pytest.mark.asyncio -async def test_add_succesfully_adds_url_when_preferred_loader_specified(): +async def 
test_add_succesfully_adds_url_when_fetcher_config_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -27,8 +33,8 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { - "web_url_loader": { + fetchers_config = { + "web_url_fetcher": { "soup_config": { "max_depth": 1, "follow_links": False, @@ -40,8 +46,8 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], - loaders_config=loaders_config, + incremental_loading=False, + fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") @@ -59,8 +65,8 @@ async def test_add_with_incremental_loading_works(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { - "web_url_loader": { + fetchers_config = { + "web_url_fetcher": { "soup_config": { "max_depth": 1, "follow_links": False, @@ -71,9 +77,8 @@ async def test_add_with_incremental_loading_works(): try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], incremental_loading=True, - loaders_config=loaders_config, + fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") @@ -91,8 +96,8 @@ async def test_add_without_incremental_loading_works(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { - "web_url_loader": { + fetchers_config = { + "web_url_fetcher": { "soup_config": { "max_depth": 1, "follow_links": False, @@ -103,9 +108,8 @@ async def test_add_without_incremental_loading_works(): try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], incremental_loading=False, - loaders_config=loaders_config, + fetchers_config=fetchers_config, ) except Exception as e: 
pytest.fail(f"Failed to add url: {e}") From fc660b46bb13fcaf901a830636a3aed73c4c9065 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 21:50:10 +0100 Subject: [PATCH 21/44] remove web_url_loader since there is no logic post fetching for loader --- .../loaders/external/__init__.py | 7 -- .../loaders/external/web_url_loader.py | 73 ------------------- .../tasks/ingestion/data_item_to_text_file.py | 11 --- .../web_url_crawler/test_loader_engine.py | 20 ----- ..._example.py => web_url_fetcher_example.py} | 7 +- 5 files changed, 3 insertions(+), 115 deletions(-) delete mode 100644 cognee/infrastructure/loaders/external/web_url_loader.py delete mode 100644 cognee/tests/integration/web_url_crawler/test_loader_engine.py rename examples/python/{web_url_loader_example.py => web_url_fetcher_example.py} (80%) diff --git a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index 2790a7ea0..6bf9f9200 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -27,10 +27,3 @@ try: __all__.append("AdvancedPdfLoader") except ImportError: pass - -try: - from .web_url_loader import WebUrlLoader - - __all__.append("WebUrlLoader") -except ImportError: - pass diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py deleted file mode 100644 index 996f7dae6..000000000 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ /dev/null @@ -1,73 +0,0 @@ -from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface -from typing import List - -from cognee.modules.ingestion.exceptions.exceptions import IngestionError -from cognee.shared.logging_utils import get_logger - -logger = get_logger() - - -class WebUrlLoader(LoaderInterface): - @property - def supported_extensions(self) -> List[str]: - """ - List of file extensions this loader supports. 
- - Returns: - List of extensions including the dot (e.g., ['.txt', '.md']) - """ - return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality - - @property - def supported_mime_types(self) -> List[str]: - """ - List of MIME types this loader supports. - - Returns: - List of MIME type strings (e.g., ['text/plain', 'application/pdf']) - """ - return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality - - @property - def loader_name(self) -> str: - """ - Unique name identifier for this loader. - - Returns: - String identifier used for registration and configuration - """ - return "web_url_loader" - - def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool: - """ - Check if this loader can handle the given file. - - Args: - extension: File extension - mime_type: MIME type of the file - - Returns: - True if this loader can process the file, False otherwise - """ - if data_item_path is None: - raise IngestionError( - "data_item_path should not be None" - ) # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py - return data_item_path.startswith(("http://", "https://")) - - async def load(self, file_path: str, **kwargs): - """ - Load and process the file, returning standardized result. 
- - Args: - file_path: Path to the file to be processed (already saved by fetcher) - file_stream: If file stream is provided it will be used to process file instead - **kwargs: Additional loader-specific configuration - - Returns: - file path to the stored file - Raises: - Exception: If file cannot be processed - """ - - return file_path diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 211b918ae..8d2e915b0 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -76,16 +76,5 @@ async def data_item_to_text_file( ) else: raise IngestionError(message="Local files are not accepted.") - - elif data_item_path.startswith(("http://", "https://")): - loader = get_loader_engine() - return ( - await loader.load_file( - data_item_path, - preferred_loaders, - loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal - ), - loader.get_loader(data_item_path, preferred_loaders), - ) # data is not a supported type raise IngestionError(message=f"Data type not supported: {type(data_item_path)}") diff --git a/cognee/tests/integration/web_url_crawler/test_loader_engine.py b/cognee/tests/integration/web_url_crawler/test_loader_engine.py deleted file mode 100644 index 018c034e1..000000000 --- a/cognee/tests/integration/web_url_crawler/test_loader_engine.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from cognee.infrastructure.loaders import get_loader_engine -from cognee.infrastructure.loaders.external.web_url_loader import WebUrlLoader - - -def test_get_loader_returns_none_by_default_for_web_urls(): - loader_engine = get_loader_engine() - urls = ["https://cognee.ai", "http://cognee.ai"] - for url in urls: - loader = loader_engine.get_loader(url) - assert loader is None - - -def test_get_loader_returns_valid_loader_when_preferred_loaders_specified(): - loader_engine = 
get_loader_engine() - urls = ["https://cognee.ai", "http://cognee.ai"] - for url in urls: - loader = loader_engine.get_loader(url, preferred_loaders=["web_url_loader"]) - assert isinstance(loader, WebUrlLoader) diff --git a/examples/python/web_url_loader_example.py b/examples/python/web_url_fetcher_example.py similarity index 80% rename from examples/python/web_url_loader_example.py rename to examples/python/web_url_fetcher_example.py index 37dd2258c..9ac099e16 100644 --- a/examples/python/web_url_loader_example.py +++ b/examples/python/web_url_fetcher_example.py @@ -20,7 +20,7 @@ async def main(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { + fetchers_config = { "web_url_loader": { "soup_config": { "max_depth": 1, @@ -32,9 +32,8 @@ async def main(): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], - incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. 
Will fix - loaders_config=loaders_config, + incremental_loading=False, + fetchers_config=fetchers_config, ) await cognee.cognify() From f7c2187ce7612c0ca4068bfe39a7895bf8823520 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 21:52:10 +0100 Subject: [PATCH 22/44] remove `loaders_config` as it's not in use --- cognee/api/v1/add/add.py | 2 -- cognee/tasks/ingestion/data_item_to_text_file.py | 1 - cognee/tasks/ingestion/ingest_data.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 1c76f7a52..44005d755 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -30,7 +30,6 @@ async def add( preferred_loaders: List[str] = None, incremental_loading: bool = True, data_per_batch: Optional[int] = 20, - loaders_config: dict[LoaderInterface, dict] = {}, fetchers_config: dict[str, Any] = {}, ): """ @@ -179,7 +178,6 @@ async def add( node_set, dataset_id, preferred_loaders, - loaders_config, fetchers_config, ), ] diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 8d2e915b0..4b9e4bb23 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -36,7 +36,6 @@ async def pull_from_s3(file_path, destination_file) -> None: async def data_item_to_text_file( data_item_path: str, preferred_loaders: List[str], - loaders_config: dict[LoaderInterface, dict], ) -> Tuple[str, LoaderInterface]: if isinstance(data_item_path, str): parsed_url = urlparse(data_item_path) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 84cd1f38b..648a34ace 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -30,7 +30,6 @@ async def ingest_data( node_set: Optional[List[str]] = None, dataset_id: UUID = None, preferred_loaders: List[str] = None, - loaders_config: dict[LoaderInterface, dict] = {}, 
fetchers_config: dict[str, Any] = {}, ): if not user: @@ -90,7 +89,6 @@ async def ingest_data( cognee_storage_file_path, loader_engine = await data_item_to_text_file( actual_file_path, preferred_loaders, - loaders_config, ) # Find metadata from original file From 1213a3a4cb529a7f394baa9f8ef83e9e2bd2d67f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 22:00:13 +0100 Subject: [PATCH 23/44] revert changes to `LoaderEngine` and `LoaderInterface` --- cognee/infrastructure/loaders/LoaderEngine.py | 23 ++++--------------- .../infrastructure/loaders/LoaderInterface.py | 4 +--- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index d6c4d4d8c..6b62f7641 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -64,7 +64,7 @@ class LoaderEngine: return True def get_loader( - self, data_item_path: str, preferred_loaders: List[str] = None + self, file_path: str, preferred_loaders: List[str] = None ) -> Optional[LoaderInterface]: """ Get appropriate loader for a file. 
@@ -76,37 +76,24 @@ class LoaderEngine: Returns: LoaderInterface that can handle the file, or None if not found """ - is_url = data_item_path.startswith(("http://", "https://")) - if is_url: - extension = None - mime_type = None - else: - file_info = filetype.guess(data_item_path) - extension = file_info.extension if file_info else None - mime_type = file_info.mime if file_info else None + file_info = filetype.guess(file_path) # Try preferred loaders first if preferred_loaders: for loader_name in preferred_loaders: if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle( - extension=extension, - mime_type=mime_type, - data_item_path=data_item_path, - ): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time + if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): return loader else: logger.info(f"Skipping {loader_name}: Preferred Loader not registered") # Try default priority order - for loader_name in ( - self.default_loader_priority - ): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review + for loader_name in self.default_loader_priority: if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle(extension=extension, mime_type=mime_type): + if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): return loader else: logger.info( diff --git a/cognee/infrastructure/loaders/LoaderInterface.py b/cognee/infrastructure/loaders/LoaderInterface.py index fb309304b..3a1c9bf3e 100644 --- a/cognee/infrastructure/loaders/LoaderInterface.py +++ b/cognee/infrastructure/loaders/LoaderInterface.py @@ -44,9 +44,7 @@ class LoaderInterface(ABC): pass @abstractmethod - def can_handle( - self, extension: str, mime_type: str, data_item_path: str = None - ) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily 
yet, see TODO in LoaderEngine.py + def can_handle(self, extension: str, mime_type: str) -> bool: """ Check if this loader can handle the given file. From fdf7c27fec2762c3fa84fb949f1c63266eaee5a6 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 22:02:29 +0100 Subject: [PATCH 24/44] refactor: remove WebUrlLoader imports --- cognee/infrastructure/loaders/supported_loaders.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py index 7f92aa36a..d103babe3 100644 --- a/cognee/infrastructure/loaders/supported_loaders.py +++ b/cognee/infrastructure/loaders/supported_loaders.py @@ -23,10 +23,3 @@ try: supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader except ImportError: pass - -try: - from cognee.infrastructure.loaders.external import WebUrlLoader - - supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader -except ImportError: - pass From 35d3c0877922624a5e9263d3efc5ebe4abcbe332 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 22:04:00 +0100 Subject: [PATCH 25/44] Clean up `add.py` imports --- cognee/api/v1/add/add.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 44005d755..67da3047b 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -1,9 +1,5 @@ from uuid import UUID -import os -from typing import Union, BinaryIO, List, Optional, Dict, Any -from pydantic import BaseModel -from urllib.parse import urlparse -from cognee.infrastructure.loaders import LoaderInterface +from typing import Union, BinaryIO, List, Optional, Any from cognee.modules.users.models import User from cognee.modules.pipelines import Task, run_pipeline from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import ( From 085e81c082e46f1a96c02d4443657baa3a5cb07f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 
20 Oct 2025 22:05:14 +0100 Subject: [PATCH 26/44] Clean up - remove `UnsupportedPathSchemeError` --- cognee/infrastructure/files/exceptions.py | 12 ------------ .../infrastructure/files/utils/get_data_file_path.py | 7 ------- .../pipelines/operations/run_tasks_data_item.py | 1 - cognee/tasks/ingestion/ingest_data.py | 2 -- cognee/tasks/ingestion/save_data_item_to_storage.py | 1 - 5 files changed, 23 deletions(-) diff --git a/cognee/infrastructure/files/exceptions.py b/cognee/infrastructure/files/exceptions.py index eb6efdbce..351eaee9c 100644 --- a/cognee/infrastructure/files/exceptions.py +++ b/cognee/infrastructure/files/exceptions.py @@ -11,15 +11,3 @@ class FileContentHashingError(Exception): status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, ): super().__init__(message, name, status_code) - - -class UnsupportedPathSchemeError(Exception): - """Raised when a non-filesystem path scheme (like http://, https://) is passed to a function expecting filesystem paths.""" - - def __init__( - self, - message: str = "This function only supports filesystem paths (file:// or local paths), not HTTP/HTTPS URLs.", - name: str = "UnsupportedPathSchemeError", - status_code=status.HTTP_400_BAD_REQUEST, - ): - super().__init__(message, name, status_code) diff --git a/cognee/infrastructure/files/utils/get_data_file_path.py b/cognee/infrastructure/files/utils/get_data_file_path.py index d67fc95a0..7ffda79bd 100644 --- a/cognee/infrastructure/files/utils/get_data_file_path.py +++ b/cognee/infrastructure/files/utils/get_data_file_path.py @@ -1,8 +1,6 @@ import os from urllib.parse import urlparse -from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError - def get_data_file_path(file_path: str): # Check if this is a file URI BEFORE normalizing (which corrupts URIs) @@ -40,11 +38,6 @@ def get_data_file_path(file_path: str): return normalized_url - elif file_path.startswith(("http://", "https://")): - raise UnsupportedPathSchemeError( - message=f"HTTP/HTTPS URLs are 
not supported by get_data_file_path(). Received: {file_path}" - ) - else: # Regular file path - normalize separators normalized_path = os.path.normpath(file_path) diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 9ddadd855..e445d323b 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -9,7 +9,6 @@ import os from typing import Any, Dict, AsyncGenerator, Optional from sqlalchemy import select -from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.files.utils.open_data_file import open_data_file diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 648a34ace..e707f4d92 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -3,8 +3,6 @@ import inspect from uuid import UUID from typing import Union, BinaryIO, Any, List, Optional -from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError -from cognee.infrastructure.loaders import LoaderInterface import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index d9b98268d..c70ddb2de 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -3,7 +3,6 @@ from pathlib import Path from urllib.parse import urlparse from typing import Union, BinaryIO, Any -from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError from cognee.modules.ingestion.exceptions import IngestionError 
from cognee.modules.ingestion import save_data_to_file from cognee.shared.logging_utils import get_logger From abbbf88ad342e70b0d1dab57100bf7821e0e25e5 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 23:31:49 +0100 Subject: [PATCH 27/44] CI: use scraping dependenies for integration tests --- .github/workflows/basic_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/basic_tests.yml b/.github/workflows/basic_tests.yml index 3f3e644a2..f89d031a6 100644 --- a/.github/workflows/basic_tests.yml +++ b/.github/workflows/basic_tests.yml @@ -123,6 +123,7 @@ jobs: uses: ./.github/actions/cognee_setup with: python-version: ${{ inputs.python-version }} + extra-dependencies: "scraping" - name: Run Integration Tests run: uv run pytest cognee/tests/integration/ From 95e735d3979aba36f72c7cc353c8641756fe2359 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 12:04:35 +0100 Subject: [PATCH 28/44] remove `fetchers_config`, use default configs for Tavily and BeautifulSoup --- cognee/api/v1/add/add.py | 3 -- .../modules/pipelines/operations/pipeline.py | 4 --- .../modules/pipelines/operations/run_tasks.py | 2 -- .../operations/run_tasks_data_item.py | 5 +-- .../data_fetchers/data_fetcher_interface.py | 2 +- .../data_fetchers/web_url_fetcher.py | 33 ++++++------------- cognee/tasks/ingestion/ingest_data.py | 3 +- .../ingestion/save_data_item_to_storage.py | 6 ++-- .../integration/web_url_crawler/test_add.py | 31 ----------------- examples/python/web_url_fetcher_example.py | 11 ------- 10 files changed, 15 insertions(+), 85 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 67da3047b..216911ec0 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -26,7 +26,6 @@ async def add( preferred_loaders: List[str] = None, incremental_loading: bool = True, data_per_batch: Optional[int] = 20, - fetchers_config: dict[str, Any] = {}, ): """ Add data to Cognee for knowledge graph 
processing. @@ -174,7 +173,6 @@ async def add( node_set, dataset_id, preferred_loaders, - fetchers_config, ), ] @@ -200,7 +198,6 @@ async def add( graph_db_config=graph_db_config, incremental_loading=incremental_loading, data_per_batch=data_per_batch, - fetchers_config=fetchers_config, ): pipeline_run_info = run_info diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py index 1e2b3aca5..eb0ebe8bd 100644 --- a/cognee/modules/pipelines/operations/pipeline.py +++ b/cognee/modules/pipelines/operations/pipeline.py @@ -37,7 +37,6 @@ async def run_pipeline( graph_db_config: dict = None, incremental_loading: bool = False, data_per_batch: int = 20, - fetchers_config: dict[str, Any] = {}, ): validate_pipeline_tasks(tasks) await setup_and_check_environment(vector_db_config, graph_db_config) @@ -54,7 +53,6 @@ async def run_pipeline( context={"dataset": dataset}, incremental_loading=incremental_loading, data_per_batch=data_per_batch, - fetchers_config=fetchers_config, ): yield run_info @@ -68,7 +66,6 @@ async def run_pipeline_per_dataset( context: dict = None, incremental_loading=False, data_per_batch: int = 20, - fetchers_config: dict[str, Any] = {}, ): # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True await set_database_global_context_variables(dataset.id, dataset.owner_id) @@ -92,7 +89,6 @@ async def run_pipeline_per_dataset( context, incremental_loading, data_per_batch, - fetchers_config, ) async for pipeline_run_info in pipeline_run: diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index d11d87ddf..ecc2f647b 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -60,7 +60,6 @@ async def run_tasks( context: dict = None, incremental_loading: bool = False, data_per_batch: int = 20, - fetchers_config: dict[str, Any] = {}, ): if not user: user = await get_default_user() @@ 
-107,7 +106,6 @@ async def run_tasks( context, user, incremental_loading, - fetchers_config, ) ) for data_item in data_batch diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index e445d323b..152e72d7f 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -38,7 +38,6 @@ async def run_tasks_data_item_incremental( pipeline_run_id: str, context: Optional[Dict[str, Any]], user: User, - fetchers_config: dict[str, Any], ) -> AsyncGenerator[Dict[str, Any], None]: """ Process a single data item with incremental loading support. @@ -65,7 +64,7 @@ async def run_tasks_data_item_incremental( # If incremental_loading of data is set to True don't process documents already processed by pipeline # If data is being added to Cognee for the first time calculate the id of the data if not isinstance(data_item, Data): - file_path = await save_data_item_to_storage(data_item, fetchers_config) + file_path = await save_data_item_to_storage(data_item) # Ingest data and add metadata async with open_data_file(file_path) as file: classified_data = ingestion.classify(file) @@ -210,7 +209,6 @@ async def run_tasks_data_item( context: Optional[Dict[str, Any]], user: User, incremental_loading: bool, - fetchers_config: dict[str, Any] = {}, ) -> Optional[Dict[str, Any]]: """ Process a single data item, choosing between incremental and regular processing. 
@@ -245,7 +243,6 @@ async def run_tasks_data_item( pipeline_run_id=pipeline_run_id, context=context, user=user, - fetchers_config=fetchers_config, ): pass else: diff --git a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py index db8b8963b..9171e429d 100644 --- a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py +++ b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py @@ -8,7 +8,7 @@ class DataFetcherInterface(ABC): pass @abstractmethod - async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]) -> str: + async def fetch(self, data_item_path: str) -> str: """ args: data_item_path - path to the data item """ diff --git a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py index f1e5dac91..3b90b51b1 100644 --- a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py +++ b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py @@ -1,8 +1,7 @@ +import os from cognee.modules.ingestion import save_data_to_file from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface -from typing import Any from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig -from cognee.modules.ingestion.exceptions.exceptions import IngestionError from cognee.shared.logging_utils import get_logger logger = get_logger() @@ -14,34 +13,22 @@ class WebUrlFetcher(DataFetcherInterface): def fetcher_name(self): return "web_url_fetcher" - async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]): + async def fetch(self, data_item_path: str): from cognee.context_global_variables import tavily_config, soup_crawler_config from cognee.tasks.web_scraper import fetch_page_content - web_url_fetcher_config = fetchers_config.get(self.fetcher_name()) - if not isinstance(web_url_fetcher_config, dict): - raise IngestionError(f"{self.fetcher_name()} configuration 
must be a valid dictionary") + if os.getenv("TAVILY_API_KEY"): + _tavily_config = TavilyConfig() + _soup_config = None + preferred_tool = "tavily" + else: + _tavily_config = None + _soup_config = SoupCrawlerConfig() + preferred_tool = "beautifulsoup" - tavily_dict = web_url_fetcher_config.get("tavily_config") - _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None - - soup_dict = web_url_fetcher_config.get("soup_config") - _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None - - # Set global configs for downstream access tavily_config.set(_tavily_config) soup_crawler_config.set(_soup_config) - preferred_tool = "beautifulsoup" if _soup_config else "tavily" - if preferred_tool == "tavily" and _tavily_config is None: - raise IngestionError( - message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." - ) - if preferred_tool == "beautifulsoup" and _soup_config is None: - raise IngestionError( - message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." 
- ) - logger.info(f"Starting web URL crawling for: {data_item_path}") logger.info(f"Using scraping tool: {preferred_tool}") diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index e707f4d92..02987b893 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -28,7 +28,6 @@ async def ingest_data( node_set: Optional[List[str]] = None, dataset_id: UUID = None, preferred_loaders: List[str] = None, - fetchers_config: dict[str, Any] = {}, ): if not user: user = await get_default_user() @@ -79,7 +78,7 @@ async def ingest_data( for data_item in data: # Get file path of data item or create a file if it doesn't exist - original_file_path = await save_data_item_to_storage(data_item, fetchers_config) + original_file_path = await save_data_item_to_storage(data_item) # Transform file path to be OS usable actual_file_path = get_data_file_path(original_file_path) diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index c70ddb2de..453219f15 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -23,9 +23,7 @@ class SaveDataSettings(BaseSettings): settings = SaveDataSettings() -async def save_data_item_to_storage( - data_item: Union[BinaryIO, str, Any], fetchers_config: dict[str, Any] = {} -) -> str: +async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str: if "llama_index" in str(type(data_item)): # Dynamic import is used because the llama_index module is optional. 
from .transform_data import get_data_from_llama_index @@ -61,7 +59,7 @@ async def save_data_item_to_storage( return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": fetcher = WebUrlFetcher() - return await fetcher.fetch(data_item, fetchers_config) + return await fetcher.fetch(data_item) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index b45ed9139..a00ca9e0d 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -33,21 +33,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified(): "paragraphs": {"selector": "p", "all": True}, } - fetchers_config = { - "web_url_fetcher": { - "soup_config": { - "max_depth": 1, - "follow_links": False, - "extraction_rules": extraction_rules, - } - } - } - try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=False, - fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") @@ -65,20 +54,10 @@ async def test_add_with_incremental_loading_works(): "paragraphs": {"selector": "p", "all": True}, } - fetchers_config = { - "web_url_fetcher": { - "soup_config": { - "max_depth": 1, - "follow_links": False, - "extraction_rules": extraction_rules, - } - } - } try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=True, - fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") @@ -96,20 +75,10 @@ async def test_add_without_incremental_loading_works(): "paragraphs": {"selector": "p", "all": True}, } - fetchers_config = { - "web_url_fetcher": { - "soup_config": { - "max_depth": 1, - "follow_links": False, - "extraction_rules": extraction_rules, - } - } - } try: await cognee.add( 
"https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=False, - fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") diff --git a/examples/python/web_url_fetcher_example.py b/examples/python/web_url_fetcher_example.py index 9ac099e16..2195a62c0 100644 --- a/examples/python/web_url_fetcher_example.py +++ b/examples/python/web_url_fetcher_example.py @@ -20,20 +20,9 @@ async def main(): "paragraphs": {"selector": "p", "all": True}, } - fetchers_config = { - "web_url_loader": { - "soup_config": { - "max_depth": 1, - "follow_links": False, - "extraction_rules": extraction_rules, - } - } - } - await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=False, - fetchers_config=fetchers_config, ) await cognee.cognify() From 5035c872a71ce77dc8bd564ebb16400ec5a5f3dd Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 15:20:09 +0100 Subject: [PATCH 29/44] refactor: update web scraper configurations and simplify fetch logic --- cognee/context_global_variables.py | 7 ++- .../data_fetchers/web_url_fetcher.py | 2 - cognee/tasks/web_scraper/bs4_crawler.py | 55 +++---------------- cognee/tasks/web_scraper/utils.py | 24 ++++---- 4 files changed, 25 insertions(+), 63 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 2ecf9b8d3..388316359 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -7,14 +7,17 @@ from cognee.base_config import get_base_config from cognee.infrastructure.databases.utils import get_or_create_dataset_database from cognee.infrastructure.files.storage.config import file_storage_config from cognee.modules.users.methods import get_user +from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig # Note: ContextVar allows us to use different graph db configurations in Cognee # for different async tasks, threads and processes vector_db_config = 
ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) -soup_crawler_config = ContextVar("soup_crawler_config", default=None) -tavily_config = ContextVar("tavily_config", default=None) +soup_crawler_config: ContextVar[SoupCrawlerConfig | None] = ContextVar( + "soup_crawler_config", default=None +) +tavily_config: ContextVar[TavilyConfig | None] = ContextVar("tavily_config", default=None) async def set_session_user_context_variable(user): diff --git a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py index 3b90b51b1..949cb9b0a 100644 --- a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py +++ b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py @@ -35,8 +35,6 @@ class WebUrlFetcher(DataFetcherInterface): data = await fetch_page_content( data_item_path, preferred_tool=preferred_tool, - soup_crawler_config=_soup_config, - tavily_config=_tavily_config, ) logger.info(f"Successfully fetched content from URL {data_item_path}") diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py index 400287e08..969058466 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/tasks/web_scraper/bs4_crawler.py @@ -66,6 +66,7 @@ class RobotsTxtCache: timestamp: float = field(default_factory=time.time) +# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler class BeautifulSoupCrawler: """Crawler for fetching and extracting web content using BeautifulSoup. 
@@ -491,14 +492,12 @@ class BeautifulSoupCrawler: return (val or "").strip() return el.get_text(strip=True) - async def fetch_with_bs4( + async def fetch_urls( self, - urls: Union[str, List[str], Dict[str, Dict[str, Any]]], - extraction_rules: Optional[Dict[str, Any]] = None, + urls: Union[str, List[str]], *, use_playwright: bool = False, playwright_js_wait: float = 0.8, - join_all_matches: bool = False, ) -> Dict[str, str]: """Fetch and extract content from URLs using BeautifulSoup or Playwright. @@ -516,38 +515,11 @@ class BeautifulSoupCrawler: ValueError: If extraction_rules are missing when required or if urls is invalid. Exception: If fetching or extraction fails. """ - url_rules_map: Dict[str, Dict[str, Any]] = {} - if isinstance(urls, str): - if not extraction_rules: - raise ValueError("extraction_rules required when urls is a string") - url_rules_map[urls] = extraction_rules - elif isinstance(urls, list): - if not extraction_rules: - raise ValueError("extraction_rules required when urls is a list") - for url in urls: - url_rules_map[url] = extraction_rules - elif isinstance(urls, dict): - url_rules_map = urls + urls = [urls] else: raise ValueError(f"Invalid urls type: {type(urls)}") - logger.info( - f"Preparing to fetch {len(url_rules_map)} URL(s) with {len(extraction_rules) if extraction_rules else 0} extraction rule(s)" - ) - - normalized_url_rules: Dict[str, List[ExtractionRule]] = {} - for url, rules in url_rules_map.items(): - normalized_rules = [] - for _, rule in rules.items(): - r = self._normalize_rule(rule) - if join_all_matches: - r.all = True - normalized_rules.append(r) - normalized_url_rules[url] = normalized_rules - - logger.info(f"Normalized extraction rules for {len(normalized_url_rules)} URL(s)") - async def _task(url: str): async with self._sem: try: @@ -575,30 +547,21 @@ class BeautifulSoupCrawler: logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") - # Extract content - pieces = [] - for rule in 
normalized_url_rules[url]: - text = self._extract_with_bs4(html, rule) - if text: - pieces.append(text) - - concatenated = " ".join(pieces).strip() - logger.info(f"Extracted {len(concatenated)} characters from {url}") - return url, concatenated + return url, html except Exception as e: logger.error(f"Error processing {url}: {e}") return url, "" - logger.info(f"Creating {len(url_rules_map)} async tasks for concurrent fetching") - tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()] + logger.info(f"Creating {len(urls)} async tasks for concurrent fetching") + tasks = [asyncio.create_task(_task(u)) for u in urls] results = {} completed = 0 total = len(tasks) for coro in asyncio.as_completed(tasks): - url, text = await coro - results[url] = text + url, html = await coro + results[url] = html completed += 1 logger.info(f"Progress: {completed}/{total} URLs processed") diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index a32b6848c..8b8bcc11f 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -5,19 +5,17 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping """ from typing import Dict, List, Union, Optional, Literal +from cognee.context_global_variables import soup_crawler_config, tavily_config from cognee.shared.logging_utils import get_logger from .bs4_crawler import BeautifulSoupCrawler -from .config import TavilyConfig, SoupCrawlerConfig +from .config import TavilyConfig logger = get_logger(__name__) async def fetch_page_content( urls: Union[str, List[str]], - *, preferred_tool: Optional[Literal["tavily", "beautifulsoup"]] = "beautifulsoup", - tavily_config: Optional[TavilyConfig] = None, - soup_crawler_config: Optional[SoupCrawlerConfig] = None, ) -> Dict[str, str]: """Fetch content from one or more URLs using the specified tool. 
@@ -48,6 +46,9 @@ async def fetch_page_content( url_list = [urls] if isinstance(urls, str) else urls logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}") + _tavily_config = tavily_config.get() + _soup_crawler_config = soup_crawler_config.get() + if preferred_tool == "tavily": if not tavily_config or tavily_config.api_key is None: raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily") @@ -62,11 +63,10 @@ async def fetch_page_content( "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1" ) raise ImportError - if not soup_crawler_config or soup_crawler_config.extraction_rules is None: - raise ValueError("extraction_rules must be provided when not using Tavily") + if soup_crawler_config is None or soup_crawler_config.extraction_rules is None: + raise ValueError("soup_crawler_config must be provided when not using Tavily") logger.info("Using BeautifulSoup for content extraction") - extraction_rules = soup_crawler_config.extraction_rules logger.info( f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s" ) @@ -85,12 +85,10 @@ async def fetch_page_content( logger.info( f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})" ) - results = await crawler.fetch_with_bs4( + results = await crawler.fetch_urls( urls, - extraction_rules, use_playwright=soup_crawler_config.use_playwright, playwright_js_wait=soup_crawler_config.playwright_js_wait, - join_all_matches=soup_crawler_config.join_all_matches, ) logger.info(f"Successfully fetched content from {len(results)} URL(s)") return results @@ -103,7 +101,7 @@ async def fetch_page_content( async def fetch_with_tavily( - urls: Union[str, List[str]], tavily_config: Optional[TavilyConfig] = None + urls: Union[str, List[str]], tavily_config: TavilyConfig ) 
-> Dict[str, str]: """Fetch content from URLs using the Tavily API. @@ -133,8 +131,8 @@ async def fetch_with_tavily( f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s" ) client = AsyncTavilyClient( - api_key=tavily_config.api_key if tavily_config else None, - proxies=tavily_config.proxies if tavily_config else None, + api_key=tavily_config.api_key, + proxies=tavily_config.proxies, ) logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)") From a7ff18801866def587d028c7258749d4d9e6d80f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 15:22:40 +0100 Subject: [PATCH 30/44] add crawler tests --- .../web_url_crawler/test_bs4_crawler.py | 13 +++++++++++++ .../web_url_crawler/test_tavily_crawler.py | 15 +++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 cognee/tests/integration/web_url_crawler/test_bs4_crawler.py create mode 100644 cognee/tests/integration/web_url_crawler/test_tavily_crawler.py diff --git a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py new file mode 100644 index 000000000..0e7637d86 --- /dev/null +++ b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py @@ -0,0 +1,13 @@ +import pytest +from cognee.tasks.web_scraper import BeautifulSoupCrawler + + +@pytest.mark.asyncio +async def test_fetch(): + crawler = BeautifulSoupCrawler() + url = "https://en.wikipedia.org/wiki/Large_language_model" + results = await crawler.fetch_urls(url) + assert len(results) == 1 + assert isinstance(results, dict) + html = results[url] + assert isinstance(html, str) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py new file mode 100644 index 000000000..7edb9b8d3 --- /dev/null +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -0,0 +1,15 @@ +import os +import pytest +from 
cognee.tasks.web_scraper.config import TavilyConfig +from cognee.tasks.web_scraper.utils import fetch_with_tavily + + +@pytest.mark.asyncio +async def test_fetch(): + url = "https://en.wikipedia.org/wiki/Large_language_model" + tavily_config = TavilyConfig() + results = await fetch_with_tavily(url, tavily_config) + assert len(results) == 1 + assert isinstance(results, dict) + html = results[url] + assert isinstance(html, str) From 9d9969676f105d60e46c6bdf7d0b75a4b5f3c8bb Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 15:49:12 +0100 Subject: [PATCH 31/44] Separate BeautifulSoup crawling from fetching --- cognee/context_global_variables.py | 6 - .../tasks/ingestion/data_fetchers/__init__.py | 8 - .../data_fetchers/data_fetcher_interface.py | 15 - .../data_fetchers/web_url_fetcher.py | 55 --- cognee/tasks/web_scraper/__init__.py | 2 + cognee/tasks/web_scraper/bs4_crawler.py | 441 +---------------- cognee/tasks/web_scraper/config.py | 2 +- .../tasks/web_scraper/default_url_crawler.py | 446 ++++++++++++++++++ cognee/tasks/web_scraper/utils.py | 71 ++- cognee/tasks/web_scraper/web_scraper_task.py | 10 +- .../tasks/web_scraping/web_scraping_test.py | 6 +- 11 files changed, 489 insertions(+), 573 deletions(-) delete mode 100644 cognee/tasks/ingestion/data_fetchers/__init__.py delete mode 100644 cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py delete mode 100644 cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py create mode 100644 cognee/tasks/web_scraper/default_url_crawler.py diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 388316359..aad53341a 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -7,18 +7,12 @@ from cognee.base_config import get_base_config from cognee.infrastructure.databases.utils import get_or_create_dataset_database from cognee.infrastructure.files.storage.config import file_storage_config from cognee.modules.users.methods 
import get_user -from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig # Note: ContextVar allows us to use different graph db configurations in Cognee # for different async tasks, threads and processes vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) -soup_crawler_config: ContextVar[SoupCrawlerConfig | None] = ContextVar( - "soup_crawler_config", default=None -) -tavily_config: ContextVar[TavilyConfig | None] = ContextVar("tavily_config", default=None) - async def set_session_user_context_variable(user): session_user.set(user) diff --git a/cognee/tasks/ingestion/data_fetchers/__init__.py b/cognee/tasks/ingestion/data_fetchers/__init__.py deleted file mode 100644 index 63530b427..000000000 --- a/cognee/tasks/ingestion/data_fetchers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -__all__ = [] - -try: - from .web_url_fetcher import WebUrlFetcher - - __all__.append("WebUrlFetcher") -except ImportError: - pass diff --git a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py deleted file mode 100644 index 9171e429d..000000000 --- a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py +++ /dev/null @@ -1,15 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any - - -class DataFetcherInterface(ABC): - @abstractmethod - def fetcher_name(self) -> str: - pass - - @abstractmethod - async def fetch(self, data_item_path: str) -> str: - """ - args: data_item_path - path to the data item - """ - pass diff --git a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py deleted file mode 100644 index 949cb9b0a..000000000 --- a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -from cognee.modules.ingestion import save_data_to_file 
-from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface -from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig -from cognee.shared.logging_utils import get_logger - -logger = get_logger() - - -class WebUrlFetcher(DataFetcherInterface): - def __init__(self): ... - - def fetcher_name(self): - return "web_url_fetcher" - - async def fetch(self, data_item_path: str): - from cognee.context_global_variables import tavily_config, soup_crawler_config - from cognee.tasks.web_scraper import fetch_page_content - - if os.getenv("TAVILY_API_KEY"): - _tavily_config = TavilyConfig() - _soup_config = None - preferred_tool = "tavily" - else: - _tavily_config = None - _soup_config = SoupCrawlerConfig() - preferred_tool = "beautifulsoup" - - tavily_config.set(_tavily_config) - soup_crawler_config.set(_soup_config) - - logger.info(f"Starting web URL crawling for: {data_item_path}") - logger.info(f"Using scraping tool: {preferred_tool}") - - data = await fetch_page_content( - data_item_path, - preferred_tool=preferred_tool, - ) - - logger.info(f"Successfully fetched content from URL {data_item_path}") - - # fetch_page_content returns a dict like {url: content} - # Extract the content string before saving - if isinstance(data, dict): - # Concatenate all URL contents (usually just one URL) - content = "" - for url, text in data.items(): - content += f"{url}:\n{text}\n\n" - logger.info( - f"Extracted content from {len(data)} URL(s), total size: {len(content)} characters" - ) - else: - content = data - - return await save_data_to_file(content) diff --git a/cognee/tasks/web_scraper/__init__.py b/cognee/tasks/web_scraper/__init__.py index d8e580fad..f4d6677c7 100644 --- a/cognee/tasks/web_scraper/__init__.py +++ b/cognee/tasks/web_scraper/__init__.py @@ -8,6 +8,7 @@ BeautifulSoup or Tavily, defining data models, and handling scraping configurati from .bs4_crawler import BeautifulSoupCrawler from .utils import fetch_page_content from 
.web_scraper_task import cron_web_scraper_task, web_scraper_task +from .default_url_crawler import DefaultUrlCrawler __all__ = [ @@ -15,4 +16,5 @@ __all__ = [ "fetch_page_content", "cron_web_scraper_task", "web_scraper_task", + "DefaultUrlCrawler", ] diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py index 969058466..171a76633 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/tasks/web_scraper/bs4_crawler.py @@ -5,32 +5,13 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. supports robots.txt handling, rate limiting, and custom extraction rules. """ -import asyncio -import time from typing import Union, List, Dict, Any, Optional -from urllib.parse import urlparse -from dataclasses import dataclass, field -from functools import lru_cache -import httpx +from dataclasses import dataclass from bs4 import BeautifulSoup from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) -try: - from playwright.async_api import async_playwright -except ImportError: - logger.warning( - "Failed to import playwright, make sure to install using pip install playwright>=1.9.0" - ) - async_playwright = None - -try: - from protego import Protego -except ImportError: - logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1") - Protego = None - @dataclass class ExtractionRule: @@ -51,21 +32,6 @@ class ExtractionRule: join_with: str = " " -@dataclass -class RobotsTxtCache: - """Cache for robots.txt data. - - Attributes: - protego: Parsed robots.txt object (Protego instance). - crawl_delay: Delay between requests (in seconds). - timestamp: Time when the cache entry was created. 
- """ - - protego: Any - crawl_delay: float - timestamp: float = field(default_factory=time.time) - - # TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler class BeautifulSoupCrawler: """Crawler for fetching and extracting web content using BeautifulSoup. @@ -84,333 +50,6 @@ class BeautifulSoupCrawler: robots_cache_ttl: Time-to-live for robots.txt cache in seconds. """ - def __init__( - self, - *, - concurrency: int = 5, - crawl_delay: float = 0.5, - max_crawl_delay: Optional[float] = 10.0, - timeout: float = 15.0, - max_retries: int = 2, - retry_delay_factor: float = 0.5, - headers: Optional[Dict[str, str]] = None, - robots_cache_ttl: float = 3600.0, - ): - """Initialize the BeautifulSoupCrawler. - - Args: - concurrency: Number of concurrent requests allowed. - crawl_delay: Minimum seconds between requests to the same domain. - max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). - timeout: Per-request timeout in seconds. - max_retries: Number of retries for failed requests. - retry_delay_factor: Multiplier for exponential backoff on retries. - headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0). - robots_cache_ttl: Time-to-live for robots.txt cache in seconds. 
- """ - self.concurrency = concurrency - self._sem = asyncio.Semaphore(concurrency) - self.crawl_delay = crawl_delay - self.max_crawl_delay = max_crawl_delay - self.timeout = timeout - self.max_retries = max_retries - self.retry_delay_factor = retry_delay_factor - self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"} - self.robots_cache_ttl = robots_cache_ttl - self._last_request_time_per_domain: Dict[str, float] = {} - self._robots_cache: Dict[str, RobotsTxtCache] = {} - self._client: Optional[httpx.AsyncClient] = None - self._robots_lock = asyncio.Lock() - - async def _ensure_client(self): - """Initialize the HTTP client if not already created.""" - if self._client is None: - self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers) - - async def close(self): - """Close the HTTP client.""" - if self._client: - await self._client.aclose() - self._client = None - - async def __aenter__(self): - """Enter the context manager, initializing the HTTP client.""" - await self._ensure_client() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """Exit the context manager, closing the HTTP client.""" - await self.close() - - @lru_cache(maxsize=1024) - def _domain_from_url(self, url: str) -> str: - """Extract the domain (netloc) from a URL. - - Args: - url: The URL to parse. - - Returns: - str: The domain (netloc) of the URL. - """ - try: - return urlparse(url).netloc - except Exception: - return url - - @lru_cache(maxsize=1024) - def _get_domain_root(self, url: str) -> str: - """Get the root URL (scheme and netloc) from a URL. - - Args: - url: The URL to parse. - - Returns: - str: The root URL (e.g., "https://example.com"). - """ - parsed = urlparse(url) - return f"{parsed.scheme}://{parsed.netloc}" - - async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None): - """Enforce rate limiting for requests to the same domain. - - Args: - url: The URL to check. 
- crawl_delay: Custom crawl delay in seconds (if any). - """ - domain = self._domain_from_url(url) - last = self._last_request_time_per_domain.get(domain) - delay = crawl_delay if crawl_delay is not None else self.crawl_delay - - if last is None: - self._last_request_time_per_domain[domain] = time.time() - return - - elapsed = time.time() - last - wait_for = delay - elapsed - if wait_for > 0: - logger.info( - f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)" - ) - await asyncio.sleep(wait_for) - logger.info(f"Rate limit wait completed for {url}") - self._last_request_time_per_domain[domain] = time.time() - - async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]: - """Get cached robots.txt data if valid. - - Args: - domain_root: The root URL (e.g., "https://example.com"). - - Returns: - Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found. - """ - if Protego is None: - return None - - cached = self._robots_cache.get(domain_root) - if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl: - return cached - return None - - async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache: - """Fetch and cache robots.txt data. - - Args: - domain_root: The root URL (e.g., "https://example.com"). - - Returns: - RobotsTxtCache: Cached robots.txt data with crawl delay. - - Raises: - Exception: If fetching robots.txt fails. 
- """ - async with self._robots_lock: - cached = await self._get_robots_cache(domain_root) - if cached: - return cached - - robots_url = f"{domain_root}/robots.txt" - try: - await self._ensure_client() - await self._respect_rate_limit(robots_url, self.crawl_delay) - resp = await self._client.get(robots_url, timeout=5.0) - content = resp.text if resp.status_code == 200 else "" - except Exception as e: - logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}") - content = "" - - protego = Protego.parse(content) if content.strip() else None - agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*") - - crawl_delay = self.crawl_delay - if protego: - delay = protego.crawl_delay(agent) or protego.crawl_delay("*") - if delay: - # Apply max_crawl_delay cap if configured - if self.max_crawl_delay is not None and delay > self.max_crawl_delay: - logger.warning( - f"robots.txt specifies crawl_delay={delay}s for {domain_root}, " - f"capping to max_crawl_delay={self.max_crawl_delay}s" - ) - crawl_delay = self.max_crawl_delay - else: - crawl_delay = delay - - cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay) - self._robots_cache[domain_root] = cache_entry - return cache_entry - - async def _is_url_allowed(self, url: str) -> bool: - """Check if a URL is allowed by robots.txt. - - Args: - url: The URL to check. - - Returns: - bool: True if the URL is allowed, False otherwise. 
- """ - if Protego is None: - return True - - try: - domain_root = self._get_domain_root(url) - cache = await self._get_robots_cache(domain_root) - if cache is None: - cache = await self._fetch_and_cache_robots(domain_root) - - if cache.protego is None: - return True - - agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*") - return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url) - except Exception as e: - logger.debug(f"Error checking robots.txt for {url}: {e}") - return True - - async def _get_crawl_delay(self, url: str) -> float: - """Get the crawl delay for a URL from robots.txt. - - Args: - url: The URL to check. - - Returns: - float: Crawl delay in seconds. - """ - if Protego is None: - return self.crawl_delay - - try: - domain_root = self._get_domain_root(url) - cache = await self._get_robots_cache(domain_root) - if cache is None: - cache = await self._fetch_and_cache_robots(domain_root) - return cache.crawl_delay - except Exception: - return self.crawl_delay - - async def _fetch_httpx(self, url: str) -> str: - """Fetch a URL using HTTPX with retries. - - Args: - url: The URL to fetch. - - Returns: - str: The HTML content of the page. - - Raises: - Exception: If all retry attempts fail. 
- """ - await self._ensure_client() - assert self._client is not None, "HTTP client not initialized" - - attempt = 0 - crawl_delay = await self._get_crawl_delay(url) - logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}") - - while True: - try: - await self._respect_rate_limit(url, crawl_delay) - resp = await self._client.get(url) - resp.raise_for_status() - logger.info( - f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)" - ) - return resp.text - except Exception as exc: - attempt += 1 - if attempt > self.max_retries: - logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}") - raise - - delay = self.retry_delay_factor * (2 ** (attempt - 1)) - logger.warning( - f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}" - ) - await asyncio.sleep(delay) - - async def _render_with_playwright( - self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None - ) -> str: - """Fetch and render a URL using Playwright for JavaScript content. - - Args: - url: The URL to fetch. - js_wait: Seconds to wait for JavaScript to load. - timeout: Timeout for the request (in seconds, defaults to instance timeout). - - Returns: - str: The rendered HTML content. - - Raises: - RuntimeError: If Playwright is not installed. - Exception: If all retry attempts fail. - """ - if async_playwright is None: - raise RuntimeError( - "Playwright is not installed. Install with `pip install playwright` and run `playwright install`." 
- ) - - timeout_val = timeout or self.timeout - logger.info( - f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}" - ) - - attempt = 0 - while True: - try: - async with async_playwright() as p: - logger.info(f"Launching headless Chromium browser for {url}") - browser = await p.chromium.launch(headless=True) - try: - context = await browser.new_context() - page = await context.new_page() - logger.info(f"Navigating to {url} and waiting for network idle") - await page.goto( - url, - wait_until="networkidle", - timeout=int(timeout_val * 1000), - ) - if js_wait: - logger.info(f"Waiting {js_wait}s for JavaScript to execute") - await asyncio.sleep(js_wait) - content = await page.content() - logger.info( - f"Successfully rendered {url} with Playwright (size={len(content)} bytes)" - ) - return content - finally: - await browser.close() - except Exception as exc: - attempt += 1 - if attempt > self.max_retries: - logger.error(f"Playwright fetch failed for {url}: {exc}") - raise - backoff = self.retry_delay_factor * (2 ** (attempt - 1)) - logger.warning( - f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})" - ) - await asyncio.sleep(backoff) - def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. @@ -435,7 +74,7 @@ class BeautifulSoupCrawler: ) raise ValueError(f"Invalid extraction rule: {rule}") - def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str: + def extract(self, html: str, rule: ExtractionRule) -> str: """Extract content from HTML using BeautifulSoup or lxml XPath. 
Args: @@ -491,79 +130,3 @@ class BeautifulSoupCrawler: val = el.get(rule.attr) return (val or "").strip() return el.get_text(strip=True) - - async def fetch_urls( - self, - urls: Union[str, List[str]], - *, - use_playwright: bool = False, - playwright_js_wait: float = 0.8, - ) -> Dict[str, str]: - """Fetch and extract content from URLs using BeautifulSoup or Playwright. - - Args: - urls: A single URL, list of URLs, or dict mapping URLs to extraction rules. - extraction_rules: Default extraction rules for string or list URLs. - use_playwright: If True, use Playwright for JavaScript rendering. - playwright_js_wait: Seconds to wait for JavaScript to load. - join_all_matches: If True, extract all matching elements for each rule. - - Returns: - Dict[str, str]: A dictionary mapping URLs to their extracted content. - - Raises: - ValueError: If extraction_rules are missing when required or if urls is invalid. - Exception: If fetching or extraction fails. - """ - if isinstance(urls, str): - urls = [urls] - else: - raise ValueError(f"Invalid urls type: {type(urls)}") - - async def _task(url: str): - async with self._sem: - try: - logger.info(f"Processing URL: {url}") - - # Check robots.txt - allowed = await self._is_url_allowed(url) - if not allowed: - logger.warning(f"URL disallowed by robots.txt: {url}") - return url, "" - - logger.info(f"Robots.txt check passed for {url}") - - # Fetch HTML - if use_playwright: - logger.info( - f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)" - ) - html = await self._render_with_playwright( - url, js_wait=playwright_js_wait, timeout=self.timeout - ) - else: - logger.info(f"Fetching {url} with httpx") - html = await self._fetch_httpx(url) - - logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") - - return url, html - - except Exception as e: - logger.error(f"Error processing {url}: {e}") - return url, "" - - logger.info(f"Creating {len(urls)} async tasks for concurrent fetching") - tasks = 
[asyncio.create_task(_task(u)) for u in urls] - results = {} - completed = 0 - total = len(tasks) - - for coro in asyncio.as_completed(tasks): - url, html = await coro - results[url] = html - completed += 1 - logger.info(f"Progress: {completed}/{total} URLs processed") - - logger.info(f"Completed fetching all {len(results)} URL(s)") - return results diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index ac470daa9..fcf22ab33 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -10,7 +10,7 @@ class TavilyConfig(BaseModel): timeout: Optional[int] = Field(default=10, ge=1, le=60) -class SoupCrawlerConfig(BaseModel): +class DefaultCrawlerConfig(BaseModel): concurrency: int = 5 crawl_delay: float = 0.5 max_crawl_delay: Optional[float] = ( diff --git a/cognee/tasks/web_scraper/default_url_crawler.py b/cognee/tasks/web_scraper/default_url_crawler.py new file mode 100644 index 000000000..d9d2ee922 --- /dev/null +++ b/cognee/tasks/web_scraper/default_url_crawler.py @@ -0,0 +1,446 @@ +import asyncio +from dataclasses import dataclass, field +from functools import lru_cache +import time +from typing import Any, Union, List, Dict, Optional +from urllib.parse import urlparse +import httpx + +from cognee.shared.logging_utils import get_logger +from cognee.tasks.web_scraper.utils import UrlsToHtmls + +logger = get_logger() + +try: + from protego import Protego +except ImportError: + logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1") + Protego = None + +try: + from playwright.async_api import async_playwright +except ImportError: + logger.warning( + "Failed to import playwright, make sure to install using pip install playwright>=1.9.0" + ) + async_playwright = None + + +@dataclass +class RobotsTxtCache: + """Cache for robots.txt data. + + Attributes: + protego: Parsed robots.txt object (Protego instance). + crawl_delay: Delay between requests (in seconds). 
+ timestamp: Time when the cache entry was created. + """ + + protego: Any + crawl_delay: float + timestamp: float = field(default_factory=time.time) + + +class DefaultUrlCrawler: + def __init__( + self, + *, + concurrency: int = 5, + crawl_delay: float = 0.5, + max_crawl_delay: Optional[float] = 10.0, + timeout: float = 15.0, + max_retries: int = 2, + retry_delay_factor: float = 0.5, + headers: Optional[Dict[str, str]] = None, + robots_cache_ttl: float = 3600.0, + ): + """Initialize the DefaultUrlCrawler. + + Args: + concurrency: Number of concurrent requests allowed. + crawl_delay: Minimum seconds between requests to the same domain. + max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). + timeout: Per-request timeout in seconds. + max_retries: Number of retries for failed requests. + retry_delay_factor: Multiplier for exponential backoff on retries. + headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0). + robots_cache_ttl: Time-to-live for robots.txt cache in seconds. 
+ """ + self.concurrency = concurrency + self._sem = asyncio.Semaphore(concurrency) + self.crawl_delay = crawl_delay + self.max_crawl_delay = max_crawl_delay + self.timeout = timeout + self.max_retries = max_retries + self.retry_delay_factor = retry_delay_factor + self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"} + self.robots_cache_ttl = robots_cache_ttl + self._last_request_time_per_domain: Dict[str, float] = {} + self._robots_cache: Dict[str, RobotsTxtCache] = {} + self._client: Optional[httpx.AsyncClient] = None + self._robots_lock = asyncio.Lock() + + async def _ensure_client(self): + """Initialize the HTTP client if not already created.""" + if self._client is None: + self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers) + + async def close(self): + """Close the HTTP client.""" + if self._client: + await self._client.aclose() + self._client = None + + async def __aenter__(self): + """Enter the context manager, initializing the HTTP client.""" + await self._ensure_client() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Exit the context manager, closing the HTTP client.""" + await self.close() + + @lru_cache(maxsize=1024) + def _domain_from_url(self, url: str) -> str: + """Extract the domain (netloc) from a URL. + + Args: + url: The URL to parse. + + Returns: + str: The domain (netloc) of the URL. + """ + try: + return urlparse(url).netloc + except Exception: + return url + + @lru_cache(maxsize=1024) + def _get_domain_root(self, url: str) -> str: + """Get the root URL (scheme and netloc) from a URL. + + Args: + url: The URL to parse. + + Returns: + str: The root URL (e.g., "https://example.com"). + """ + parsed = urlparse(url) + return f"{parsed.scheme}://{parsed.netloc}" + + async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None): + """Enforce rate limiting for requests to the same domain. + + Args: + url: The URL to check. 
+ crawl_delay: Custom crawl delay in seconds (if any). + """ + domain = self._domain_from_url(url) + last = self._last_request_time_per_domain.get(domain) + delay = crawl_delay if crawl_delay is not None else self.crawl_delay + + if last is None: + self._last_request_time_per_domain[domain] = time.time() + return + + elapsed = time.time() - last + wait_for = delay - elapsed + if wait_for > 0: + logger.info( + f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)" + ) + await asyncio.sleep(wait_for) + logger.info(f"Rate limit wait completed for {url}") + self._last_request_time_per_domain[domain] = time.time() + + async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]: + """Get cached robots.txt data if valid. + + Args: + domain_root: The root URL (e.g., "https://example.com"). + + Returns: + Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found. + """ + if Protego is None: + return None + + cached = self._robots_cache.get(domain_root) + if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl: + return cached + return None + + async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache: + """Fetch and cache robots.txt data. + + Args: + domain_root: The root URL (e.g., "https://example.com"). + + Returns: + RobotsTxtCache: Cached robots.txt data with crawl delay. + + Raises: + Exception: If fetching robots.txt fails. 
+ """ + async with self._robots_lock: + cached = await self._get_robots_cache(domain_root) + if cached: + return cached + + robots_url = f"{domain_root}/robots.txt" + try: + await self._ensure_client() + await self._respect_rate_limit(robots_url, self.crawl_delay) + resp = await self._client.get(robots_url, timeout=5.0) + content = resp.text if resp.status_code == 200 else "" + except Exception as e: + logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}") + content = "" + + protego = Protego.parse(content) if content.strip() else None + agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*") + + crawl_delay = self.crawl_delay + if protego: + delay = protego.crawl_delay(agent) or protego.crawl_delay("*") + if delay: + # Apply max_crawl_delay cap if configured + if self.max_crawl_delay is not None and delay > self.max_crawl_delay: + logger.warning( + f"robots.txt specifies crawl_delay={delay}s for {domain_root}, " + f"capping to max_crawl_delay={self.max_crawl_delay}s" + ) + crawl_delay = self.max_crawl_delay + else: + crawl_delay = delay + + cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay) + self._robots_cache[domain_root] = cache_entry + return cache_entry + + async def _is_url_allowed(self, url: str) -> bool: + """Check if a URL is allowed by robots.txt. + + Args: + url: The URL to check. + + Returns: + bool: True if the URL is allowed, False otherwise. 
+ """ + if Protego is None: + return True + + try: + domain_root = self._get_domain_root(url) + cache = await self._get_robots_cache(domain_root) + if cache is None: + cache = await self._fetch_and_cache_robots(domain_root) + + if cache.protego is None: + return True + + agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*") + return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url) + except Exception as e: + logger.debug(f"Error checking robots.txt for {url}: {e}") + return True + + async def _get_crawl_delay(self, url: str) -> float: + """Get the crawl delay for a URL from robots.txt. + + Args: + url: The URL to check. + + Returns: + float: Crawl delay in seconds. + """ + if Protego is None: + return self.crawl_delay + + try: + domain_root = self._get_domain_root(url) + cache = await self._get_robots_cache(domain_root) + if cache is None: + cache = await self._fetch_and_cache_robots(domain_root) + return cache.crawl_delay + except Exception: + return self.crawl_delay + + async def _fetch_httpx(self, url: str) -> str: + """Fetch a URL using HTTPX with retries. + + Args: + url: The URL to fetch. + + Returns: + str: The HTML content of the page. + + Raises: + Exception: If all retry attempts fail. 
+ """ + await self._ensure_client() + assert self._client is not None, "HTTP client not initialized" + + attempt = 0 + crawl_delay = await self._get_crawl_delay(url) + logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}") + + while True: + try: + await self._respect_rate_limit(url, crawl_delay) + resp = await self._client.get(url) + resp.raise_for_status() + logger.info( + f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)" + ) + return resp.text + except Exception as exc: + attempt += 1 + if attempt > self.max_retries: + logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}") + raise + + delay = self.retry_delay_factor * (2 ** (attempt - 1)) + logger.warning( + f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}" + ) + await asyncio.sleep(delay) + + async def _render_with_playwright( + self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None + ) -> str: + """Fetch and render a URL using Playwright for JavaScript content. + + Args: + url: The URL to fetch. + js_wait: Seconds to wait for JavaScript to load. + timeout: Timeout for the request (in seconds, defaults to instance timeout). + + Returns: + str: The rendered HTML content. + + Raises: + RuntimeError: If Playwright is not installed. + Exception: If all retry attempts fail. + """ + if async_playwright is None: + raise RuntimeError( + "Playwright is not installed. Install with `pip install playwright` and run `playwright install`." 
+ ) + + timeout_val = timeout or self.timeout + logger.info( + f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}" + ) + + attempt = 0 + while True: + try: + async with async_playwright() as p: + logger.info(f"Launching headless Chromium browser for {url}") + browser = await p.chromium.launch(headless=True) + try: + context = await browser.new_context() + page = await context.new_page() + logger.info(f"Navigating to {url} and waiting for network idle") + await page.goto( + url, + wait_until="networkidle", + timeout=int(timeout_val * 1000), + ) + if js_wait: + logger.info(f"Waiting {js_wait}s for JavaScript to execute") + await asyncio.sleep(js_wait) + content = await page.content() + logger.info( + f"Successfully rendered {url} with Playwright (size={len(content)} bytes)" + ) + return content + finally: + await browser.close() + except Exception as exc: + attempt += 1 + if attempt > self.max_retries: + logger.error(f"Playwright fetch failed for {url}: {exc}") + raise + backoff = self.retry_delay_factor * (2 ** (attempt - 1)) + logger.warning( + f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})" + ) + await asyncio.sleep(backoff) + + async def fetch_urls( + self, + urls: Union[str, List[str]], + *, + use_playwright: bool = False, + playwright_js_wait: float = 0.8, + ) -> UrlsToHtmls: + """Fetch and extract content from URLs using BeautifulSoup or Playwright. + + Args: + urls: A single URL, list of URLs, or dict mapping URLs to extraction rules. + extraction_rules: Default extraction rules for string or list URLs. + use_playwright: If True, use Playwright for JavaScript rendering. + playwright_js_wait: Seconds to wait for JavaScript to load. + join_all_matches: If True, extract all matching elements for each rule. + + Returns: + Dict[str, str]: A dictionary mapping URLs to their extracted content. + + Raises: + ValueError: If extraction_rules are missing when required or if urls is invalid. 
+ Exception: If fetching or extraction fails. + """ + if isinstance(urls, str): + urls = [urls] + else: + raise ValueError(f"Invalid urls type: {type(urls)}") + + async def _task(url: str): + async with self._sem: + try: + logger.info(f"Processing URL: {url}") + + # Check robots.txt + allowed = await self._is_url_allowed(url) + if not allowed: + logger.warning(f"URL disallowed by robots.txt: {url}") + return url, "" + + logger.info(f"Robots.txt check passed for {url}") + + # Fetch HTML + if use_playwright: + logger.info( + f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)" + ) + html = await self._render_with_playwright( + url, js_wait=playwright_js_wait, timeout=self.timeout + ) + else: + logger.info(f"Fetching {url} with httpx") + html = await self._fetch_httpx(url) + + logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") + + return url, html + + except Exception as e: + logger.error(f"Error processing {url}: {e}") + return url, "" + + logger.info(f"Creating {len(urls)} async tasks for concurrent fetching") + tasks = [asyncio.create_task(_task(u)) for u in urls] + results = {} + completed = 0 + total = len(tasks) + + for coro in asyncio.as_completed(tasks): + url, html = await coro + results[url] = html + completed += 1 + logger.info(f"Progress: {completed}/{total} URLs processed") + + logger.info(f"Completed fetching all {len(results)} URL(s)") + return results diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 8b8bcc11f..0cbd355a3 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -4,19 +4,20 @@ This module provides functions to fetch and extract content from web pages, supp both BeautifulSoup for custom extraction rules and Tavily for API-based scraping. 
""" -from typing import Dict, List, Union, Optional, Literal -from cognee.context_global_variables import soup_crawler_config, tavily_config +import os +from re import L +from typing import List, Union, TypeAlias from cognee.shared.logging_utils import get_logger +from .default_url_crawler import DefaultUrlCrawler from .bs4_crawler import BeautifulSoupCrawler -from .config import TavilyConfig +from .config import DefaultCrawlerConfig, TavilyConfig logger = get_logger(__name__) +UrlsToHtmls: TypeAlias = dict[str, str] -async def fetch_page_content( - urls: Union[str, List[str]], - preferred_tool: Optional[Literal["tavily", "beautifulsoup"]] = "beautifulsoup", -) -> Dict[str, str]: + +async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: """Fetch content from one or more URLs using the specified tool. This function retrieves web page content using either BeautifulSoup (with custom @@ -29,7 +30,7 @@ async def fetch_page_content( Defaults to "beautifulsoup". tavily_config: Configuration for Tavily API, including API key. Required if preferred_tool is "tavily". - soup_crawler_config: Configuration for BeautifulSoup crawler, including + default_crawler_config: Configuration for BeautifulSoup crawler, including extraction rules. Required if preferred_tool is "beautifulsoup" and extraction_rules are needed. @@ -44,51 +45,39 @@ async def fetch_page_content( installed. 
""" url_list = [urls] if isinstance(urls, str) else urls - logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}") - _tavily_config = tavily_config.get() - _soup_crawler_config = soup_crawler_config.get() - - if preferred_tool == "tavily": - if not tavily_config or tavily_config.api_key is None: - raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily") - logger.info("Using Tavily API for content extraction") + if os.getenv("TAVILY_API_KEY"): + logger.info("Using Tavily API for url fetching") return await fetch_with_tavily(urls, tavily_config) + else: + logger.info("Using default crawler for content extraction") - if preferred_tool == "beautifulsoup": - try: - from bs4 import BeautifulSoup as _ # noqa: F401 - except ImportError: - logger.error( - "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1" - ) - raise ImportError - if soup_crawler_config is None or soup_crawler_config.extraction_rules is None: - raise ValueError("soup_crawler_config must be provided when not using Tavily") + default_crawler_config = ( + DefaultCrawlerConfig() + ) # We've decided to use defaults, and configure through env vars as needed - logger.info("Using BeautifulSoup for content extraction") logger.info( - f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s" + f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s" ) - crawler = BeautifulSoupCrawler( - concurrency=soup_crawler_config.concurrency, - crawl_delay=soup_crawler_config.crawl_delay, - max_crawl_delay=soup_crawler_config.max_crawl_delay, - timeout=soup_crawler_config.timeout, - max_retries=soup_crawler_config.max_retries, - 
retry_delay_factor=soup_crawler_config.retry_delay_factor, - headers=soup_crawler_config.headers, - robots_cache_ttl=soup_crawler_config.robots_cache_ttl, + crawler = DefaultUrlCrawler( + concurrency=default_crawler_config.concurrency, + crawl_delay=default_crawler_config.crawl_delay, + max_crawl_delay=default_crawler_config.max_crawl_delay, + timeout=default_crawler_config.timeout, + max_retries=default_crawler_config.max_retries, + retry_delay_factor=default_crawler_config.retry_delay_factor, + headers=default_crawler_config.headers, + robots_cache_ttl=default_crawler_config.robots_cache_ttl, ) try: logger.info( - f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})" + f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})" ) results = await crawler.fetch_urls( urls, - use_playwright=soup_crawler_config.use_playwright, - playwright_js_wait=soup_crawler_config.playwright_js_wait, + use_playwright=default_crawler_config.use_playwright, + playwright_js_wait=default_crawler_config.playwright_js_wait, ) logger.info(f"Successfully fetched content from {len(results)} URL(s)") return results @@ -102,7 +91,7 @@ async def fetch_page_content( async def fetch_with_tavily( urls: Union[str, List[str]], tavily_config: TavilyConfig -) -> Dict[str, str]: +) -> UrlsToHtmls: """Fetch content from URLs using the Tavily API. 
Args: diff --git a/cognee/tasks/web_scraper/web_scraper_task.py b/cognee/tasks/web_scraper/web_scraper_task.py index 52154c6ef..2bade3719 100644 --- a/cognee/tasks/web_scraper/web_scraper_task.py +++ b/cognee/tasks/web_scraper/web_scraper_task.py @@ -19,7 +19,7 @@ from cognee.tasks.storage.index_graph_edges import index_graph_edges from cognee.modules.engine.operations.setup import setup from .models import WebPage, WebSite, ScrapingJob -from .config import SoupCrawlerConfig, TavilyConfig +from .config import DefaultCrawlerConfig, TavilyConfig from .utils import fetch_page_content try: @@ -47,7 +47,7 @@ async def cron_web_scraper_task( schedule: str = None, extraction_rules: dict = None, tavily_api_key: str = os.getenv("TAVILY_API_KEY"), - soup_crawler_config: SoupCrawlerConfig = None, + soup_crawler_config: DefaultCrawlerConfig = None, tavily_config: TavilyConfig = None, job_name: str = "scraping", ): @@ -121,7 +121,7 @@ async def web_scraper_task( schedule: str = None, extraction_rules: dict = None, tavily_api_key: str = os.getenv("TAVILY_API_KEY"), - soup_crawler_config: SoupCrawlerConfig = None, + soup_crawler_config: DefaultCrawlerConfig = None, tavily_config: TavilyConfig = None, job_name: str = None, ): @@ -341,7 +341,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle soup_crawler_config: Configuration for BeautifulSoup crawler. Returns: - Tuple[SoupCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config, + Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config, tavily_config, and preferred_tool ("tavily" or "beautifulsoup"). 
Raises: @@ -350,7 +350,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle preferred_tool = "beautifulsoup" if extraction_rules and not soup_crawler_config: - soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules) + soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules) if tavily_api_key: if not tavily_config: diff --git a/cognee/tests/tasks/web_scraping/web_scraping_test.py b/cognee/tests/tasks/web_scraping/web_scraping_test.py index bf66b5155..81c58ac8d 100644 --- a/cognee/tests/tasks/web_scraping/web_scraping_test.py +++ b/cognee/tests/tasks/web_scraping/web_scraping_test.py @@ -1,6 +1,6 @@ import asyncio import cognee -from cognee.tasks.web_scraper.config import SoupCrawlerConfig +from cognee.tasks.web_scraper.config import DefaultCrawlerConfig from cognee.tasks.web_scraper import cron_web_scraper_task @@ -14,7 +14,7 @@ async def test_web_scraping_using_bs4(): "authors": {"selector": ".quote small", "all": True}, } - soup_config = SoupCrawlerConfig( + soup_config = DefaultCrawlerConfig( concurrency=5, crawl_delay=0.5, timeout=15.0, @@ -47,7 +47,7 @@ async def test_web_scraping_using_bs4_and_incremental_loading(): url = "https://books.toscrape.com/" rules = {"titles": "article.product_pod h3 a", "prices": "article.product_pod p.price_color"} - soup_config = SoupCrawlerConfig( + soup_config = DefaultCrawlerConfig( concurrency=1, crawl_delay=0.1, timeout=10.0, From 16e1c609253f74a36061b49e3ef533e9b5490272 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 16:43:56 +0100 Subject: [PATCH 32/44] move bs4 html parsing into `bs4_loader` --- .../loaders/external/bs4_loader.py} | 24 ++++++++++++++++--- cognee/tasks/web_scraper/__init__.py | 1 - cognee/tasks/web_scraper/utils.py | 8 +++---- .../web_url_crawler/test_bs4_crawler.py | 4 ++-- 4 files changed, 26 insertions(+), 11 deletions(-) rename cognee/{tasks/web_scraper/bs4_crawler.py => 
infrastructure/loaders/external/bs4_loader.py} (89%) diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/infrastructure/loaders/external/bs4_loader.py similarity index 89% rename from cognee/tasks/web_scraper/bs4_crawler.py rename to cognee/infrastructure/loaders/external/bs4_loader.py index 171a76633..8022de04f 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/infrastructure/loaders/external/bs4_loader.py @@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. supports robots.txt handling, rate limiting, and custom extraction rules. """ -from typing import Union, List, Dict, Any, Optional +from typing import Union, Dict, Any, Optional, List from dataclasses import dataclass from bs4 import BeautifulSoup +from cognee.infrastructure.loaders import LoaderInterface from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) @@ -32,8 +33,7 @@ class ExtractionRule: join_with: str = " " -# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler -class BeautifulSoupCrawler: +class BeautifulSoupLoader(LoaderInterface): """Crawler for fetching and extracting web content using BeautifulSoup. Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt @@ -50,6 +50,24 @@ class BeautifulSoupCrawler: robots_cache_ttl: Time-to-live for robots.txt cache in seconds. """ + @property + def supported_extensions(self) -> List[str]: + return ["html"] + + @property + def supported_mime_types(self) -> List[str]: + pass + + @property + def loader_name(self) -> str: + return "beautiful_soup_loader" + + def can_handle(self, extension: str, mime_type: str) -> bool: + pass + + async def load(self, file_path: str, **kwargs): + pass + def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. 
diff --git a/cognee/tasks/web_scraper/__init__.py b/cognee/tasks/web_scraper/__init__.py index f4d6677c7..26c3e68cf 100644 --- a/cognee/tasks/web_scraper/__init__.py +++ b/cognee/tasks/web_scraper/__init__.py @@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag BeautifulSoup or Tavily, defining data models, and handling scraping configurations. """ -from .bs4_crawler import BeautifulSoupCrawler from .utils import fetch_page_content from .web_scraper_task import cron_web_scraper_task, web_scraper_task from .default_url_crawler import DefaultUrlCrawler diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 0cbd355a3..b1cbf82e9 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -9,7 +9,6 @@ from re import L from typing import List, Union, TypeAlias from cognee.shared.logging_utils import get_logger from .default_url_crawler import DefaultUrlCrawler -from .bs4_crawler import BeautifulSoupCrawler from .config import DefaultCrawlerConfig, TavilyConfig logger = get_logger(__name__) @@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: if os.getenv("TAVILY_API_KEY"): logger.info("Using Tavily API for url fetching") - return await fetch_with_tavily(urls, tavily_config) + return await fetch_with_tavily(urls) else: logger.info("Using default crawler for content extraction") @@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: await crawler.close() -async def fetch_with_tavily( - urls: Union[str, List[str]], tavily_config: TavilyConfig -) -> UrlsToHtmls: +async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls: """Fetch content from URLs using the Tavily API. 
Args: @@ -112,6 +109,7 @@ async def fetch_with_tavily( ) raise + tavily_config = TavilyConfig() url_list = [urls] if isinstance(urls, str) else urls extract_depth = tavily_config.extract_depth if tavily_config else "basic" timeout = tavily_config.timeout if tavily_config else 10 diff --git a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py index 0e7637d86..156cc87a4 100644 --- a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py @@ -1,10 +1,10 @@ import pytest -from cognee.tasks.web_scraper import BeautifulSoupCrawler +from cognee.tasks.web_scraper import DefaultUrlCrawler @pytest.mark.asyncio async def test_fetch(): - crawler = BeautifulSoupCrawler() + crawler = DefaultUrlCrawler() url = "https://en.wikipedia.org/wiki/Large_language_model" results = await crawler.fetch_urls(url) assert len(results) == 1 From 7210198f2ef4950cb40f6204e2355e597e3dd6ac Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 16:54:33 +0100 Subject: [PATCH 33/44] implement `bs4_loader.py` methods aside `load` yet --- cognee/infrastructure/loaders/external/bs4_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/infrastructure/loaders/external/bs4_loader.py b/cognee/infrastructure/loaders/external/bs4_loader.py index 8022de04f..ceea3f9de 100644 --- a/cognee/infrastructure/loaders/external/bs4_loader.py +++ b/cognee/infrastructure/loaders/external/bs4_loader.py @@ -56,14 +56,14 @@ class BeautifulSoupLoader(LoaderInterface): @property def supported_mime_types(self) -> List[str]: - pass + return ["text/html"] @property def loader_name(self) -> str: return "beautiful_soup_loader" def can_handle(self, extension: str, mime_type: str) -> bool: - pass + return extension in self.supported_extensions() and mime_type in self.supported_mime_types() async def load(self, file_path: str, **kwargs): pass 
From 322ef156cb5efce2d75bc7e0df0ebec9484903c9 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 17:10:45 +0100 Subject: [PATCH 34/44] redefine `preferred_loaders` param to allow for args per loader --- cognee/api/v1/add/add.py | 2 +- cognee/api/v1/update/update.py | 4 ++-- cognee/infrastructure/loaders/LoaderEngine.py | 6 ++++-- cognee/tasks/ingestion/data_item_to_text_file.py | 4 ++-- cognee/tasks/ingestion/ingest_data.py | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 216911ec0..73a3081be 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -23,7 +23,7 @@ async def add( vector_db_config: dict = None, graph_db_config: dict = None, dataset_id: Optional[UUID] = None, - preferred_loaders: List[str] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, incremental_loading: bool = True, data_per_batch: Optional[int] = 20, ): diff --git a/cognee/api/v1/update/update.py b/cognee/api/v1/update/update.py index a421b3dc0..83b92c50f 100644 --- a/cognee/api/v1/update/update.py +++ b/cognee/api/v1/update/update.py @@ -1,5 +1,5 @@ from uuid import UUID -from typing import Union, BinaryIO, List, Optional +from typing import Union, BinaryIO, List, Optional, Any from cognee.modules.users.models import User from cognee.api.v1.delete import delete @@ -15,7 +15,7 @@ async def update( node_set: Optional[List[str]] = None, vector_db_config: dict = None, graph_db_config: dict = None, - preferred_loaders: List[str] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, incremental_loading: bool = True, ): """ diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 6b62f7641..84ecee0de 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -64,7 +64,9 @@ class LoaderEngine: return True def get_loader( - self, file_path: str, 
preferred_loaders: List[str] = None + self, + file_path: str, + preferred_loaders: dict[str, dict[str, Any]], ) -> Optional[LoaderInterface]: """ Get appropriate loader for a file. @@ -105,7 +107,7 @@ class LoaderEngine: async def load_file( self, file_path: str, - preferred_loaders: Optional[List[str]] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, **kwargs, ): """ diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 4b9e4bb23..0303f6c92 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -1,6 +1,6 @@ import os from urllib.parse import urlparse -from typing import List, Tuple +from typing import Any, List, Tuple from pathlib import Path import tempfile @@ -35,7 +35,7 @@ async def pull_from_s3(file_path, destination_file) -> None: async def data_item_to_text_file( data_item_path: str, - preferred_loaders: List[str], + preferred_loaders: dict[str, dict[str, Any]] = None, ) -> Tuple[str, LoaderInterface]: if isinstance(data_item_path, str): parsed_url = urlparse(data_item_path) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 02987b893..7b081cc34 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -27,7 +27,7 @@ async def ingest_data( user: User, node_set: Optional[List[str]] = None, dataset_id: UUID = None, - preferred_loaders: List[str] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, ): if not user: user = await get_default_user() @@ -44,7 +44,7 @@ async def ingest_data( user: User, node_set: Optional[List[str]] = None, dataset_id: UUID = None, - preferred_loaders: List[str] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, ): new_datapoints = [] existing_data_points = [] From f84e31c626d74e00e1b9a1265997f5a6bac44d10 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 17:11:27 
+0100 Subject: [PATCH 35/44] `bs4_loader.py` -> `beautiful_soup_loader.py`, add to supported loaders --- cognee/infrastructure/loaders/external/__init__.py | 7 +++++++ .../external/{bs4_loader.py => beautiful_soup_loader.py} | 0 cognee/infrastructure/loaders/supported_loaders.py | 7 +++++++ 3 files changed, 14 insertions(+) rename cognee/infrastructure/loaders/external/{bs4_loader.py => beautiful_soup_loader.py} (100%) diff --git a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index 6bf9f9200..785338c09 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -27,3 +27,10 @@ try: __all__.append("AdvancedPdfLoader") except ImportError: pass + +try: + from .beautiful_soup_loader import BeautifulSoupLoader + + __all__.append("BeautifulSoupLoader") +except ImportError: + pass diff --git a/cognee/infrastructure/loaders/external/bs4_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py similarity index 100% rename from cognee/infrastructure/loaders/external/bs4_loader.py rename to cognee/infrastructure/loaders/external/beautiful_soup_loader.py diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py index d103babe3..156253b53 100644 --- a/cognee/infrastructure/loaders/supported_loaders.py +++ b/cognee/infrastructure/loaders/supported_loaders.py @@ -23,3 +23,10 @@ try: supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader except ImportError: pass + +try: + from cognee.infrastructure.loaders.external import BeautifulSoupLoader + + supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader +except ImportError: + pass From 03b4547b7f4e067c8dfed7259e7deff56049a170 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 20:10:43 +0100 Subject: [PATCH 36/44] validate e2e - urls are saved as htmls, and loaders are selected 
correctly --- cognee/infrastructure/loaders/LoaderEngine.py | 11 ++ .../loaders/external/beautiful_soup_loader.py | 21 ++- cognee/modules/ingestion/save_data_to_file.py | 11 +- cognee/tasks/ingestion/ingest_data.py | 4 + .../ingestion/save_data_item_to_storage.py | 6 +- cognee/tasks/web_scraper/config.py | 1 - .../tasks/web_scraper/default_url_crawler.py | 2 +- cognee/tasks/web_scraper/types.py | 4 + cognee/tasks/web_scraper/utils.py | 6 +- .../integration/web_url_crawler/test_add.py | 160 ++++++++++++++---- examples/python/web_url_fetcher_example.py | 1 + 11 files changed, 182 insertions(+), 45 deletions(-) create mode 100644 cognee/tasks/web_scraper/types.py diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 84ecee0de..1a47eea56 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -78,14 +78,21 @@ class LoaderEngine: Returns: LoaderInterface that can handle the file, or None if not found """ + from pathlib import Path file_info = filetype.guess(file_path) + path_extension = Path(file_path).suffix.lstrip(".") + # Try preferred loaders first if preferred_loaders: for loader_name in preferred_loaders: if loader_name in self._loaders: loader = self._loaders[loader_name] + # Try with path extension first (for text formats like html) + if loader.can_handle(extension=path_extension, mime_type=file_info.mime): + return loader + # Fall back to content-detected extension if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): return loader else: @@ -95,6 +102,10 @@ class LoaderEngine: for loader_name in self.default_loader_priority: if loader_name in self._loaders: loader = self._loaders[loader_name] + # Try with path extension first (for text formats like html) + if loader.can_handle(extension=path_extension, mime_type=file_info.mime): + return loader + # Fall back to content-detected extension if 
loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): return loader else: diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py index ceea3f9de..05330a095 100644 --- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py +++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py @@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules. from typing import Union, Dict, Any, Optional, List from dataclasses import dataclass from bs4 import BeautifulSoup -from cognee.infrastructure.loaders import LoaderInterface +from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) @@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface): @property def supported_mime_types(self) -> List[str]: - return ["text/html"] + return ["text/html", "text/plain"] @property def loader_name(self) -> str: return "beautiful_soup_loader" def can_handle(self, extension: str, mime_type: str) -> bool: - return extension in self.supported_extensions() and mime_type in self.supported_mime_types() + can = extension in self.supported_extensions and mime_type in self.supported_mime_types + return can async def load(self, file_path: str, **kwargs): - pass + """Load an HTML file and return its path. + + For HTML files stored on disk, we simply return the file path + since the content is already in text format and can be processed directly. + + Args: + file_path: Path to the HTML file + **kwargs: Additional arguments + + Returns: + The file path to the HTML file + """ + raise NotImplementedError def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. 
diff --git a/cognee/modules/ingestion/save_data_to_file.py b/cognee/modules/ingestion/save_data_to_file.py index 0ba0b2983..42e8d45ba 100644 --- a/cognee/modules/ingestion/save_data_to_file.py +++ b/cognee/modules/ingestion/save_data_to_file.py @@ -1,10 +1,12 @@ -from typing import BinaryIO, Union +from typing import BinaryIO, Union, Optional from cognee.infrastructure.files.storage import get_file_storage, get_storage_config from .classify import classify import hashlib -async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None): +async def save_data_to_file( + data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None +): storage_config = get_storage_config() data_root_directory = storage_config["data_root_directory"] @@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None): file_name = file_metadata["name"] + if file_extension is not None: + extension = file_extension.lstrip(".") + file_name_without_ext = file_name.rsplit(".", 1)[0] + file_name = f"{file_name_without_ext}.{extension}" + storage = get_file_storage(data_root_directory) full_file_path = await storage.store(file_name, data) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 7b081cc34..25b2aa6ae 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data +from cognee.modules.ingestion.exceptions import IngestionError from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets @@ -88,6 +89,9 @@ async def ingest_data( preferred_loaders, ) + if loader_engine is None: + 
raise IngestionError("Loader cannot be None") + # Find metadata from original file # Standard flow: extract metadata from both original and stored files async with open_data_file(original_file_path) as file: diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index 453219f15..05d21e617 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -8,7 +8,7 @@ from cognee.modules.ingestion import save_data_to_file from cognee.shared.logging_utils import get_logger from pydantic_settings import BaseSettings, SettingsConfigDict -from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher +from cognee.tasks.web_scraper.utils import fetch_page_content logger = get_logger() @@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - fetcher = WebUrlFetcher() - return await fetcher.fetch(data_item) + urls_to_page_contents = await fetch_page_content(data_item) + return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html") # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index fcf22ab33..f23156f95 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel): max_retries: int = 2 retry_delay_factor: float = 0.5 headers: Optional[Dict[str, str]] = None - extraction_rules: Dict[str, Any] use_playwright: bool = False playwright_js_wait: float = 0.8 robots_cache_ttl: float = 3600.0 diff --git a/cognee/tasks/web_scraper/default_url_crawler.py b/cognee/tasks/web_scraper/default_url_crawler.py index d9d2ee922..d09bf3e80 100644 --- 
a/cognee/tasks/web_scraper/default_url_crawler.py +++ b/cognee/tasks/web_scraper/default_url_crawler.py @@ -7,7 +7,7 @@ from urllib.parse import urlparse import httpx from cognee.shared.logging_utils import get_logger -from cognee.tasks.web_scraper.utils import UrlsToHtmls +from cognee.tasks.web_scraper.types import UrlsToHtmls logger = get_logger() diff --git a/cognee/tasks/web_scraper/types.py b/cognee/tasks/web_scraper/types.py new file mode 100644 index 000000000..54a3f5d42 --- /dev/null +++ b/cognee/tasks/web_scraper/types.py @@ -0,0 +1,4 @@ +from typing import TypeAlias + + +UrlsToHtmls: TypeAlias = dict[str, str] diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index b1cbf82e9..1f51bf98d 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping """ import os -from re import L -from typing import List, Union, TypeAlias +from typing import List, Union from cognee.shared.logging_utils import get_logger +from cognee.tasks.web_scraper.types import UrlsToHtmls from .default_url_crawler import DefaultUrlCrawler from .config import DefaultCrawlerConfig, TavilyConfig logger = get_logger(__name__) -UrlsToHtmls: TypeAlias = dict[str, str] - async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: """Fetch content from one or more URLs using the specified tool. 
diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index a00ca9e0d..27a627680 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -1,37 +1,76 @@ -from sys import exc_info import pytest import cognee -from cognee.modules.ingestion.exceptions.exceptions import IngestionError +from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path +from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine +from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader +from cognee.tasks.ingestion import save_data_item_to_storage +from pathlib import Path @pytest.mark.asyncio -async def test_add_fails_when_web_url_fetcher_config_not_specified(): - from cognee.shared.logging_utils import setup_logging, ERROR - - setup_logging(log_level=ERROR) +async def test_url_saves_as_html_file(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - with pytest.raises(IngestionError) as excinfo: - await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", - incremental_loading=False, + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" ) - assert excinfo.value.message.startswith( - "web_url_fetcher configuration must be a valid dictionary" - ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") @pytest.mark.asyncio -async def test_add_succesfully_adds_url_when_fetcher_config_specified(): +async def test_saved_html_is_valid(): + try: + from bs4 import BeautifulSoup + except ImportError: + pytest.fail("Test case requires bs4 installed") + await cognee.prune.prune_data() await 
cognee.prune.prune_system(metadata=True) - extraction_rules = { - "title": {"selector": "title"}, - "headings": {"selector": "h1, h2, h3", "all": True}, - "links": {"selector": "a", "attr": "href", "all": True}, - "paragraphs": {"selector": "p", "all": True}, - } + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + content = Path(file_path).read_text() + + soup = BeautifulSoup(content, "html.parser") + assert soup.find() is not None, "File should contain parseable HTML" + + has_html_elements = any( + [ + soup.find("html"), + soup.find("head"), + soup.find("body"), + soup.find("div"), + soup.find("p"), + ] + ) + assert has_html_elements, "File should contain common HTML elements" + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_add_url(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + await cognee.add("https://en.wikipedia.org/wiki/Large_language_model") + + +@pytest.mark.asyncio +async def test_add_url_without_incremental_loading(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) try: await cognee.add( @@ -43,17 +82,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified(): @pytest.mark.asyncio -async def test_add_with_incremental_loading_works(): +async def test_add_url_with_incremental_loading(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - extraction_rules = { - "title": {"selector": "title"}, - "headings": {"selector": "h1, h2, h3", "all": True}, - "links": {"selector": "a", "attr": "href", "all": True}, - "paragraphs": {"selector": "p", "all": True}, - } - try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", @@ -64,7 +96,7 @@ async def test_add_with_incremental_loading_works(): @pytest.mark.asyncio -async def 
test_add_without_incremental_loading_works(): +async def test_add_url_with_extraction_rules(): # TODO: this'll fail due to not implemented `load()` yet await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -78,7 +110,75 @@ async def test_add_without_incremental_loading_works(): try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - incremental_loading=False, + preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") + + +@pytest.mark.asyncio +async def test_loader_is_none_by_default(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + + loader_engine = LoaderEngine() + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + loader = loader_engine.get_loader( + file_path, + preferred_loaders=preferred_loaders, + ) + + assert loader is None + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + try: + 
original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + + loader_engine = LoaderEngine() + bs_loader = BeautifulSoupLoader() + loader_engine.register_loader(bs_loader) + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + loader = loader_engine.get_loader( + file_path, + preferred_loaders=preferred_loaders, + ) + + assert loader == bs_loader + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") diff --git a/examples/python/web_url_fetcher_example.py b/examples/python/web_url_fetcher_example.py index 2195a62c0..aff8094bf 100644 --- a/examples/python/web_url_fetcher_example.py +++ b/examples/python/web_url_fetcher_example.py @@ -23,6 +23,7 @@ async def main(): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=False, + preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) await cognee.cognify() From ed4eba4c4415b310d835e187326abb625887f476 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 21:55:06 +0100 Subject: [PATCH 37/44] add back in-code comments for `ingest_data` --- cognee/tasks/ingestion/ingest_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 25b2aa6ae..0572d0f1e 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -96,9 +96,13 @@ async def ingest_data( # Standard flow: extract metadata from both original and stored files async with open_data_file(original_file_path) as file: classified_data = ingestion.classify(file) + + # data_id is the hash of original file contents + owner id to avoid duplicate data + data_id = 
ingestion.identify(classified_data, user) original_file_metadata = classified_data.get_metadata() + # Find metadata from Cognee data storage text file async with open_data_file(cognee_storage_file_path) as file: classified_data = ingestion.classify(file) storage_file_metadata = classified_data.get_metadata() From 6895813ae88b49335500d4ea13b8c270f96e1e07 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 22:10:42 +0100 Subject: [PATCH 38/44] tests: name integration tests more meaningfully --- .../{test_bs4_crawler.py => test_default_url_crawler.py} | 0 .../web_url_crawler/{test_add.py => test_url_adding_e2e.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename cognee/tests/integration/web_url_crawler/{test_bs4_crawler.py => test_default_url_crawler.py} (100%) rename cognee/tests/integration/web_url_crawler/{test_add.py => test_url_adding_e2e.py} (100%) diff --git a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py similarity index 100% rename from cognee/tests/integration/web_url_crawler/test_bs4_crawler.py rename to cognee/tests/integration/web_url_crawler/test_default_url_crawler.py diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py similarity index 100% rename from cognee/tests/integration/web_url_crawler/test_add.py rename to cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py From 0f6aac19e8aef5f071a1c74fa45ad80c97d2ac4f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 22:35:23 +0100 Subject: [PATCH 39/44] TDD: add test cases and finish loading stage --- cognee/infrastructure/loaders/LoaderEngine.py | 14 +- .../loaders/external/beautiful_soup_loader.py | 60 +++++++-- .../web_url_crawler/test_url_adding_e2e.py | 126 ++++++++++++++++++ 3 files changed, 189 insertions(+), 11 deletions(-) diff --git 
a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 1a47eea56..725f37b14 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -126,7 +126,7 @@ class LoaderEngine: Args: file_path: Path to the file to be processed - preferred_loaders: List of preferred loader names to try first + preferred_loaders: Dict of loader names to their configurations **kwargs: Additional loader-specific configuration Raises: @@ -138,8 +138,16 @@ class LoaderEngine: raise ValueError(f"No loader found for file: {file_path}") logger.debug(f"Loading {file_path} with {loader.loader_name}") - # TODO: loading needs to be reworked to work with both file streams and file locations - return await loader.load(file_path, **kwargs) + + # Extract loader-specific config from preferred_loaders + loader_config = {} + if preferred_loaders and loader.loader_name in preferred_loaders: + loader_config = preferred_loaders[loader.loader_name] + + # Merge with any additional kwargs (kwargs take precedence) + merged_kwargs = {**loader_config, **kwargs} + + return await loader.load(file_path, **merged_kwargs) def get_available_loaders(self) -> List[str]: """ diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py index 05330a095..bd6d8025b 100644 --- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py +++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py @@ -66,20 +66,64 @@ class BeautifulSoupLoader(LoaderInterface): can = extension in self.supported_extensions and mime_type in self.supported_mime_types return can - async def load(self, file_path: str, **kwargs): - """Load an HTML file and return its path. - - For HTML files stored on disk, we simply return the file path - since the content is already in text format and can be processed directly. 
+ async def load( + self, + file_path: str, + extraction_rules: dict[str, Any] = None, + join_all_matches: bool = False, + **kwargs, + ): + """Load an HTML file, extract content, and save to storage. Args: file_path: Path to the HTML file + extraction_rules: Dict of CSS selector rules for content extraction + join_all_matches: If True, extract all matching elements for each rule **kwargs: Additional arguments Returns: - The file path to the HTML file + Path to the stored extracted text file """ - raise NotImplementedError + if extraction_rules is None: + raise ValueError("extraction_rules required for BeautifulSoupLoader") + + logger.info(f"Processing HTML file: {file_path}") + + from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata + from cognee.infrastructure.files.storage import get_file_storage, get_storage_config + + with open(file_path, "rb") as f: + file_metadata = await get_file_metadata(f) + f.seek(0) + html = f.read() + + storage_file_name = "text_" + file_metadata["content_hash"] + ".txt" + + # Normalize extraction rules + normalized_rules: List[ExtractionRule] = [] + for _, rule in extraction_rules.items(): + r = self._normalize_rule(rule) + if join_all_matches: + r.all = True + normalized_rules.append(r) + + pieces = [] + for rule in normalized_rules: + text = self._extract_from_html(html, rule) + if text: + pieces.append(text) + + full_content = " ".join(pieces).strip() + + # Store the extracted content + storage_config = get_storage_config() + data_root_directory = storage_config["data_root_directory"] + storage = get_file_storage(data_root_directory) + + full_file_path = await storage.store(storage_file_name, full_content) + + logger.info(f"Extracted {len(full_content)} characters from HTML") + return full_file_path def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. 
@@ -105,7 +149,7 @@ class BeautifulSoupLoader(LoaderInterface): ) raise ValueError(f"Invalid extraction rule: {rule}") - def extract(self, html: str, rule: ExtractionRule) -> str: + def _extract_from_html(self, html: str, rule: ExtractionRule) -> str: """Extract content from HTML using BeautifulSoup or lxml XPath. Args: diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index 27a627680..afe2dce7f 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -182,3 +182,129 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov assert loader == bs_loader except Exception as e: pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_beautiful_soup_loader_raises_if_required_args_are_missing(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + + loader_engine = LoaderEngine() + bs_loader = BeautifulSoupLoader() + loader_engine.register_loader(bs_loader) + preferred_loaders = {"beautiful_soup_loader": {}} + with pytest.raises(ValueError): + await loader_engine.load_file( + file_path, + preferred_loaders=preferred_loaders, + ) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + await loader_engine.load_file( + file_path, + 
preferred_loaders=preferred_loaders, + ) + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + + loader_engine = LoaderEngine() + bs_loader = BeautifulSoupLoader() + loader_engine.register_loader(bs_loader) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + await loader_engine.load_file( + file_path, + preferred_loaders=preferred_loaders, + ) + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_beautiful_soup_loads_file_successfully(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + original_file = Path(file_path) + assert original_file.exists() + assert original_file.stat().st_size > 0 + + loader_engine = LoaderEngine() + bs_loader = BeautifulSoupLoader() 
+ loader_engine.register_loader(bs_loader) + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + loader = loader_engine.get_loader( + file_path, + preferred_loaders=preferred_loaders, + ) + + assert loader == bs_loader + + cognee_loaded_txt_path = await loader_engine.load_file( + file_path=file_path, preferred_loaders=preferred_loaders + ) + + cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path) + + assert cognee_loaded_txt_path.endswith(".txt") + + extracted_file = Path(cognee_loaded_txt_path) + + assert extracted_file.exists() + assert extracted_file.stat().st_size > 0 + + original_basename = original_file.stem + extracted_basename = extracted_file.stem + assert original_basename == extracted_basename, ( + f"Expected same base name: {original_basename} vs {extracted_basename}" + ) + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") From f02aa1abfc4f1f098621383ec6edd57c1fca2fb4 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:02:25 +0100 Subject: [PATCH 40/44] ruff format --- cognee/context_global_variables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index aad53341a..d52de4b4e 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -14,6 +14,7 @@ vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) + async def set_session_user_context_variable(user): session_user.set(user) From 3f5c09eb45a52e23623bc2ef32e83d251f9afc1f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:11:01 +0100 Subject: [PATCH 41/44] lazy load `cron_web_scraper_task` and `web_scraper_task` --- cognee/tasks/web_scraper/__init__.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git 
a/cognee/tasks/web_scraper/__init__.py b/cognee/tasks/web_scraper/__init__.py index 26c3e68cf..d52129c05 100644 --- a/cognee/tasks/web_scraper/__init__.py +++ b/cognee/tasks/web_scraper/__init__.py @@ -6,9 +6,24 @@ BeautifulSoup or Tavily, defining data models, and handling scraping configurati """ from .utils import fetch_page_content -from .web_scraper_task import cron_web_scraper_task, web_scraper_task from .default_url_crawler import DefaultUrlCrawler +# Lazy import for web_scraper_task to avoid requiring apscheduler +# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ... + + +def __getattr__(name): + """Lazy load web scraper task functions that require apscheduler.""" + if name == "cron_web_scraper_task": + from .web_scraper_task import cron_web_scraper_task + + return cron_web_scraper_task + elif name == "web_scraper_task": + from .web_scraper_task import web_scraper_task + + return web_scraper_task + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + __all__ = [ "BeautifulSoupCrawler", From a35bcecdf9dd386e62b07e912c788d0bf20682b4 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:13:40 +0100 Subject: [PATCH 42/44] refactor tavily_crawler test --- .../tests/integration/web_url_crawler/test_tavily_crawler.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 7edb9b8d3..50b409f8f 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -1,14 +1,11 @@ -import os import pytest -from cognee.tasks.web_scraper.config import TavilyConfig from cognee.tasks.web_scraper.utils import fetch_with_tavily @pytest.mark.asyncio async def test_fetch(): url = "https://en.wikipedia.org/wiki/Large_language_model" - tavily_config = TavilyConfig() - 
results = await fetch_with_tavily(url, tavily_config) + results = await fetch_with_tavily(url) assert len(results) == 1 assert isinstance(results, dict) html = results[url] From 20c9e5498b5179cfae0ac56a2579b7b45b1f0b85 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:27:18 +0100 Subject: [PATCH 43/44] skip tavily in Github CI for now --- .../integration/web_url_crawler/test_tavily_crawler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 50b409f8f..946ce8378 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -1,12 +1,19 @@ +import os import pytest from cognee.tasks.web_scraper.utils import fetch_with_tavily +skip_in_ci = pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS") == "true", + reason="Skipping in Github for now - before we get TAVILY_API_KEY", +) + +@skip_in_ci @pytest.mark.asyncio async def test_fetch(): url = "https://en.wikipedia.org/wiki/Large_language_model" results = await fetch_with_tavily(url) - assert len(results) == 1 assert isinstance(results, dict) + assert len(results) == 1 html = results[url] assert isinstance(html, str) From 10e4fd7681833013c358f90d2ac7633fea7ec112 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:46:21 +0100 Subject: [PATCH 44/44] Make BS4 loader compatible with tavily fetcher --- .../loaders/external/beautiful_soup_loader.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py index bd6d8025b..04954a228 100644 --- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py +++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py @@ -115,6 +115,23 @@ class 
BeautifulSoupLoader(LoaderInterface): full_content = " ".join(pieces).strip() + # Fallback: If no content extracted, check if the file is plain text (not HTML) + if not full_content: + from bs4 import BeautifulSoup + + soup = BeautifulSoup(html, "html.parser") + # If there are no HTML tags, treat as plain text + if not soup.find(): + logger.warning( + f"No HTML tags found in {file_path}. Treating as plain text. " + "This may happen when content is pre-extracted (e.g., via Tavily with text format)." + ) + full_content = html.decode("utf-8") if isinstance(html, bytes) else html + full_content = full_content.strip() + + if not full_content: + logger.warning(f"No content extracted from HTML file: {file_path}") + # Store the extracted content storage_config = get_storage_config() data_root_directory = storage_config["data_root_directory"]