From 03b4547b7f4e067c8dfed7259e7deff56049a170 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov
Date: Tue, 21 Oct 2025 20:10:43 +0100
Subject: [PATCH] validate e2e - URLs are saved as HTML, and loaders are
 selected correctly

---
 cognee/infrastructure/loaders/LoaderEngine.py |  11 ++
 .../loaders/external/beautiful_soup_loader.py |  21 ++-
 cognee/modules/ingestion/save_data_to_file.py |  11 +-
 cognee/tasks/ingestion/ingest_data.py         |   4 +
 .../ingestion/save_data_item_to_storage.py    |   6 +-
 cognee/tasks/web_scraper/config.py            |   1 -
 .../tasks/web_scraper/default_url_crawler.py  |   2 +-
 cognee/tasks/web_scraper/types.py             |   4 +
 cognee/tasks/web_scraper/utils.py             |   6 +-
 .../integration/web_url_crawler/test_add.py   | 160 ++++++++++++++----
 examples/python/web_url_fetcher_example.py    |   1 +
 11 files changed, 182 insertions(+), 45 deletions(-)
 create mode 100644 cognee/tasks/web_scraper/types.py

diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py
index 84ecee0de..1a47eea56 100644
--- a/cognee/infrastructure/loaders/LoaderEngine.py
+++ b/cognee/infrastructure/loaders/LoaderEngine.py
@@ -78,14 +78,21 @@ class LoaderEngine:
         Returns:
             LoaderInterface that can handle the file, or None if not found
         """
+        from pathlib import Path
+
         file_info = filetype.guess(file_path)
+        path_extension = Path(file_path).suffix.lstrip(".")
+
         # Try preferred loaders first
         if preferred_loaders:
             for loader_name in preferred_loaders:
                 if loader_name in self._loaders:
                     loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                     if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                         return loader
                 else:
@@ -95,6 +102,10 @@ class LoaderEngine:
         for loader_name in self.default_loader_priority:
             if loader_name in self._loaders:
                 loader = self._loaders[loader_name]
+                # Try with path extension first (for text formats like html)
+                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                    return loader
+                # Fall back to content-detected extension
                 if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                     return loader
             else:

diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
index ceea3f9de..05330a095 100644
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules.
 from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
-from cognee.infrastructure.loaders import LoaderInterface
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
 from cognee.shared.logging_utils import get_logger

 logger = get_logger(__name__)
@@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface):

     @property
     def supported_mime_types(self) -> List[str]:
-        return ["text/html"]
+        return ["text/html", "text/plain"]

     @property
     def loader_name(self) -> str:
         return "beautiful_soup_loader"

     def can_handle(self, extension: str, mime_type: str) -> bool:
-        return extension in self.supported_extensions() and mime_type in self.supported_mime_types()
+        return extension in self.supported_extensions and mime_type in self.supported_mime_types

     async def load(self, file_path: str, **kwargs):
-        pass
+        """Load an HTML file.
+
+        Content extraction is not implemented yet. For now, URLs are stored
+        as .html files and this loader only participates in loader
+        selection via can_handle.
+
+        Args:
+            file_path: Path to the HTML file
+            **kwargs: Additional arguments
+
+        Raises:
+            NotImplementedError: always, until extraction is implemented
+        """
+        raise NotImplementedError

     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.

diff --git a/cognee/modules/ingestion/save_data_to_file.py b/cognee/modules/ingestion/save_data_to_file.py
index 0ba0b2983..42e8d45ba 100644
--- a/cognee/modules/ingestion/save_data_to_file.py
+++ b/cognee/modules/ingestion/save_data_to_file.py
@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib


-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
     storage_config = get_storage_config()
     data_root_directory = storage_config["data_root_directory"]
@@ -21,6 +23,11 @@

     file_name = file_metadata["name"]

+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
     storage = get_file_storage(data_root_directory)
     full_file_path = await storage.store(file_name, data)

diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py
index 7b081cc34..25b2aa6ae 100644
--- a/cognee/tasks/ingestion/ingest_data.py
+++ b/cognee/tasks/ingestion/ingest_data.py
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@@ -88,6 +89,9 @@ async def ingest_data(
                 preferred_loaders,
             )

+            if loader_engine is None:
+                raise IngestionError("No loader could handle the given data item")
+
             # Find metadata from original file
             # Standard flow: extract metadata from both original and stored files
             async with open_data_file(original_file_path) as file:

diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py
index 453219f15..05d21e617 100644
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -8,7 +8,7 @@
 from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict

-from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher
+from cognee.tasks.web_scraper.utils import fetch_page_content

 logger = get_logger()
@@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
     if parsed_url.scheme == "s3":
         return data_item
     elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-        fetcher = WebUrlFetcher()
-        return await fetcher.fetch(data_item)
+        urls_to_page_contents = await fetch_page_content(data_item)
+        return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
     # data is local file path
     elif parsed_url.scheme == "file":
         if settings.accept_local_file_path:

diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py
index fcf22ab33..f23156f95 100644
--- a/cognee/tasks/web_scraper/config.py
+++ b/cognee/tasks/web_scraper/config.py
@@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel):
     max_retries: int = 2
     retry_delay_factor: float = 0.5
     headers: Optional[Dict[str, str]] = None
-    extraction_rules: Dict[str, Any]
     use_playwright: bool = False
     playwright_js_wait: float = 0.8
     robots_cache_ttl: float = 3600.0

diff --git a/cognee/tasks/web_scraper/default_url_crawler.py b/cognee/tasks/web_scraper/default_url_crawler.py
index d9d2ee922..d09bf3e80 100644
--- a/cognee/tasks/web_scraper/default_url_crawler.py
+++ b/cognee/tasks/web_scraper/default_url_crawler.py
@@ -7,7 +7,7 @@
 from urllib.parse import urlparse

 import httpx
 from cognee.shared.logging_utils import get_logger
-from cognee.tasks.web_scraper.utils import UrlsToHtmls
+from cognee.tasks.web_scraper.types import UrlsToHtmls

 logger = get_logger()

diff --git a/cognee/tasks/web_scraper/types.py b/cognee/tasks/web_scraper/types.py
new file mode 100644
index 000000000..54a3f5d42
--- /dev/null
+++ b/cognee/tasks/web_scraper/types.py
@@ -0,0 +1,4 @@
+from typing import TypeAlias
+
+
+UrlsToHtmls: TypeAlias = dict[str, str]

diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py
index b1cbf82e9..1f51bf98d 100644
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping
 """

 import os
-from re import L
-from typing import List, Union, TypeAlias
+from typing import List, Union

 from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls

 from .default_url_crawler import DefaultUrlCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig

 logger = get_logger(__name__)

-UrlsToHtmls: TypeAlias = dict[str, str]
-

 async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from one or more URLs using the specified tool.
diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py
index a00ca9e0d..27a627680 100644
--- a/cognee/tests/integration/web_url_crawler/test_add.py
+++ b/cognee/tests/integration/web_url_crawler/test_add.py
@@ -1,37 +1,76 @@
-from sys import exc_info
 import pytest
 import cognee
-from cognee.modules.ingestion.exceptions.exceptions import IngestionError
+from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
+from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
+from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
+from cognee.tasks.ingestion import save_data_item_to_storage
+from pathlib import Path


 @pytest.mark.asyncio
-async def test_add_fails_when_web_url_fetcher_config_not_specified():
-    from cognee.shared.logging_utils import setup_logging, ERROR
-
-    setup_logging(log_level=ERROR)
+async def test_url_saves_as_html_file():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
-    with pytest.raises(IngestionError) as excinfo:
-        await cognee.add(
-            "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
         )
-    assert excinfo.value.message.startswith(
-        "web_url_fetcher configuration must be a valid dictionary"
-    )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+    except Exception as e:
+        pytest.fail(f"URL was not saved as an .html file: {e}")


 @pytest.mark.asyncio
-async def test_add_succesfully_adds_url_when_fetcher_config_specified():
+async def test_saved_html_is_valid():
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        pytest.fail("Test case requires bs4 to be installed")
+
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }

+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        content = Path(file_path).read_text()
+
+        soup = BeautifulSoup(content, "html.parser")
+        assert soup.find() is not None, "File should contain parseable HTML"
+
+        has_html_elements = any(
+            [
+                soup.find("html"),
+                soup.find("head"),
+                soup.find("body"),
+                soup.find("div"),
+                soup.find("p"),
+            ]
+        )
+        assert has_html_elements, "File should contain common HTML elements"
+    except Exception as e:
+        pytest.fail(f"Saved HTML failed validation: {e}")
+
+
+@pytest.mark.asyncio
+async def test_add_url():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
+@pytest.mark.asyncio
+async def test_add_url_without_incremental_loading():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)

     try:
         await cognee.add(
@@ -43,17 +82,10 @@


 @pytest.mark.asyncio
-async def test_add_with_incremental_loading_works():
+async def test_add_url_with_incremental_loading():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)

-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
-
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
@@ -64,7 +96,7 @@


 @pytest.mark.asyncio
-async def test_add_without_incremental_loading_works():
+async def test_add_url_with_extraction_rules():  # TODO: this will fail until `load()` is implemented
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
@@ -78,7 +110,75 @@
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
+            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
         )
     except Exception as e:
         pytest.fail(f"Failed to add url: {e}")
+
+
+@pytest.mark.asyncio
+async def test_loader_is_none_by_default():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader is None
+    except Exception as e:
+        pytest.fail(f"Loader selection test failed: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader is bs_loader
+    except Exception as e:
+        pytest.fail(f"Loader selection test failed: {e}")

diff --git a/examples/python/web_url_fetcher_example.py b/examples/python/web_url_fetcher_example.py
index 2195a62c0..aff8094bf 100644
--- a/examples/python/web_url_fetcher_example.py
+++ b/examples/python/web_url_fetcher_example.py
@@ -23,6 +23,7 @@ async def main():
     await cognee.add(
         "https://en.wikipedia.org/wiki/Large_language_model",
         incremental_loading=False,
{"extraction_rules": extraction_rules}}, ) await cognee.cognify()