validate e2e - URLs are saved as HTML, and loaders are selected correctly

Daulet Amirkhanov 2025-10-21 20:10:43 +01:00
parent f84e31c626
commit 03b4547b7f
11 changed files with 182 additions and 45 deletions

View file

@@ -78,14 +78,21 @@ class LoaderEngine:
         Returns:
             LoaderInterface that can handle the file, or None if not found
         """
+        from pathlib import Path
+
         file_info = filetype.guess(file_path)
+        path_extension = Path(file_path).suffix.lstrip(".")
 
         # Try preferred loaders first
         if preferred_loaders:
             for loader_name in preferred_loaders:
                 if loader_name in self._loaders:
                     loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                     if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                         return loader
         else:
@@ -95,6 +102,10 @@ class LoaderEngine:
         for loader_name in self.default_loader_priority:
             if loader_name in self._loaders:
                 loader = self._loaders[loader_name]
+                # Try with path extension first (for text formats like html)
+                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                    return loader
+                # Fall back to content-detected extension
                 if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                     return loader
         else:
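
A minimal sketch of the selection order this hunk introduces: the path extension is checked first (content sniffing usually cannot identify text formats such as HTML), then the content-detected extension. The helper name resolve_loader and the None guard on file_info are illustrative assumptions, not code from the repository; only the can_handle(extension=..., mime_type=...) contract is taken from the diff.

    from pathlib import Path
    from typing import Optional, Sequence

    import filetype  # the same content-based detection library LoaderEngine uses


    def resolve_loader(file_path: str, loaders: Sequence) -> Optional[object]:
        """Hypothetical helper mirroring the new lookup order:
        path extension first, content-detected extension as a fallback."""
        file_info = filetype.guess(file_path)
        path_extension = Path(file_path).suffix.lstrip(".")
        mime_type = file_info.mime if file_info is not None else None

        for loader in loaders:
            # 1) Trust the path extension: text formats such as .html are often
            #    invisible to content sniffing, but the suffix identifies them.
            if loader.can_handle(extension=path_extension, mime_type=mime_type):
                return loader
            # 2) Fall back to the extension detected from the file's content.
            if file_info is not None and loader.can_handle(
                extension=file_info.extension, mime_type=mime_type
            ):
                return loader
        return None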

View file

@@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules.
 from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
-from cognee.infrastructure.loaders import LoaderInterface
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
 from cognee.shared.logging_utils import get_logger
 
 logger = get_logger(__name__)
@@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface):
     @property
     def supported_mime_types(self) -> List[str]:
-        return ["text/html"]
+        return ["text/html", "text/plain"]
 
     @property
     def loader_name(self) -> str:
         return "beautiful_soup_loader"
 
     def can_handle(self, extension: str, mime_type: str) -> bool:
-        return extension in self.supported_extensions() and mime_type in self.supported_mime_types()
+        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
+        return can
 
     async def load(self, file_path: str, **kwargs):
-        pass
+        """Load an HTML file and return its path.
+
+        For HTML files stored on disk, we simply return the file path
+        since the content is already in text format and can be processed directly.
+
+        Args:
+            file_path: Path to the HTML file
+            **kwargs: Additional arguments
+
+        Returns:
+            The file path to the HTML file
+        """
+        raise NotImplementedError
 
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.

View file

@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib
 
 
-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
     storage_config = get_storage_config()
     data_root_directory = storage_config["data_root_directory"]
@@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
     file_name = file_metadata["name"]
 
+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
     storage = get_file_storage(data_root_directory)
     full_file_path = await storage.store(file_name, data)
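
A sketch of how the new file_extension argument changes the stored name, assuming a raw HTML string payload. The generated base name comes from classify/file metadata, so the concrete paths here are illustrative; only the signature and the .html suffix behaviour are taken from this hunk.

    import asyncio

    from cognee.modules.ingestion import save_data_to_file


    async def demo():
        html = "<html><head><title>Example</title></head><body><p>Hi</p></body></html>"

        # Without file_extension the stored name keeps whatever extension the
        # classifier derives from the data.
        default_path = await save_data_to_file(html)

        # With file_extension="html" the derived extension is replaced, so the
        # stored file ends in .html and loaders can match it by path extension.
        html_path = await save_data_to_file(html, file_extension="html")
        assert html_path.endswith(".html")
        return default_path, html_path


    asyncio.run(demo())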

View file

@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@@ -88,6 +89,9 @@ async def ingest_data(
             preferred_loaders,
         )
 
+        if loader_engine is None:
+            raise IngestionError("Loader cannot be None")
+
         # Find metadata from original file
         # Standard flow: extract metadata from both original and stored files
         async with open_data_file(original_file_path) as file:
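
For orientation, a compressed and hypothetical sketch of where the new guard sits: only the None check and the IngestionError import come from this hunk; the surrounding names (ingest_one, the load() call) are placeholders, not the real ingest_data flow.

    from cognee.modules.ingestion.exceptions import IngestionError


    async def ingest_one(loader_engine, file_path, preferred_loaders):
        # The real ingest_data resolves a loader roughly like this ...
        loader = loader_engine.get_loader(file_path, preferred_loaders=preferred_loaders)

        # ... and now fails loudly instead of passing None further down the pipeline.
        if loader is None:
            raise IngestionError("Loader cannot be None")

        return await loader.load(file_path)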

View file

@@ -8,7 +8,7 @@ from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher
+from cognee.tasks.web_scraper.utils import fetch_page_content
 
 logger = get_logger()
@@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
     if parsed_url.scheme == "s3":
         return data_item
     elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-        fetcher = WebUrlFetcher()
-        return await fetcher.fetch(data_item)
+        urls_to_page_contents = await fetch_page_content(data_item)
+        return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
     # data is local file path
     elif parsed_url.scheme == "file":
         if settings.accept_local_file_path:
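
Taken together with save_data_to_file above, the http/https branch now persists the fetched page as an .html file instead of delegating to WebUrlFetcher. A sketch of the same flow driven directly, assuming network access to the example URL; both calls and their signatures are taken from this commit.

    import asyncio

    from cognee.modules.ingestion import save_data_to_file
    from cognee.tasks.web_scraper.utils import fetch_page_content


    async def store_url(url: str) -> str:
        # fetch_page_content accepts a single URL or a list and returns
        # a UrlsToHtmls mapping (dict[str, str]) keyed by URL.
        urls_to_page_contents = await fetch_page_content(url)

        # Persist the raw HTML; file_extension="html" makes the stored file
        # end in .html so the loader engine can match it by path extension.
        return await save_data_to_file(urls_to_page_contents[url], file_extension="html")


    path = asyncio.run(store_url("https://en.wikipedia.org/wiki/Large_language_model"))
    print(path)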

View file

@@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel):
     max_retries: int = 2
     retry_delay_factor: float = 0.5
     headers: Optional[Dict[str, str]] = None
-    extraction_rules: Dict[str, Any]
     use_playwright: bool = False
     playwright_js_wait: float = 0.8
     robots_cache_ttl: float = 3600.0
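
extraction_rules is dropped from the crawler config; extraction rules now travel with the loader via preferred_loaders (see the tests and example below). A sketch constructing the slimmed-down config, using only fields visible in this hunk and assuming the module path cognee.tasks.web_scraper.config implied by the relative import in utils.

    from cognee.tasks.web_scraper.config import DefaultCrawlerConfig

    # Only crawl/transport concerns remain on the crawler config.
    config = DefaultCrawlerConfig(
        max_retries=3,
        retry_delay_factor=0.5,
        headers={"User-Agent": "cognee-e2e-test"},
        use_playwright=False,
    )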

View file

@@ -7,7 +7,7 @@ from urllib.parse import urlparse
 import httpx
 
 from cognee.shared.logging_utils import get_logger
-from cognee.tasks.web_scraper.utils import UrlsToHtmls
+from cognee.tasks.web_scraper.types import UrlsToHtmls
 
 logger = get_logger()

View file

@@ -0,0 +1,4 @@
+from typing import TypeAlias
+
+UrlsToHtmls: TypeAlias = dict[str, str]

View file

@@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping
 """
 
 import os
-from re import L
-from typing import List, Union, TypeAlias
+from typing import List, Union
 
 from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
 from .default_url_crawler import DefaultUrlCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig
 
 logger = get_logger(__name__)
 
-UrlsToHtmls: TypeAlias = dict[str, str]
-
 async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from one or more URLs using the specified tool.

View file

@@ -1,37 +1,76 @@
-from sys import exc_info
 import pytest
 import cognee
-from cognee.modules.ingestion.exceptions.exceptions import IngestionError
+from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
+from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
+from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
+from cognee.tasks.ingestion import save_data_item_to_storage
+from pathlib import Path
 
 
 @pytest.mark.asyncio
-async def test_add_fails_when_web_url_fetcher_config_not_specified():
+async def test_url_saves_as_html_file():
+    from cognee.shared.logging_utils import setup_logging, ERROR
+
+    setup_logging(log_level=ERROR)
+
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    with pytest.raises(IngestionError) as excinfo:
-        await cognee.add(
-            "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
-        )
-    assert excinfo.value.message.startswith(
-        "web_url_fetcher configuration must be a valid dictionary"
-    )
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
 
 
 @pytest.mark.asyncio
-async def test_add_succesfully_adds_url_when_fetcher_config_specified():
+async def test_saved_html_is_valid():
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        pytest.fail("Test case requires bs4 installed")
+
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        content = Path(file_path).read_text()
+        soup = BeautifulSoup(content, "html.parser")
+        assert soup.find() is not None, "File should contain parseable HTML"
+        has_html_elements = any(
+            [
+                soup.find("html"),
+                soup.find("head"),
+                soup.find("body"),
+                soup.find("div"),
+                soup.find("p"),
+            ]
+        )
+        assert has_html_elements, "File should contain common HTML elements"
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_add_url():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
+@pytest.mark.asyncio
+async def test_add_url_without_incremental_loading():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
 
     try:
         await cognee.add(
@@ -43,17 +82,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified():
 
 @pytest.mark.asyncio
-async def test_add_with_incremental_loading_works():
+async def test_add_url_with_incremental_loading():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
-
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
@@ -64,7 +96,7 @@ async def test_add_with_incremental_loading_works():
 
 @pytest.mark.asyncio
-async def test_add_without_incremental_loading_works():
+async def test_add_url_with_extraction_rules():  # TODO: this'll fail due to not implemented `load()` yet
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
@@ -78,7 +110,75 @@ async def test_add_without_incremental_loading_works():
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
+            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
         )
     except Exception as e:
         pytest.fail(f"Failed to add url: {e}")
+
+
+@pytest.mark.asyncio
+async def test_loader_is_none_by_default():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+        assert loader is None
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+        assert loader == bs_loader
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")

View file

@@ -23,6 +23,7 @@ async def main():
     await cognee.add(
         "https://en.wikipedia.org/wiki/Large_language_model",
         incremental_loading=False,
+        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
     )
 
     await cognee.cognify()
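
For completeness, a sketch of how the updated example's main() might read once the extraction_rules dict (not shown in this hunk, borrowed from the tests above) is defined. Whether the BeautifulSoup loader actually extracts anything still depends on its load() being implemented; only the cognee.add and cognee.cognify calls and their arguments come from this commit.

    import asyncio

    import cognee


    async def main():
        extraction_rules = {
            "title": {"selector": "title"},
            "headings": {"selector": "h1, h2, h3", "all": True},
            "links": {"selector": "a", "attr": "href", "all": True},
            "paragraphs": {"selector": "p", "all": True},
        }

        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
            incremental_loading=False,
            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
        )

        await cognee.cognify()


    if __name__ == "__main__":
        asyncio.run(main())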