validate e2e - URLs are saved as HTML files, and loaders are selected correctly
parent f84e31c626
commit 03b4547b7f
11 changed files with 182 additions and 45 deletions
@@ -78,14 +78,21 @@ class LoaderEngine:
        Returns:
            LoaderInterface that can handle the file, or None if not found
        """
        from pathlib import Path

        file_info = filetype.guess(file_path)

        path_extension = Path(file_path).suffix.lstrip(".")

        # Try preferred loaders first
        if preferred_loaders:
            for loader_name in preferred_loaders:
                if loader_name in self._loaders:
                    loader = self._loaders[loader_name]
                    # Try with path extension first (for text formats like html)
                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
                        return loader
                    # Fall back to content-detected extension
                    if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                        return loader
                else:

@@ -95,6 +102,10 @@ class LoaderEngine:
        for loader_name in self.default_loader_priority:
            if loader_name in self._loaders:
                loader = self._loaders[loader_name]
                # Try with path extension first (for text formats like html)
                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
                    return loader
                # Fall back to content-detected extension
                if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                    return loader
            else:
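The path suffix is consulted before the detected type because content sniffing with the filetype library keys off binary magic numbers and generally cannot identify plain-text formats such as HTML. A standalone sketch of that ordering follows; pick_loader and the bare loaders dict are hypothetical stand-ins for illustration, not project code.

# Illustrative sketch of the selection order above, not the committed implementation.
from pathlib import Path

import filetype  # content-based detection via magic numbers


def pick_loader(file_path: str, loaders: dict):
    info = filetype.guess(file_path)  # returns None for plain-text formats such as HTML
    mime = info.mime if info else "text/plain"
    detected_ext = info.extension if info else ""
    path_ext = Path(file_path).suffix.lstrip(".")

    for loader in loaders.values():
        # Path extension first, so stored .html files reach HTML-capable loaders
        if loader.can_handle(extension=path_ext, mime_type=mime):
            return loader
        # Then fall back to the content-detected extension
        if loader.can_handle(extension=detected_ext, mime_type=mime):
            return loader
    return None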
@@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules.
from typing import Union, Dict, Any, Optional, List
from dataclasses import dataclass
from bs4 import BeautifulSoup
-from cognee.infrastructure.loaders import LoaderInterface
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
from cognee.shared.logging_utils import get_logger

logger = get_logger(__name__)

@@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface):
    @property
    def supported_mime_types(self) -> List[str]:
-        return ["text/html"]
+        return ["text/html", "text/plain"]

    @property
    def loader_name(self) -> str:
        return "beautiful_soup_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
-        return extension in self.supported_extensions() and mime_type in self.supported_mime_types()
+        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
+        return can

    async def load(self, file_path: str, **kwargs):
-        pass
+        """Load an HTML file and return its path.
+
+        For HTML files stored on disk, we simply return the file path
+        since the content is already in text format and can be processed directly.
+
+        Args:
+            file_path: Path to the HTML file
+            **kwargs: Additional arguments
+
+        Returns:
+            The file path to the HTML file
+        """
+        raise NotImplementedError

    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
        """Normalize an extraction rule to an ExtractionRule dataclass.
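The new docstring describes load() as simply handing back the stored HTML path, but the body is left raising NotImplementedError for now (the test file below carries a TODO about this). A minimal sketch of a body consistent with that docstring, written as a hypothetical free function under the assumption that no extra processing is needed:

# Minimal sketch only - the commit itself leaves BeautifulSoupLoader.load() unimplemented.
from pathlib import Path


async def load_html(file_path: str, **kwargs) -> str:
    path = Path(file_path)
    if not path.is_file():
        raise FileNotFoundError(f"HTML file not found: {file_path}")
    # The stored content is already text, so returning the path is enough
    # for downstream processing, per the docstring above.
    return file_path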
@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
from .classify import classify
import hashlib


-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
    storage_config = get_storage_config()

    data_root_directory = storage_config["data_root_directory"]

@@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):

    file_name = file_metadata["name"]

+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
    storage = get_file_storage(data_root_directory)

    full_file_path = await storage.store(file_name, data)
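The override only swaps the suffix of the already-derived storage name; the stem is untouched. A standalone illustration of that renaming rule, where the helper name and sample filename are invented for the example:

# Standalone illustration of the suffix-swap rule above; apply_extension is hypothetical.
from typing import Optional


def apply_extension(file_name: str, file_extension: Optional[str]) -> str:
    if file_extension is None:
        return file_name
    extension = file_extension.lstrip(".")
    file_name_without_ext = file_name.rsplit(".", 1)[0]
    return f"{file_name_without_ext}.{extension}"


print(apply_extension("a1b2c3d4.txt", ".html"))  # -> a1b2c3d4.html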
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
import cognee.modules.ingestion as ingestion
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets

@@ -88,6 +89,9 @@ async def ingest_data(
        preferred_loaders,
    )

+    if loader_engine is None:
+        raise IngestionError("Loader cannot be None")
+
    # Find metadata from original file
    # Standard flow: extract metadata from both original and stored files
    async with open_data_file(original_file_path) as file:
@@ -8,7 +8,7 @@ from cognee.modules.ingestion import save_data_to_file
from cognee.shared.logging_utils import get_logger
from pydantic_settings import BaseSettings, SettingsConfigDict

-from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher
+from cognee.tasks.web_scraper.utils import fetch_page_content


logger = get_logger()

@@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
    if parsed_url.scheme == "s3":
        return data_item
    elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-        fetcher = WebUrlFetcher()
-        return await fetcher.fetch(data_item)
+        urls_to_page_contents = await fetch_page_content(data_item)
+        return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
    # data is local file path
    elif parsed_url.scheme == "file":
        if settings.accept_local_file_path:
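In effect, an http(s) data item is now fetched, its HTML persisted to disk, and an .html path returned, so the path-extension check in the loader engine can route it to an HTML-capable loader. A rough usage sketch of that end-to-end behaviour, assuming the default local storage backend; it mirrors what the new tests further down assert.

# Rough sketch of the end-to-end effect of the http/https branch above.
from pathlib import Path

from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
from cognee.tasks.ingestion import save_data_item_to_storage


async def demo() -> None:
    stored = await save_data_item_to_storage("https://en.wikipedia.org/wiki/Large_language_model")
    path = Path(get_data_file_path(stored))
    # The page content is stored as an .html file on disk
    assert path.suffix == ".html" and path.stat().st_size > 0

# run with: asyncio.run(demo())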
@@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel):
    max_retries: int = 2
    retry_delay_factor: float = 0.5
    headers: Optional[Dict[str, str]] = None
-    extraction_rules: Dict[str, Any]
    use_playwright: bool = False
    playwright_js_wait: float = 0.8
    robots_cache_ttl: float = 3600.0
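With extraction_rules dropped from DefaultCrawlerConfig, extraction rules are supplied per loader via preferred_loaders instead. The call shape below matches the tests and example script later in this commit, with a trimmed rule set for brevity:

# Extraction rules now travel with the preferred loader, not the crawler config.
import cognee

extraction_rules = {
    "title": {"selector": "title"},
    "paragraphs": {"selector": "p", "all": True},
}


async def add_with_rules() -> None:
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
    )

# run with: asyncio.run(add_with_rules())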
@@ -7,7 +7,7 @@ from urllib.parse import urlparse
import httpx

from cognee.shared.logging_utils import get_logger
-from cognee.tasks.web_scraper.utils import UrlsToHtmls
+from cognee.tasks.web_scraper.types import UrlsToHtmls

logger = get_logger()
cognee/tasks/web_scraper/types.py (new file)
@@ -0,0 +1,4 @@
+from typing import TypeAlias
+
+
+UrlsToHtmls: TypeAlias = dict[str, str]
@@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping
"""

import os
-from re import L
-from typing import List, Union, TypeAlias
+from typing import List, Union
from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
from .default_url_crawler import DefaultUrlCrawler
from .config import DefaultCrawlerConfig, TavilyConfig

logger = get_logger(__name__)

-UrlsToHtmls: TypeAlias = dict[str, str]
-

async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
    """Fetch content from one or more URLs using the specified tool.
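For reference, the public helper keeps its signature: one URL or a list of URLs in, a UrlsToHtmls mapping out. A small usage sketch under that assumption; the URLs are illustrative:

# fetch_page_content resolves to a UrlsToHtmls mapping (dict[str, str]).
from cognee.tasks.web_scraper.utils import fetch_page_content


async def demo() -> None:
    pages = await fetch_page_content(["https://example.com", "https://example.org"])
    for url, html in pages.items():
        print(url, len(html), "characters of HTML")

# run with: asyncio.run(demo())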
@@ -1,37 +1,76 @@
from sys import exc_info
import pytest
import cognee
-from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
from cognee.tasks.ingestion import save_data_item_to_storage
from pathlib import Path


@pytest.mark.asyncio
-async def test_add_fails_when_web_url_fetcher_config_not_specified():
-    from cognee.shared.logging_utils import setup_logging, ERROR
-
-    setup_logging(log_level=ERROR)
+async def test_url_saves_as_html_file():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
-    with pytest.raises(IngestionError) as excinfo:
-        await cognee.add(
-            "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
-    assert excinfo.value.message.startswith(
-        "web_url_fetcher configuration must be a valid dictionary"
-    )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")


@pytest.mark.asyncio
-async def test_add_succesfully_adds_url_when_fetcher_config_specified():
+async def test_saved_html_is_valid():
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        pytest.fail("Test case requires bs4 installed")

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        content = Path(file_path).read_text()

        soup = BeautifulSoup(content, "html.parser")
        assert soup.find() is not None, "File should contain parseable HTML"

        has_html_elements = any(
            [
                soup.find("html"),
                soup.find("head"),
                soup.find("body"),
                soup.find("div"),
                soup.find("p"),
            ]
        )
        assert has_html_elements, "File should contain common HTML elements"
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")


@pytest.mark.asyncio
async def test_add_url():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")


@pytest.mark.asyncio
async def test_add_url_without_incremental_loading():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    try:
        await cognee.add(
@@ -43,17 +82,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified():


@pytest.mark.asyncio
-async def test_add_with_incremental_loading_works():
+async def test_add_url_with_incremental_loading():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
-
    try:
        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
@@ -64,7 +96,7 @@ async def test_add_with_incremental_loading_works():


@pytest.mark.asyncio
-async def test_add_without_incremental_loading_works():
+async def test_add_url_with_extraction_rules():  # TODO: this will fail until load() is implemented
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

@@ -78,7 +110,75 @@ async def test_add_without_incremental_loading_works():
    try:
        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
            incremental_loading=False,
+            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
        )
    except Exception as e:
        pytest.fail(f"Failed to add url: {e}")


@pytest.mark.asyncio
async def test_loader_is_none_by_default():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0

        loader_engine = LoaderEngine()
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )

        assert loader is None
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")


@pytest.mark.asyncio
async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0

        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )

        assert loader == bs_loader
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
@@ -23,6 +23,7 @@ async def main():
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        incremental_loading=False,
+        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
    )

    await cognee.cognify()