From 03b4547b7f4e067c8dfed7259e7deff56049a170 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov
Date: Tue, 21 Oct 2025 20:10:43 +0100
Subject: [PATCH] validate e2e - URLs are saved as HTML, and loaders are
 selected correctly

---
 cognee/infrastructure/loaders/LoaderEngine.py |  11 ++
 .../loaders/external/beautiful_soup_loader.py |  21 ++-
 cognee/modules/ingestion/save_data_to_file.py |  11 +-
 cognee/tasks/ingestion/ingest_data.py         |   4 +
 .../ingestion/save_data_item_to_storage.py    |   6 +-
 cognee/tasks/web_scraper/config.py            |   1 -
 .../tasks/web_scraper/default_url_crawler.py  |   2 +-
 cognee/tasks/web_scraper/types.py             |   4 +
 cognee/tasks/web_scraper/utils.py             |   6 +-
 .../integration/web_url_crawler/test_add.py   | 160 ++++++++++++++----
 examples/python/web_url_fetcher_example.py    |   1 +
 11 files changed, 182 insertions(+), 45 deletions(-)
 create mode 100644 cognee/tasks/web_scraper/types.py

diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py
index 84ecee0de..1a47eea56 100644
--- a/cognee/infrastructure/loaders/LoaderEngine.py
+++ b/cognee/infrastructure/loaders/LoaderEngine.py
@@ -78,14 +78,21 @@ class LoaderEngine:
         Returns:
             LoaderInterface that can handle the file, or None if not found
         """
+        from pathlib import Path
+
         file_info = filetype.guess(file_path)
+        path_extension = Path(file_path).suffix.lstrip(".")
+
         # Try preferred loaders first
         if preferred_loaders:
             for loader_name in preferred_loaders:
                 if loader_name in self._loaders:
                     loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                     if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                         return loader
                 else:
@@ -95,6 +102,10 @@ class LoaderEngine:
         for loader_name in self.default_loader_priority:
             if loader_name in self._loaders:
                 loader = self._loaders[loader_name]
+                # Try with path extension first (for text formats like html)
+                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                    return loader
+                # Fall back to content-detected extension
                 if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                     return loader
             else:

diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
index ceea3f9de..05330a095 100644
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules.
 from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
-from cognee.infrastructure.loaders import LoaderInterface
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
 from cognee.shared.logging_utils import get_logger

 logger = get_logger(__name__)
@@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface):

     @property
     def supported_mime_types(self) -> List[str]:
-        return ["text/html"]
+        return ["text/html", "text/plain"]

     @property
     def loader_name(self) -> str:
         return "beautiful_soup_loader"

     def can_handle(self, extension: str, mime_type: str) -> bool:
-        return extension in self.supported_extensions() and mime_type in self.supported_mime_types()
+        return extension in self.supported_extensions and mime_type in self.supported_mime_types

     async def load(self, file_path: str, **kwargs):
-        pass
+        """Load an HTML file.
+
+        Content extraction is not implemented yet. For now, URLs are stored
+        as .html files and this loader only participates in loader
+        selection via can_handle.
+
+        Args:
+            file_path: Path to the HTML file
+            **kwargs: Additional arguments
+
+        Raises:
+            NotImplementedError: always, until extraction is implemented
+        """
+        raise NotImplementedError

     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.

diff --git a/cognee/modules/ingestion/save_data_to_file.py b/cognee/modules/ingestion/save_data_to_file.py
index 0ba0b2983..42e8d45ba 100644
--- a/cognee/modules/ingestion/save_data_to_file.py
+++ b/cognee/modules/ingestion/save_data_to_file.py
@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib


-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
     storage_config = get_storage_config()
     data_root_directory = storage_config["data_root_directory"]
@@ -21,6 +23,11 @@

     file_name = file_metadata["name"]

+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
     storage = get_file_storage(data_root_directory)
     full_file_path = await storage.store(file_name, data)

diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py
index 7b081cc34..25b2aa6ae 100644
--- a/cognee/tasks/ingestion/ingest_data.py
+++ b/cognee/tasks/ingestion/ingest_data.py
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@@ -88,6 +89,9 @@ async def ingest_data(
                 preferred_loaders,
             )

+            if loader_engine is None:
+                raise IngestionError("No loader could handle the given data item")
+
             # Find metadata from original file
             # Standard flow: extract metadata from both original and stored files
             async with open_data_file(original_file_path) as file:

diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py
index 453219f15..05d21e617 100644
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -8,7 +8,7 @@
 from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict

-from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher
+from cognee.tasks.web_scraper.utils import fetch_page_content

 logger = get_logger()
@@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
     if parsed_url.scheme == "s3":
         return data_item
     elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-        fetcher = WebUrlFetcher()
-        return await fetcher.fetch(data_item)
+        urls_to_page_contents = await fetch_page_content(data_item)
+        return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
     # data is local file path
     elif parsed_url.scheme == "file":
         if settings.accept_local_file_path:

diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py
index fcf22ab33..f23156f95 100644
--- a/cognee/tasks/web_scraper/config.py
+++ b/cognee/tasks/web_scraper/config.py
@@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel):
     max_retries: int = 2
     retry_delay_factor: float = 0.5
     headers: Optional[Dict[str, str]] = None
-    extraction_rules: Dict[str, Any]
     use_playwright: bool = False
     playwright_js_wait: float = 0.8
     robots_cache_ttl: float = 3600.0

diff --git a/cognee/tasks/web_scraper/default_url_crawler.py b/cognee/tasks/web_scraper/default_url_crawler.py
index d9d2ee922..d09bf3e80 100644
--- a/cognee/tasks/web_scraper/default_url_crawler.py
+++ b/cognee/tasks/web_scraper/default_url_crawler.py
@@ -7,7 +7,7 @@
 from urllib.parse import urlparse

 import httpx
 from cognee.shared.logging_utils import get_logger
-from cognee.tasks.web_scraper.utils import UrlsToHtmls
+from cognee.tasks.web_scraper.types import UrlsToHtmls

 logger = get_logger()

diff --git a/cognee/tasks/web_scraper/types.py b/cognee/tasks/web_scraper/types.py
new file mode 100644
index 000000000..54a3f5d42
--- /dev/null
+++ b/cognee/tasks/web_scraper/types.py
@@ -0,0 +1,4 @@
+from typing import TypeAlias
+
+
+UrlsToHtmls: TypeAlias = dict[str, str]

diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py
index b1cbf82e9..1f51bf98d 100644
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping
 """

 import os
-from re import L
-from typing import List, Union, TypeAlias
+from typing import List, Union

 from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls

 from .default_url_crawler import DefaultUrlCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig

 logger = get_logger(__name__)

-UrlsToHtmls: TypeAlias = dict[str, str]
-

 async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from one or more URLs using the specified tool.
diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py
index a00ca9e0d..27a627680 100644
--- a/cognee/tests/integration/web_url_crawler/test_add.py
+++ b/cognee/tests/integration/web_url_crawler/test_add.py
@@ -1,37 +1,76 @@
-from sys import exc_info
 import pytest
 import cognee
-from cognee.modules.ingestion.exceptions.exceptions import IngestionError
+from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
+from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
+from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
+from cognee.tasks.ingestion import save_data_item_to_storage
+from pathlib import Path


 @pytest.mark.asyncio
-async def test_add_fails_when_web_url_fetcher_config_not_specified():
-    from cognee.shared.logging_utils import setup_logging, ERROR
-
-    setup_logging(log_level=ERROR)
+async def test_url_saves_as_html_file():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
-    with pytest.raises(IngestionError) as excinfo:
-        await cognee.add(
-            "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
         )
-    assert excinfo.value.message.startswith(
-        "web_url_fetcher configuration must be a valid dictionary"
-    )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+    except Exception as e:
+        pytest.fail(f"URL was not saved as an .html file: {e}")


 @pytest.mark.asyncio
-async def test_add_succesfully_adds_url_when_fetcher_config_specified():
+async def test_saved_html_is_valid():
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        pytest.fail("Test case requires bs4 to be installed")
+
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }

+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        content = Path(file_path).read_text()
+
+        soup = BeautifulSoup(content, "html.parser")
+        assert soup.find() is not None, "File should contain parseable HTML"
+
+        has_html_elements = any(
+            [
+                soup.find("html"),
+                soup.find("head"),
+                soup.find("body"),
+                soup.find("div"),
+                soup.find("p"),
+            ]
+        )
+        assert has_html_elements, "File should contain common HTML elements"
+    except Exception as e:
+        pytest.fail(f"Saved HTML failed validation: {e}")
+
+
+@pytest.mark.asyncio
+async def test_add_url():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
+@pytest.mark.asyncio
+async def test_add_url_without_incremental_loading():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)

     try:
         await cognee.add(
@@ -43,17 +82,10 @@


 @pytest.mark.asyncio
-async def test_add_with_incremental_loading_works():
+async def test_add_url_with_incremental_loading():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)

-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
-
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
@@ -64,7 +96,7 @@


 @pytest.mark.asyncio
-async def test_add_without_incremental_loading_works():
+async def test_add_url_with_extraction_rules():  # TODO: this will fail until `load()` is implemented
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
@@ -78,7 +110,75 @@
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
+            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
         )
     except Exception as e:
         pytest.fail(f"Failed to add url: {e}")
+
+
+@pytest.mark.asyncio
+async def test_loader_is_none_by_default():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader is None
+    except Exception as e:
+        pytest.fail(f"Loader selection test failed: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader is bs_loader
+    except Exception as e:
+        pytest.fail(f"Loader selection test failed: {e}")

diff --git a/examples/python/web_url_fetcher_example.py b/examples/python/web_url_fetcher_example.py
index 2195a62c0..aff8094bf 100644
--- a/examples/python/web_url_fetcher_example.py
+++ b/examples/python/web_url_fetcher_example.py
@@ -23,6 +23,7 @@ async def main():
     await cognee.add(
         "https://en.wikipedia.org/wiki/Large_language_model",
         incremental_loading=False,
{"extraction_rules": extraction_rules}}, ) await cognee.cognify()