validate e2e - URLs are saved as HTML, and loaders are selected correctly

Daulet Amirkhanov 2025-10-21 20:10:43 +01:00
parent f84e31c626
commit 03b4547b7f
11 changed files with 182 additions and 45 deletions

View file

@@ -78,14 +78,21 @@ class LoaderEngine:
         Returns:
             LoaderInterface that can handle the file, or None if not found
         """
+        from pathlib import Path
+
         file_info = filetype.guess(file_path)
+        path_extension = Path(file_path).suffix.lstrip(".")
 
         # Try preferred loaders first
         if preferred_loaders:
             for loader_name in preferred_loaders:
                 if loader_name in self._loaders:
                     loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                     if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                         return loader
         else:
@@ -95,6 +102,10 @@ class LoaderEngine:
         for loader_name in self.default_loader_priority:
             if loader_name in self._loaders:
                 loader = self._loaders[loader_name]
+                # Try with path extension first (for text formats like html)
+                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                    return loader
+                # Fall back to content-detected extension
                 if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                     return loader
         else:
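
A minimal sketch of the selection order this hunk introduces: the path extension is checked first (content sniffing usually cannot identify text formats such as HTML), then the content-detected extension. The helper name resolve_loader and the None guard on file_info are illustrative assumptions, not code from the repository; only the can_handle(extension=..., mime_type=...) contract is taken from the diff.

    from pathlib import Path
    from typing import Optional, Sequence

    import filetype  # the same content-based detection library LoaderEngine uses


    def resolve_loader(file_path: str, loaders: Sequence) -> Optional[object]:
        """Hypothetical helper mirroring the new lookup order:
        path extension first, content-detected extension as a fallback."""
        file_info = filetype.guess(file_path)
        path_extension = Path(file_path).suffix.lstrip(".")
        mime_type = file_info.mime if file_info is not None else None

        for loader in loaders:
            # 1) Trust the path extension: text formats such as .html are often
            #    invisible to content sniffing, but the suffix identifies them.
            if loader.can_handle(extension=path_extension, mime_type=mime_type):
                return loader
            # 2) Fall back to the extension detected from the file's content.
            if file_info is not None and loader.can_handle(
                extension=file_info.extension, mime_type=mime_type
            ):
                return loader
        return None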

View file

@@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules.
 from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
-from cognee.infrastructure.loaders import LoaderInterface
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
 from cognee.shared.logging_utils import get_logger
 
 logger = get_logger(__name__)
@@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface):
     @property
     def supported_mime_types(self) -> List[str]:
-        return ["text/html"]
+        return ["text/html", "text/plain"]
 
     @property
     def loader_name(self) -> str:
         return "beautiful_soup_loader"
 
     def can_handle(self, extension: str, mime_type: str) -> bool:
-        return extension in self.supported_extensions() and mime_type in self.supported_mime_types()
+        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
+        return can
 
     async def load(self, file_path: str, **kwargs):
-        pass
+        """Load an HTML file and return its path.
+
+        For HTML files stored on disk, we simply return the file path
+        since the content is already in text format and can be processed directly.
+
+        Args:
+            file_path: Path to the HTML file
+            **kwargs: Additional arguments
+
+        Returns:
+            The file path to the HTML file
+        """
+        raise NotImplementedError
 
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.

View file

@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib
 
 
-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
     storage_config = get_storage_config()
     data_root_directory = storage_config["data_root_directory"]
@@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
     file_name = file_metadata["name"]
 
+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
     storage = get_file_storage(data_root_directory)
     full_file_path = await storage.store(file_name, data)
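
A sketch of how the new file_extension argument changes the stored name, assuming a raw HTML string payload. The generated base name comes from classify/file metadata, so the concrete paths here are illustrative; only the signature and the .html suffix behaviour are taken from this hunk.

    import asyncio

    from cognee.modules.ingestion import save_data_to_file


    async def demo():
        html = "<html><head><title>Example</title></head><body><p>Hi</p></body></html>"

        # Without file_extension the stored name keeps whatever extension the
        # classifier derives from the data.
        default_path = await save_data_to_file(html)

        # With file_extension="html" the derived extension is replaced, so the
        # stored file ends in .html and loaders can match it by path extension.
        html_path = await save_data_to_file(html, file_extension="html")
        assert html_path.endswith(".html")
        return default_path, html_path


    asyncio.run(demo())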

View file

@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@@ -88,6 +89,9 @@ async def ingest_data(
             preferred_loaders,
         )
 
+        if loader_engine is None:
+            raise IngestionError("Loader cannot be None")
+
         # Find metadata from original file
         # Standard flow: extract metadata from both original and stored files
         async with open_data_file(original_file_path) as file:
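
For orientation, a compressed and hypothetical sketch of where the new guard sits: only the None check and the IngestionError import come from this hunk; the surrounding names (ingest_one, the load() call) are placeholders, not the real ingest_data flow.

    from cognee.modules.ingestion.exceptions import IngestionError


    async def ingest_one(loader_engine, file_path, preferred_loaders):
        # The real ingest_data resolves a loader roughly like this ...
        loader = loader_engine.get_loader(file_path, preferred_loaders=preferred_loaders)

        # ... and now fails loudly instead of passing None further down the pipeline.
        if loader is None:
            raise IngestionError("Loader cannot be None")

        return await loader.load(file_path)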

View file

@@ -8,7 +8,7 @@ from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher
+from cognee.tasks.web_scraper.utils import fetch_page_content
 
 logger = get_logger()
@@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
     if parsed_url.scheme == "s3":
         return data_item
     elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-        fetcher = WebUrlFetcher()
-        return await fetcher.fetch(data_item)
+        urls_to_page_contents = await fetch_page_content(data_item)
+        return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
     # data is local file path
     elif parsed_url.scheme == "file":
         if settings.accept_local_file_path:
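
Taken together with save_data_to_file above, the http/https branch now persists the fetched page as an .html file instead of delegating to WebUrlFetcher. A sketch of the same flow driven directly, assuming network access to the example URL; both calls and their signatures are taken from this commit.

    import asyncio

    from cognee.modules.ingestion import save_data_to_file
    from cognee.tasks.web_scraper.utils import fetch_page_content


    async def store_url(url: str) -> str:
        # fetch_page_content accepts a single URL or a list and returns
        # a UrlsToHtmls mapping (dict[str, str]) keyed by URL.
        urls_to_page_contents = await fetch_page_content(url)

        # Persist the raw HTML; file_extension="html" makes the stored file
        # end in .html so the loader engine can match it by path extension.
        return await save_data_to_file(urls_to_page_contents[url], file_extension="html")


    path = asyncio.run(store_url("https://en.wikipedia.org/wiki/Large_language_model"))
    print(path)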

View file

@@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel):
     max_retries: int = 2
     retry_delay_factor: float = 0.5
     headers: Optional[Dict[str, str]] = None
-    extraction_rules: Dict[str, Any]
     use_playwright: bool = False
     playwright_js_wait: float = 0.8
     robots_cache_ttl: float = 3600.0
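
extraction_rules is dropped from the crawler config; extraction rules now travel with the loader via preferred_loaders (see the tests and example below). A sketch constructing the slimmed-down config, using only fields visible in this hunk and assuming the module path cognee.tasks.web_scraper.config implied by the relative import in utils.

    from cognee.tasks.web_scraper.config import DefaultCrawlerConfig

    # Only crawl/transport concerns remain on the crawler config.
    config = DefaultCrawlerConfig(
        max_retries=3,
        retry_delay_factor=0.5,
        headers={"User-Agent": "cognee-e2e-test"},
        use_playwright=False,
    )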

View file

@@ -7,7 +7,7 @@ from urllib.parse import urlparse
 import httpx
 
 from cognee.shared.logging_utils import get_logger
-from cognee.tasks.web_scraper.utils import UrlsToHtmls
+from cognee.tasks.web_scraper.types import UrlsToHtmls
 
 logger = get_logger()

View file

@@ -0,0 +1,4 @@
+from typing import TypeAlias
+
+UrlsToHtmls: TypeAlias = dict[str, str]

View file

@@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping
 """
 
 import os
-from re import L
-from typing import List, Union, TypeAlias
+from typing import List, Union
 
 from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
 from .default_url_crawler import DefaultUrlCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig
 
 logger = get_logger(__name__)
 
-UrlsToHtmls: TypeAlias = dict[str, str]
-
 async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from one or more URLs using the specified tool.

View file

@@ -1,37 +1,76 @@
-from sys import exc_info
 import pytest
 import cognee
-from cognee.modules.ingestion.exceptions.exceptions import IngestionError
+from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
+from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
+from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
+from cognee.tasks.ingestion import save_data_item_to_storage
+from pathlib import Path
 
 
 @pytest.mark.asyncio
-async def test_add_fails_when_web_url_fetcher_config_not_specified():
+async def test_url_saves_as_html_file():
+    from cognee.shared.logging_utils import setup_logging, ERROR
+
+    setup_logging(log_level=ERROR)
+
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    with pytest.raises(IngestionError) as excinfo:
-        await cognee.add(
-            "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
-        )
-    assert excinfo.value.message.startswith(
-        "web_url_fetcher configuration must be a valid dictionary"
-    )
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
 
 
 @pytest.mark.asyncio
-async def test_add_succesfully_adds_url_when_fetcher_config_specified():
+async def test_saved_html_is_valid():
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        pytest.fail("Test case requires bs4 installed")
+
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        content = Path(file_path).read_text()
+        soup = BeautifulSoup(content, "html.parser")
+        assert soup.find() is not None, "File should contain parseable HTML"
+        has_html_elements = any(
+            [
+                soup.find("html"),
+                soup.find("head"),
+                soup.find("body"),
+                soup.find("div"),
+                soup.find("p"),
+            ]
+        )
+        assert has_html_elements, "File should contain common HTML elements"
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_add_url():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
+@pytest.mark.asyncio
+async def test_add_url_without_incremental_loading():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
 
     try:
         await cognee.add(
@@ -43,17 +82,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified():
 
 @pytest.mark.asyncio
-async def test_add_with_incremental_loading_works():
+async def test_add_url_with_incremental_loading():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
-
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
@@ -64,7 +96,7 @@ async def test_add_with_incremental_loading_works():
 
 @pytest.mark.asyncio
-async def test_add_without_incremental_loading_works():
+async def test_add_url_with_extraction_rules():  # TODO: this'll fail due to not implemented `load()` yet
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
@@ -78,7 +110,75 @@ async def test_add_without_incremental_loading_works():
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
+            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
         )
     except Exception as e:
         pytest.fail(f"Failed to add url: {e}")
+
+
+@pytest.mark.asyncio
+async def test_loader_is_none_by_default():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+        assert loader is None
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+        assert loader == bs_loader
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")

View file

@@ -23,6 +23,7 @@ async def main():
     await cognee.add(
         "https://en.wikipedia.org/wiki/Large_language_model",
         incremental_loading=False,
+        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
     )
 
     await cognee.cognify()
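
For completeness, a sketch of how the updated example's main() might read once the extraction_rules dict (not shown in this hunk, borrowed from the tests above) is defined. Whether the BeautifulSoup loader actually extracts anything still depends on its load() being implemented; only the cognee.add and cognee.cognify calls and their arguments come from this commit.

    import asyncio

    import cognee


    async def main():
        extraction_rules = {
            "title": {"selector": "title"},
            "headings": {"selector": "h1, h2, h3", "all": True},
            "links": {"selector": "a", "attr": "href", "all": True},
            "paragraphs": {"selector": "p", "all": True},
        }

        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
            incremental_loading=False,
            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
        )

        await cognee.cognify()


    if __name__ == "__main__":
        asyncio.run(main())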