validate e2e - URLs are saved as HTML files, and loaders are selected correctly
parent f84e31c626
commit 03b4547b7f
11 changed files with 182 additions and 45 deletions
@@ -78,14 +78,21 @@ class LoaderEngine:
        Returns:
            LoaderInterface that can handle the file, or None if not found
        """
        from pathlib import Path

        file_info = filetype.guess(file_path)

        path_extension = Path(file_path).suffix.lstrip(".")

        # Try preferred loaders first
        if preferred_loaders:
            for loader_name in preferred_loaders:
                if loader_name in self._loaders:
                    loader = self._loaders[loader_name]
                    # Try with path extension first (for text formats like html)
                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
                        return loader
                    # Fall back to content-detected extension
                    if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                        return loader
                else:

@@ -95,6 +102,10 @@ class LoaderEngine:
        for loader_name in self.default_loader_priority:
            if loader_name in self._loaders:
                loader = self._loaders[loader_name]
                # Try with path extension first (for text formats like html)
                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
                    return loader
                # Fall back to content-detected extension
                if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                    return loader
            else:
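The path suffix is consulted before the detected type because content sniffing with the filetype library keys off binary magic numbers and generally cannot identify plain-text formats such as HTML. A standalone sketch of that ordering follows; pick_loader and the bare loaders dict are hypothetical stand-ins for illustration, not project code.

# Illustrative sketch of the selection order above, not the committed implementation.
from pathlib import Path

import filetype  # content-based detection via magic numbers


def pick_loader(file_path: str, loaders: dict):
    info = filetype.guess(file_path)  # returns None for plain-text formats such as HTML
    mime = info.mime if info else "text/plain"
    detected_ext = info.extension if info else ""
    path_ext = Path(file_path).suffix.lstrip(".")

    for loader in loaders.values():
        # Path extension first, so stored .html files reach HTML-capable loaders
        if loader.can_handle(extension=path_ext, mime_type=mime):
            return loader
        # Then fall back to the content-detected extension
        if loader.can_handle(extension=detected_ext, mime_type=mime):
            return loader
    return None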
@@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules.
from typing import Union, Dict, Any, Optional, List
from dataclasses import dataclass
from bs4 import BeautifulSoup
-from cognee.infrastructure.loaders import LoaderInterface
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
from cognee.shared.logging_utils import get_logger

logger = get_logger(__name__)

@@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface):
    @property
    def supported_mime_types(self) -> List[str]:
-        return ["text/html"]
+        return ["text/html", "text/plain"]

    @property
    def loader_name(self) -> str:
        return "beautiful_soup_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
-        return extension in self.supported_extensions() and mime_type in self.supported_mime_types()
+        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
+        return can

    async def load(self, file_path: str, **kwargs):
-        pass
+        """Load an HTML file and return its path.
+
+        For HTML files stored on disk, we simply return the file path
+        since the content is already in text format and can be processed directly.
+
+        Args:
+            file_path: Path to the HTML file
+            **kwargs: Additional arguments
+
+        Returns:
+            The file path to the HTML file
+        """
+        raise NotImplementedError

    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
        """Normalize an extraction rule to an ExtractionRule dataclass.
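The new docstring describes load() as simply handing back the stored HTML path, but the body is left raising NotImplementedError for now (the test file below carries a TODO about this). A minimal sketch of a body consistent with that docstring, written as a hypothetical free function under the assumption that no extra processing is needed:

# Minimal sketch only - the commit itself leaves BeautifulSoupLoader.load() unimplemented.
from pathlib import Path


async def load_html(file_path: str, **kwargs) -> str:
    path = Path(file_path)
    if not path.is_file():
        raise FileNotFoundError(f"HTML file not found: {file_path}")
    # The stored content is already text, so returning the path is enough
    # for downstream processing, per the docstring above.
    return file_path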
@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
from .classify import classify
import hashlib


-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
    storage_config = get_storage_config()

    data_root_directory = storage_config["data_root_directory"]

@@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):

    file_name = file_metadata["name"]

+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
    storage = get_file_storage(data_root_directory)

    full_file_path = await storage.store(file_name, data)
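The override only swaps the suffix of the already-derived storage name; the stem is untouched. A standalone illustration of that renaming rule, where the helper name and sample filename are invented for the example:

# Standalone illustration of the suffix-swap rule above; apply_extension is hypothetical.
from typing import Optional


def apply_extension(file_name: str, file_extension: Optional[str]) -> str:
    if file_extension is None:
        return file_name
    extension = file_extension.lstrip(".")
    file_name_without_ext = file_name.rsplit(".", 1)[0]
    return f"{file_name_without_ext}.{extension}"


print(apply_extension("a1b2c3d4.txt", ".html"))  # -> a1b2c3d4.html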
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
import cognee.modules.ingestion as ingestion
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets

@@ -88,6 +89,9 @@ async def ingest_data(
        preferred_loaders,
    )

+    if loader_engine is None:
+        raise IngestionError("Loader cannot be None")
+
    # Find metadata from original file
    # Standard flow: extract metadata from both original and stored files
    async with open_data_file(original_file_path) as file:
@@ -8,7 +8,7 @@ from cognee.modules.ingestion import save_data_to_file
from cognee.shared.logging_utils import get_logger
from pydantic_settings import BaseSettings, SettingsConfigDict

-from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher
+from cognee.tasks.web_scraper.utils import fetch_page_content


logger = get_logger()

@@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
    if parsed_url.scheme == "s3":
        return data_item
    elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-        fetcher = WebUrlFetcher()
-        return await fetcher.fetch(data_item)
+        urls_to_page_contents = await fetch_page_content(data_item)
+        return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
    # data is local file path
    elif parsed_url.scheme == "file":
        if settings.accept_local_file_path:
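In effect, an http(s) data item is now fetched, its HTML persisted to disk, and an .html path returned, so the path-extension check in the loader engine can route it to an HTML-capable loader. A rough usage sketch of that end-to-end behaviour, assuming the default local storage backend; it mirrors what the new tests further down assert.

# Rough sketch of the end-to-end effect of the http/https branch above.
from pathlib import Path

from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
from cognee.tasks.ingestion import save_data_item_to_storage


async def demo() -> None:
    stored = await save_data_item_to_storage("https://en.wikipedia.org/wiki/Large_language_model")
    path = Path(get_data_file_path(stored))
    # The page content is stored as an .html file on disk
    assert path.suffix == ".html" and path.stat().st_size > 0

# run with: asyncio.run(demo())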
@@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel):
    max_retries: int = 2
    retry_delay_factor: float = 0.5
    headers: Optional[Dict[str, str]] = None
-    extraction_rules: Dict[str, Any]
    use_playwright: bool = False
    playwright_js_wait: float = 0.8
    robots_cache_ttl: float = 3600.0
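With extraction_rules dropped from DefaultCrawlerConfig, extraction rules are supplied per loader via preferred_loaders instead. The call shape below matches the tests and example script later in this commit, with a trimmed rule set for brevity:

# Extraction rules now travel with the preferred loader, not the crawler config.
import cognee

extraction_rules = {
    "title": {"selector": "title"},
    "paragraphs": {"selector": "p", "all": True},
}


async def add_with_rules() -> None:
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
    )

# run with: asyncio.run(add_with_rules())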
@@ -7,7 +7,7 @@ from urllib.parse import urlparse
import httpx

from cognee.shared.logging_utils import get_logger
-from cognee.tasks.web_scraper.utils import UrlsToHtmls
+from cognee.tasks.web_scraper.types import UrlsToHtmls

logger = get_logger()
cognee/tasks/web_scraper/types.py (new file)
@@ -0,0 +1,4 @@
+from typing import TypeAlias
+
+
+UrlsToHtmls: TypeAlias = dict[str, str]
@@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping
"""

import os
-from re import L
-from typing import List, Union, TypeAlias
+from typing import List, Union
from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
from .default_url_crawler import DefaultUrlCrawler
from .config import DefaultCrawlerConfig, TavilyConfig

logger = get_logger(__name__)

-UrlsToHtmls: TypeAlias = dict[str, str]
-

async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
    """Fetch content from one or more URLs using the specified tool.
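For reference, the public helper keeps its signature: one URL or a list of URLs in, a UrlsToHtmls mapping out. A small usage sketch under that assumption; the URLs are illustrative:

# fetch_page_content resolves to a UrlsToHtmls mapping (dict[str, str]).
from cognee.tasks.web_scraper.utils import fetch_page_content


async def demo() -> None:
    pages = await fetch_page_content(["https://example.com", "https://example.org"])
    for url, html in pages.items():
        print(url, len(html), "characters of HTML")

# run with: asyncio.run(demo())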
@@ -1,37 +1,76 @@
from sys import exc_info
import pytest
import cognee
-from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
from cognee.tasks.ingestion import save_data_item_to_storage
from pathlib import Path


@pytest.mark.asyncio
-async def test_add_fails_when_web_url_fetcher_config_not_specified():
-    from cognee.shared.logging_utils import setup_logging, ERROR
-
-    setup_logging(log_level=ERROR)
+async def test_url_saves_as_html_file():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
-    with pytest.raises(IngestionError) as excinfo:
-        await cognee.add(
-            "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
-    assert excinfo.value.message.startswith(
-        "web_url_fetcher configuration must be a valid dictionary"
-    )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")


@pytest.mark.asyncio
-async def test_add_succesfully_adds_url_when_fetcher_config_specified():
+async def test_saved_html_is_valid():
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        pytest.fail("Test case requires bs4 installed")

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        content = Path(file_path).read_text()

        soup = BeautifulSoup(content, "html.parser")
        assert soup.find() is not None, "File should contain parseable HTML"

        has_html_elements = any(
            [
                soup.find("html"),
                soup.find("head"),
                soup.find("body"),
                soup.find("div"),
                soup.find("p"),
            ]
        )
        assert has_html_elements, "File should contain common HTML elements"
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")


@pytest.mark.asyncio
async def test_add_url():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")


@pytest.mark.asyncio
async def test_add_url_without_incremental_loading():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    try:
        await cognee.add(
@@ -43,17 +82,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified():


@pytest.mark.asyncio
-async def test_add_with_incremental_loading_works():
+async def test_add_url_with_incremental_loading():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
-
    try:
        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
@@ -64,7 +96,7 @@ async def test_add_with_incremental_loading_works():


@pytest.mark.asyncio
-async def test_add_without_incremental_loading_works():
+async def test_add_url_with_extraction_rules():  # TODO: this will fail until load() is implemented
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

@@ -78,7 +110,75 @@ async def test_add_without_incremental_loading_works():
    try:
        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
            incremental_loading=False,
+            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
        )
    except Exception as e:
        pytest.fail(f"Failed to add url: {e}")


@pytest.mark.asyncio
async def test_loader_is_none_by_default():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0

        loader_engine = LoaderEngine()
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )

        assert loader is None
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")


@pytest.mark.asyncio
async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0

        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )

        assert loader == bs_loader
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
@@ -23,6 +23,7 @@ async def main():
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        incremental_loading=False,
+        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
    )

    await cognee.cognify()