validate e2e - urls are saved as htmls, and loaders are selected correctly
parent f84e31c626
commit 03b4547b7f
11 changed files with 182 additions and 45 deletions
@@ -78,14 +78,21 @@ class LoaderEngine:
         Returns:
             LoaderInterface that can handle the file, or None if not found
         """
+        from pathlib import Path

         file_info = filetype.guess(file_path)

+        path_extension = Path(file_path).suffix.lstrip(".")
+
         # Try preferred loaders first
         if preferred_loaders:
             for loader_name in preferred_loaders:
                 if loader_name in self._loaders:
                     loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                     if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                         return loader
         else:
@@ -95,6 +102,10 @@ class LoaderEngine:
             for loader_name in self.default_loader_priority:
                 if loader_name in self._loaders:
                     loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                     if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                         return loader
         else:
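Since a fetched page is now stored with a real ".html" suffix (see the save_data_to_file change further down), the selection above consults the path extension before the content-detected one, so an HTML file is not mis-routed when content sniffing reports something generic. A minimal, self-contained sketch of that ordering, using a stub loader and hypothetical detection values in place of filetype's result:

from pathlib import Path


class StubHtmlLoader:
    # Stand-in for a registered loader; only the can_handle contract matters here.
    supported_extensions = ["html", "htm"]
    supported_mime_types = ["text/html", "text/plain"]

    def can_handle(self, extension: str, mime_type: str) -> bool:
        return extension in self.supported_extensions and mime_type in self.supported_mime_types


def pick_loader(file_path, detected_extension, detected_mime, loaders):
    # Mirrors the order introduced above: path extension first, then detected extension.
    path_extension = Path(file_path).suffix.lstrip(".")
    for loader in loaders:
        if loader.can_handle(extension=path_extension, mime_type=detected_mime):
            return loader
        if loader.can_handle(extension=detected_extension, mime_type=detected_mime):
            return loader
    return None


# Hypothetical values: the stored path ends in .html while sniffing reports plain text.
loader = pick_loader("/data/page.html", "txt", "text/plain", [StubHtmlLoader()])
assert isinstance(loader, StubHtmlLoader)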
@@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules.
 from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
-from cognee.infrastructure.loaders import LoaderInterface
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
 from cognee.shared.logging_utils import get_logger

 logger = get_logger(__name__)
@@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface):

     @property
     def supported_mime_types(self) -> List[str]:
-        return ["text/html"]
+        return ["text/html", "text/plain"]

     @property
     def loader_name(self) -> str:
         return "beautiful_soup_loader"

     def can_handle(self, extension: str, mime_type: str) -> bool:
-        return extension in self.supported_extensions() and mime_type in self.supported_mime_types()
+        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
+        return can

     async def load(self, file_path: str, **kwargs):
-        pass
+        """Load an HTML file and return its path.
+
+        For HTML files stored on disk, we simply return the file path
+        since the content is already in text format and can be processed directly.
+
+        Args:
+            file_path: Path to the HTML file
+            **kwargs: Additional arguments
+
+        Returns:
+            The file path to the HTML file
+        """
+        raise NotImplementedError

     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
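The can_handle change also drops the call parentheses: supported_mime_types is declared as a @property above (and supported_extensions presumably is too), so the attribute already evaluates to a list, and calling it would attempt to call the list itself. A small illustration of why the old form breaks:

class Demo:
    @property
    def supported_mime_types(self) -> list:
        return ["text/html", "text/plain"]


demo = Demo()
assert "text/html" in demo.supported_mime_types  # property access yields the list

try:
    demo.supported_mime_types()  # old style: the parentheses call the returned list
except TypeError as error:
    print(error)  # 'list' object is not callable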
@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib


-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
     storage_config = get_storage_config()

     data_root_directory = storage_config["data_root_directory"]
@@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):

     file_name = file_metadata["name"]

+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
     storage = get_file_storage(data_root_directory)

     full_file_path = await storage.store(file_name, data)
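The new file_extension parameter only rewrites the stored file name; the surrounding metadata and storage.store flow shown in the hunk is untouched. A standalone sketch of the renaming rule added above, with apply_extension as a hypothetical helper used purely for illustration:

from typing import Optional


def apply_extension(file_name: str, file_extension: Optional[str]) -> str:
    if file_extension is None:
        return file_name
    extension = file_extension.lstrip(".")               # accepts "html" or ".html"
    file_name_without_ext = file_name.rsplit(".", 1)[0]  # drop the current suffix
    return f"{file_name_without_ext}.{extension}"


assert apply_extension("text_abc123.txt", "html") == "text_abc123.html"
assert apply_extension("text_abc123.txt", ".html") == "text_abc123.html"
assert apply_extension("text_abc123.txt", None) == "text_abc123.txt"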
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@@ -88,6 +89,9 @@ async def ingest_data(
         preferred_loaders,
     )

+    if loader_engine is None:
+        raise IngestionError("Loader cannot be None")
+
     # Find metadata from original file
     # Standard flow: extract metadata from both original and stored files
     async with open_data_file(original_file_path) as file:
@@ -8,7 +8,7 @@ from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict

-from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher
+from cognee.tasks.web_scraper.utils import fetch_page_content


 logger = get_logger()
@@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
     if parsed_url.scheme == "s3":
         return data_item
     elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-        fetcher = WebUrlFetcher()
-        return await fetcher.fetch(data_item)
+        urls_to_page_contents = await fetch_page_content(data_item)
+        return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
     # data is local file path
     elif parsed_url.scheme == "file":
         if settings.accept_local_file_path:
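An http(s) data item is now downloaded through fetch_page_content, looked up by its own URL in the returned mapping, and persisted with file_extension="html", which is what gives the stored copy the .html suffix the loader selection depends on. A self-contained sketch of that flow, where stub_fetch and stub_save are hypothetical stand-ins for fetch_page_content and save_data_to_file:

import asyncio


async def stub_fetch(url: str) -> dict:
    # fetch_page_content is expected to return a URL -> HTML mapping (UrlsToHtmls).
    return {url: "<html><body><p>stub page</p></body></html>"}


async def stub_save(data: str, file_extension: str) -> str:
    # save_data_to_file stores the content under a generated name; only the suffix matters here.
    return f"/data/text_stub.{file_extension.lstrip('.')}"


async def save_url(data_item: str) -> str:
    urls_to_page_contents = await stub_fetch(data_item)
    return await stub_save(urls_to_page_contents[data_item], file_extension="html")


print(asyncio.run(save_url("https://en.wikipedia.org/wiki/Large_language_model")))  # /data/text_stub.html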
@@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel):
     max_retries: int = 2
     retry_delay_factor: float = 0.5
     headers: Optional[Dict[str, str]] = None
-    extraction_rules: Dict[str, Any]
     use_playwright: bool = False
     playwright_js_wait: float = 0.8
     robots_cache_ttl: float = 3600.0
@@ -7,7 +7,7 @@ from urllib.parse import urlparse
 import httpx

 from cognee.shared.logging_utils import get_logger
-from cognee.tasks.web_scraper.utils import UrlsToHtmls
+from cognee.tasks.web_scraper.types import UrlsToHtmls

 logger = get_logger()
cognee/tasks/web_scraper/types.py (new file, 4 lines)
@@ -0,0 +1,4 @@
+from typing import TypeAlias
+
+
+UrlsToHtmls: TypeAlias = dict[str, str]
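Both the crawler and the utils module now import this alias from one shared place instead of each defining it locally. A tiny usage sketch with illustrative values only:

from typing import TypeAlias

UrlsToHtmls: TypeAlias = dict[str, str]  # same definition as the new module

pages: UrlsToHtmls = {
    "https://en.wikipedia.org/wiki/Large_language_model": "<html>...</html>",
}
assert all(isinstance(url, str) and isinstance(html, str) for url, html in pages.items())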
@@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping
 """

 import os
-from re import L
-from typing import List, Union, TypeAlias
+from typing import List, Union

 from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
 from .default_url_crawler import DefaultUrlCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig

 logger = get_logger(__name__)

-UrlsToHtmls: TypeAlias = dict[str, str]
-

 async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from one or more URLs using the specified tool.
@@ -1,37 +1,76 @@
-from sys import exc_info
 import pytest
 import cognee
-from cognee.modules.ingestion.exceptions.exceptions import IngestionError
+from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
+from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
+from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
+from cognee.tasks.ingestion import save_data_item_to_storage
+from pathlib import Path


 @pytest.mark.asyncio
-async def test_add_fails_when_web_url_fetcher_config_not_specified():
-    from cognee.shared.logging_utils import setup_logging, ERROR
-
-    setup_logging(log_level=ERROR)
+async def test_url_saves_as_html_file():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
-    with pytest.raises(IngestionError) as excinfo:
-        await cognee.add(
-            "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
-        )
-    assert excinfo.value.message.startswith(
-        "web_url_fetcher configuration must be a valid dictionary"
-    )
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")


 @pytest.mark.asyncio
-async def test_add_succesfully_adds_url_when_fetcher_config_specified():
+async def test_saved_html_is_valid():
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        pytest.fail("Test case requires bs4 installed")
+
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)

-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        content = Path(file_path).read_text()

+        soup = BeautifulSoup(content, "html.parser")
+        assert soup.find() is not None, "File should contain parseable HTML"

+        has_html_elements = any(
+            [
+                soup.find("html"),
+                soup.find("head"),
+                soup.find("body"),
+                soup.find("div"),
+                soup.find("p"),
+            ]
+        )
+        assert has_html_elements, "File should contain common HTML elements"
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")


+@pytest.mark.asyncio
+async def test_add_url():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
+@pytest.mark.asyncio
+async def test_add_url_without_incremental_loading():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
     try:
         await cognee.add(
@@ -43,17 +82,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified():


 @pytest.mark.asyncio
-async def test_add_with_incremental_loading_works():
+async def test_add_url_with_incremental_loading():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)

-    extraction_rules = {
-        "title": {"selector": "title"},
-        "headings": {"selector": "h1, h2, h3", "all": True},
-        "links": {"selector": "a", "attr": "href", "all": True},
-        "paragraphs": {"selector": "p", "all": True},
-    }
-
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
@@ -64,7 +96,7 @@ async def test_add_with_incremental_loading_works():


 @pytest.mark.asyncio
-async def test_add_without_incremental_loading_works():
+async def test_add_url_with_extraction_rules():  # TODO: this'll fail due to not implemented `load()` yet
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)

@@ -78,7 +110,75 @@ async def test_add_without_incremental_loading_works():
     try:
         await cognee.add(
             "https://en.wikipedia.org/wiki/Large_language_model",
-            incremental_loading=False,
+            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
         )
     except Exception as e:
         pytest.fail(f"Failed to add url: {e}")


+@pytest.mark.asyncio
+async def test_loader_is_none_by_default():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader is None
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader == bs_loader
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
@@ -23,6 +23,7 @@ async def main():
     await cognee.add(
         "https://en.wikipedia.org/wiki/Large_language_model",
         incremental_loading=False,
+        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
     )

     await cognee.cognify()
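The example passes an extraction_rules dictionary defined elsewhere in the script (outside this hunk). Assuming it has the same shape used in the tests above, the new argument expands to roughly this:

extraction_rules = {
    "title": {"selector": "title"},
    "headings": {"selector": "h1, h2, h3", "all": True},
    "links": {"selector": "a", "attr": "href", "all": True},
    "paragraphs": {"selector": "p", "all": True},
}

preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}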