(WIP) Fix/fix web parsing (#1552)

<!-- .github/pull_request_template.md -->

## Description
<!--
Please provide a clear, human-generated description of the changes in
this PR.
DO NOT use AI-generated descriptions. We want to understand your thought
process and reasoning.
-->

This PR (using TDD):
1. Splits the web crawling implementation into distinct fetching and
parsing (loader) steps
2. Fetching is used in `save_data_item_to_storage`. Default settings are
used for fetching
3. The loader scrapes the fetched HTML and saves the extracted text as a
txt file (`html_hash.html` -> `html_hash.txt`), similar to how we
process pdf files

## Type of Change
<!-- Please check the relevant option -->
- [x] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [x] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the
issue/feature**
- [ ] My code follows the project's coding standards and style
guidelines
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been
submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
Vasilije 2025-10-22 11:57:40 +02:00 committed by GitHub
commit 738759bc5b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 859 additions and 339 deletions

View file

@ -123,6 +123,7 @@ jobs:
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
extra-dependencies: "scraping"
- name: Run Integration Tests
run: uv run pytest cognee/tests/integration/

View file

@ -1,8 +1,5 @@
from uuid import UUID
import os
from typing import Union, BinaryIO, List, Optional, Dict, Any
from pydantic import BaseModel
from urllib.parse import urlparse
from typing import Union, BinaryIO, List, Optional, Any
from cognee.modules.users.models import User
from cognee.modules.pipelines import Task, run_pipeline
from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
@ -17,16 +14,6 @@ from cognee.shared.logging_utils import get_logger
logger = get_logger()
try:
from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
from cognee.context_global_variables import (
tavily_config as tavily,
soup_crawler_config as soup_crawler,
)
except ImportError:
logger.debug(f"Unable to import {str(ImportError)}")
pass
async def add(
data: Union[BinaryIO, list[BinaryIO], str, list[str]],
@ -36,11 +23,8 @@ async def add(
vector_db_config: dict = None,
graph_db_config: dict = None,
dataset_id: Optional[UUID] = None,
preferred_loaders: List[str] = None,
preferred_loaders: dict[str, dict[str, Any]] = None,
incremental_loading: bool = True,
extraction_rules: Optional[Dict[str, Any]] = None,
tavily_config: Optional[BaseModel] = None,
soup_crawler_config: Optional[BaseModel] = None,
data_per_batch: Optional[int] = 20,
):
"""
@ -180,29 +164,6 @@ async def add(
- TAVILY_API_KEY: YOUR_TAVILY_API_KEY
"""
try:
if not soup_crawler_config and extraction_rules:
soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
if not tavily_config and os.getenv("TAVILY_API_KEY"):
tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))
soup_crawler.set(soup_crawler_config)
tavily.set(tavily_config)
http_schemes = {"http", "https"}
def _is_http_url(item: Union[str, BinaryIO]) -> bool:
return isinstance(item, str) and urlparse(item).scheme in http_schemes
if _is_http_url(data):
node_set = ["web_content"] if not node_set else node_set + ["web_content"]
elif isinstance(data, list) and any(_is_http_url(item) for item in data):
node_set = ["web_content"] if not node_set else node_set + ["web_content"]
except NameError:
logger.debug(f"Unable to import {str(ImportError)}")
pass
tasks = [
Task(resolve_data_directories, include_subdirectories=True),
Task(

View file

@ -1,5 +1,5 @@
from uuid import UUID
from typing import Union, BinaryIO, List, Optional
from typing import Union, BinaryIO, List, Optional, Any
from cognee.modules.users.models import User
from cognee.api.v1.delete import delete
@ -15,7 +15,7 @@ async def update(
node_set: Optional[List[str]] = None,
vector_db_config: dict = None,
graph_db_config: dict = None,
preferred_loaders: List[str] = None,
preferred_loaders: dict[str, dict[str, Any]] = None,
incremental_loading: bool = True,
):
"""

View file

@ -13,8 +13,6 @@ from cognee.modules.users.methods import get_user
vector_db_config = ContextVar("vector_db_config", default=None)
graph_db_config = ContextVar("graph_db_config", default=None)
session_user = ContextVar("session_user", default=None)
soup_crawler_config = ContextVar("soup_crawler_config", default=None)
tavily_config = ContextVar("tavily_config", default=None)
async def set_session_user_context_variable(user):

View file

@ -64,7 +64,9 @@ class LoaderEngine:
return True
def get_loader(
self, file_path: str, preferred_loaders: List[str] = None
self,
file_path: str,
preferred_loaders: dict[str, dict[str, Any]],
) -> Optional[LoaderInterface]:
"""
Get appropriate loader for a file.
@ -76,14 +78,21 @@ class LoaderEngine:
Returns:
LoaderInterface that can handle the file, or None if not found
"""
from pathlib import Path
file_info = filetype.guess(file_path)
path_extension = Path(file_path).suffix.lstrip(".")
# Try preferred loaders first
if preferred_loaders:
for loader_name in preferred_loaders:
if loader_name in self._loaders:
loader = self._loaders[loader_name]
# Try with path extension first (for text formats like html)
if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
return loader
# Fall back to content-detected extension
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
return loader
else:
@ -93,6 +102,10 @@ class LoaderEngine:
for loader_name in self.default_loader_priority:
if loader_name in self._loaders:
loader = self._loaders[loader_name]
# Try with path extension first (for text formats like html)
if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
return loader
# Fall back to content-detected extension
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
return loader
else:
@ -105,7 +118,7 @@ class LoaderEngine:
async def load_file(
self,
file_path: str,
preferred_loaders: Optional[List[str]] = None,
preferred_loaders: dict[str, dict[str, Any]] = None,
**kwargs,
):
"""
@ -113,7 +126,7 @@ class LoaderEngine:
Args:
file_path: Path to the file to be processed
preferred_loaders: List of preferred loader names to try first
preferred_loaders: Dict of loader names to their configurations
**kwargs: Additional loader-specific configuration
Raises:
@ -125,8 +138,16 @@ class LoaderEngine:
raise ValueError(f"No loader found for file: {file_path}")
logger.debug(f"Loading {file_path} with {loader.loader_name}")
# TODO: loading needs to be reworked to work with both file streams and file locations
return await loader.load(file_path, **kwargs)
# Extract loader-specific config from preferred_loaders
loader_config = {}
if preferred_loaders and loader.loader_name in preferred_loaders:
loader_config = preferred_loaders[loader.loader_name]
# Merge with any additional kwargs (kwargs take precedence)
merged_kwargs = {**loader_config, **kwargs}
return await loader.load(file_path, **merged_kwargs)
def get_available_loaders(self) -> List[str]:
"""

View file

@ -27,3 +27,10 @@ try:
__all__.append("AdvancedPdfLoader")
except ImportError:
pass
try:
from .beautiful_soup_loader import BeautifulSoupLoader
__all__.append("BeautifulSoupLoader")
except ImportError:
pass

View file

@ -0,0 +1,224 @@
"""BeautifulSoup-based web crawler for extracting content from web pages.
This module provides the BeautifulSoupCrawler class for fetching and extracting content
from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
supports robots.txt handling, rate limiting, and custom extraction rules.
"""
from typing import Union, Dict, Any, Optional, List
from dataclasses import dataclass
from bs4 import BeautifulSoup
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
from cognee.shared.logging_utils import get_logger
logger = get_logger(__name__)
@dataclass
class ExtractionRule:
    """Normalized extraction rule for web content.

    When both ``selector`` and ``xpath`` are set, the XPath branch takes
    precedence during extraction (see ``_extract_from_html``).

    Attributes:
        selector: CSS selector for extraction (if any).
        xpath: XPath expression for extraction (if any).
        attr: HTML attribute to extract (if any).
        all: If True, extract all matching elements; otherwise, extract first.
        join_with: String to join multiple extracted elements.
    """

    # CSS selector passed to BeautifulSoup select/select_one.
    selector: Optional[str] = None
    # XPath expression evaluated via lxml (requires lxml to be installed).
    xpath: Optional[str] = None
    # When set, extract this attribute's value instead of element text.
    attr: Optional[str] = None
    # Extract every match (joined with join_with) instead of just the first.
    all: bool = False
    # Separator used when joining multiple extracted values.
    join_with: str = " "
class BeautifulSoupLoader(LoaderInterface):
    """Loader that extracts text from already-fetched HTML files.

    Applies CSS-selector / XPath extraction rules to a local HTML file and
    stores the extracted text as a ``.txt`` file in Cognee data storage
    (``text_<content_hash>.txt``), mirroring how PDF files are processed.
    Fetching the page happens elsewhere; this class only parses content that
    is already on disk.
    """

    @property
    def supported_extensions(self) -> List[str]:
        return ["html"]

    @property
    def supported_mime_types(self) -> List[str]:
        # text/plain is included because pre-extracted content (e.g. Tavily
        # text output) may carry an .html name but be sniffed as plain text.
        return ["text/html", "text/plain"]

    @property
    def loader_name(self) -> str:
        return "beautiful_soup_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """Return True when both the extension and the mime type are supported."""
        return extension in self.supported_extensions and mime_type in self.supported_mime_types

    async def load(
        self,
        file_path: str,
        extraction_rules: Optional[Dict[str, Any]] = None,
        join_all_matches: bool = False,
        **kwargs,
    ):
        """Load an HTML file, extract content, and save to storage.

        Args:
            file_path: Path to the HTML file.
            extraction_rules: Dict of CSS selector / XPath rules for content
                extraction. Required; keys are labels, values are the rules.
            join_all_matches: If True, extract all matching elements for each rule.
            **kwargs: Additional arguments (unused).

        Returns:
            Path to the stored extracted text file.

        Raises:
            ValueError: If extraction_rules is not provided.
        """
        if extraction_rules is None:
            raise ValueError("extraction_rules required for BeautifulSoupLoader")

        logger.info(f"Processing HTML file: {file_path}")

        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config

        with open(file_path, "rb") as f:
            file_metadata = await get_file_metadata(f)
            f.seek(0)
            html = f.read()

        # Output file is keyed by the content hash, like other loaders.
        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"

        # Normalize extraction rules; dict keys are labels only.
        normalized_rules: List[ExtractionRule] = []
        for rule in extraction_rules.values():
            normalized = self._normalize_rule(rule)
            if join_all_matches:
                normalized.all = True
            normalized_rules.append(normalized)

        pieces = []
        for rule in normalized_rules:
            text = self._extract_from_html(html, rule)
            if text:
                pieces.append(text)
        full_content = " ".join(pieces).strip()

        # Fallback: if no rule matched, check whether the file is actually
        # plain text rather than HTML (no tags at all).
        if not full_content:
            soup = BeautifulSoup(html, "html.parser")
            if not soup.find():
                logger.warning(
                    f"No HTML tags found in {file_path}. Treating as plain text. "
                    "This may happen when content is pre-extracted (e.g., via Tavily with text format)."
                )
                # errors="replace" keeps ingestion alive for non-UTF-8 pages
                # instead of raising UnicodeDecodeError.
                full_content = (
                    html.decode("utf-8", errors="replace") if isinstance(html, bytes) else html
                )
                full_content = full_content.strip()

        if not full_content:
            logger.warning(f"No content extracted from HTML file: {file_path}")

        # Store the extracted content in the Cognee data root.
        storage_config = get_storage_config()
        data_root_directory = storage_config["data_root_directory"]
        storage = get_file_storage(data_root_directory)
        full_file_path = await storage.store(storage_file_name, full_content)

        logger.info(f"Extracted {len(full_content)} characters from HTML")
        return full_file_path

    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
        """Normalize an extraction rule to an ExtractionRule dataclass.

        Args:
            rule: A string (CSS selector) or dict with extraction parameters.

        Returns:
            ExtractionRule: Normalized extraction rule.

        Raises:
            ValueError: If the rule is invalid.
        """
        if isinstance(rule, str):
            return ExtractionRule(selector=rule)
        if isinstance(rule, dict):
            return ExtractionRule(
                selector=rule.get("selector"),
                xpath=rule.get("xpath"),
                attr=rule.get("attr"),
                all=bool(rule.get("all", False)),
                join_with=rule.get("join_with", " "),
            )
        raise ValueError(f"Invalid extraction rule: {rule}")

    def _extract_from_html(self, html: Union[str, bytes], rule: ExtractionRule) -> str:
        """Extract content from HTML using BeautifulSoup or lxml XPath.

        Args:
            html: The HTML content to extract from (bytes as read from disk, or str).
            rule: The extraction rule to apply.

        Returns:
            str: The extracted content ("" when nothing matches).

        Raises:
            RuntimeError: If XPath is used but lxml is not installed.
        """
        if rule.xpath:
            try:
                from lxml import html as lxml_html
            except ImportError:
                raise RuntimeError(
                    "XPath requested but lxml is not available. Install lxml or use CSS selectors."
                )
            doc = lxml_html.fromstring(html)
            nodes = doc.xpath(rule.xpath)
            texts = []
            for n in nodes:
                # Element nodes expose text_content(); attribute/string
                # results from XPath do not.
                if hasattr(n, "text_content"):
                    texts.append(n.text_content().strip())
                else:
                    texts.append(str(n).strip())
            return rule.join_with.join(t for t in texts if t)

        if not rule.selector:
            return ""

        # Parse with BeautifulSoup only for the CSS-selector path.
        soup = BeautifulSoup(html, "html.parser")
        if rule.all:
            pieces = []
            for el in soup.select(rule.selector):
                if rule.attr:
                    val = el.get(rule.attr)
                    if val:
                        pieces.append(val.strip())
                else:
                    text = el.get_text(strip=True)
                    if text:
                        pieces.append(text)
            return rule.join_with.join(pieces).strip()

        el = soup.select_one(rule.selector)
        if el is None:
            return ""
        if rule.attr:
            return (el.get(rule.attr) or "").strip()
        return el.get_text(strip=True)

View file

@ -23,3 +23,10 @@ try:
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
except ImportError:
pass
try:
from cognee.infrastructure.loaders.external import BeautifulSoupLoader
supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader
except ImportError:
pass

View file

@ -1,10 +1,12 @@
from typing import BinaryIO, Union
from typing import BinaryIO, Union, Optional
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
from .classify import classify
import hashlib
async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
async def save_data_to_file(
data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
):
storage_config = get_storage_config()
data_root_directory = storage_config["data_root_directory"]
@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
file_name = file_metadata["name"]
if file_extension is not None:
extension = file_extension.lstrip(".")
file_name_without_ext = file_name.rsplit(".", 1)[0]
file_name = f"{file_name_without_ext}.{extension}"
storage = get_file_storage(data_root_directory)
full_file_path = await storage.store(file_name, data)

View file

@ -20,6 +20,7 @@ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
from cognee.modules.pipelines.layers.check_pipeline_run_qualification import (
check_pipeline_run_qualification,
)
from typing import Any
logger = get_logger("cognee.pipeline")
@ -80,7 +81,14 @@ async def run_pipeline_per_dataset(
return
pipeline_run = run_tasks(
tasks, dataset.id, data, user, pipeline_name, context, incremental_loading, data_per_batch
tasks,
dataset.id,
data,
user,
pipeline_name,
context,
incremental_loading,
data_per_batch,
)
async for pipeline_run_info in pipeline_run:

View file

@ -1,6 +1,6 @@
import os
from urllib.parse import urlparse
from typing import List, Tuple
from typing import Any, List, Tuple
from pathlib import Path
import tempfile
@ -34,7 +34,8 @@ async def pull_from_s3(file_path, destination_file) -> None:
async def data_item_to_text_file(
data_item_path: str, preferred_loaders: List[str]
data_item_path: str,
preferred_loaders: dict[str, dict[str, Any]] = None,
) -> Tuple[str, LoaderInterface]:
if isinstance(data_item_path, str):
parsed_url = urlparse(data_item_path)
@ -74,6 +75,5 @@ async def data_item_to_text_file(
)
else:
raise IngestionError(message="Local files are not accepted.")
# data is not a supported type
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")

View file

@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
import cognee.modules.ingestion as ingestion
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data
from cognee.modules.ingestion.exceptions import IngestionError
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@ -27,7 +28,7 @@ async def ingest_data(
user: User,
node_set: Optional[List[str]] = None,
dataset_id: UUID = None,
preferred_loaders: List[str] = None,
preferred_loaders: dict[str, dict[str, Any]] = None,
):
if not user:
user = await get_default_user()
@ -44,7 +45,7 @@ async def ingest_data(
user: User,
node_set: Optional[List[str]] = None,
dataset_id: UUID = None,
preferred_loaders: List[str] = None,
preferred_loaders: dict[str, dict[str, Any]] = None,
):
new_datapoints = []
existing_data_points = []
@ -77,22 +78,27 @@ async def ingest_data(
dataset_data_map = {str(data.id): True for data in dataset_data}
for data_item in data:
# Get file path of data item or create a file it doesn't exist
# Get file path of data item or create a file if it doesn't exist
original_file_path = await save_data_item_to_storage(data_item)
# Transform file path to be OS usable
actual_file_path = get_data_file_path(original_file_path)
# Store all input data as text files in Cognee data storage
cognee_storage_file_path, loader_engine = await data_item_to_text_file(
actual_file_path, preferred_loaders
actual_file_path,
preferred_loaders,
)
if loader_engine is None:
raise IngestionError("Loader cannot be None")
# Find metadata from original file
# Standard flow: extract metadata from both original and stored files
async with open_data_file(original_file_path) as file:
classified_data = ingestion.classify(file)
# data_id is the hash of original file contents + owner id to avoid duplicate data
data_id = ingestion.identify(classified_data, user)
original_file_metadata = classified_data.get_metadata()

View file

@ -8,6 +8,8 @@ from cognee.modules.ingestion import save_data_to_file
from cognee.shared.logging_utils import get_logger
from pydantic_settings import BaseSettings, SettingsConfigDict
from cognee.tasks.web_scraper.utils import fetch_page_content
logger = get_logger()
@ -18,13 +20,6 @@ class SaveDataSettings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", extra="allow")
class HTMLContent(str):
def __new__(cls, value: str):
if not ("<" in value and ">" in value):
raise ValueError("Not valid HTML-like content")
return super().__new__(cls, value)
settings = SaveDataSettings()
@ -63,40 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
if parsed_url.scheme == "s3":
return data_item
elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
# Validate URL by sending a HEAD request
try:
from cognee.context_global_variables import tavily_config, soup_crawler_config
from cognee.tasks.web_scraper import fetch_page_content
tavily = tavily_config.get()
soup_crawler = soup_crawler_config.get()
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
if preferred_tool == "tavily" and tavily is None:
raise IngestionError(
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
)
if preferred_tool == "beautifulsoup" and soup_crawler is None:
raise IngestionError(
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
)
data = await fetch_page_content(
data_item,
preferred_tool=preferred_tool,
tavily_config=tavily,
soup_crawler_config=soup_crawler,
)
content = ""
for key, value in data.items():
content += f"{key}:\n{value}\n\n"
return await save_data_to_file(content)
except IngestionError:
raise
except Exception as e:
raise IngestionError(
message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
)
urls_to_page_contents = await fetch_page_content(data_item)
return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
# data is local file path
elif parsed_url.scheme == "file":
if settings.accept_local_file_path:

View file

@ -5,9 +5,24 @@ data in a graph database. It includes classes and functions for crawling web pag
BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
"""
from .bs4_crawler import BeautifulSoupCrawler
from .utils import fetch_page_content
from .web_scraper_task import cron_web_scraper_task, web_scraper_task
from .default_url_crawler import DefaultUrlCrawler
# Lazy import for web_scraper_task to avoid requiring apscheduler
# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ...
def __getattr__(name):
    """Lazy load web scraper task functions that require apscheduler."""
    # Defer the submodule import so that importing this package does not
    # pull in the optional apscheduler dependency.
    if name in ("cron_web_scraper_task", "web_scraper_task"):
        from . import web_scraper_task as _tasks

        return getattr(_tasks, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
__all__ = [
@ -15,4 +30,5 @@ __all__ = [
"fetch_page_content",
"cron_web_scraper_task",
"web_scraper_task",
"DefaultUrlCrawler",
]

View file

@ -10,14 +10,16 @@ class TavilyConfig(BaseModel):
timeout: Optional[int] = Field(default=10, ge=1, le=60)
class SoupCrawlerConfig(BaseModel):
class DefaultCrawlerConfig(BaseModel):
concurrency: int = 5
crawl_delay: float = 0.5
max_crawl_delay: Optional[float] = (
10.0 # Maximum crawl delay to respect from robots.txt (None = no limit)
)
timeout: float = 15.0
max_retries: int = 2
retry_delay_factor: float = 0.5
headers: Optional[Dict[str, str]] = None
extraction_rules: Dict[str, Any]
use_playwright: bool = False
playwright_js_wait: float = 0.8
robots_cache_ttl: float = 3600.0

View file

@ -1,21 +1,21 @@
"""BeautifulSoup-based web crawler for extracting content from web pages.
This module provides the BeautifulSoupCrawler class for fetching and extracting content
from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
supports robots.txt handling, rate limiting, and custom extraction rules.
"""
import asyncio
import time
from typing import Union, List, Dict, Any, Optional
from urllib.parse import urlparse
from dataclasses import dataclass, field
from functools import lru_cache
import time
from typing import Any, Union, List, Dict, Optional
from urllib.parse import urlparse
import httpx
from bs4 import BeautifulSoup
from cognee.shared.logging_utils import get_logger
logger = get_logger(__name__)
from cognee.shared.logging_utils import get_logger
from cognee.tasks.web_scraper.types import UrlsToHtmls
logger = get_logger()
try:
from protego import Protego
except ImportError:
logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
Protego = None
try:
from playwright.async_api import async_playwright
@ -25,31 +25,6 @@ except ImportError:
)
async_playwright = None
try:
from protego import Protego
except ImportError:
logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
Protego = None
@dataclass
class ExtractionRule:
"""Normalized extraction rule for web content.
Attributes:
selector: CSS selector for extraction (if any).
xpath: XPath expression for extraction (if any).
attr: HTML attribute to extract (if any).
all: If True, extract all matching elements; otherwise, extract first.
join_with: String to join multiple extracted elements.
"""
selector: Optional[str] = None
xpath: Optional[str] = None
attr: Optional[str] = None
all: bool = False
join_with: str = " "
@dataclass
class RobotsTxtCache:
@ -66,27 +41,13 @@ class RobotsTxtCache:
timestamp: float = field(default_factory=time.time)
class BeautifulSoupCrawler:
"""Crawler for fetching and extracting web content using BeautifulSoup.
Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
Attributes:
concurrency: Number of concurrent requests allowed.
crawl_delay: Minimum seconds between requests to the same domain.
timeout: Per-request timeout in seconds.
max_retries: Number of retries for failed requests.
retry_delay_factor: Multiplier for exponential backoff on retries.
headers: HTTP headers for requests (e.g., User-Agent).
robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
"""
class DefaultUrlCrawler:
def __init__(
self,
*,
concurrency: int = 5,
crawl_delay: float = 0.5,
max_crawl_delay: Optional[float] = 10.0,
timeout: float = 15.0,
max_retries: int = 2,
retry_delay_factor: float = 0.5,
@ -98,6 +59,7 @@ class BeautifulSoupCrawler:
Args:
concurrency: Number of concurrent requests allowed.
crawl_delay: Minimum seconds between requests to the same domain.
max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
timeout: Per-request timeout in seconds.
max_retries: Number of retries for failed requests.
retry_delay_factor: Multiplier for exponential backoff on retries.
@ -107,6 +69,7 @@ class BeautifulSoupCrawler:
self.concurrency = concurrency
self._sem = asyncio.Semaphore(concurrency)
self.crawl_delay = crawl_delay
self.max_crawl_delay = max_crawl_delay
self.timeout = timeout
self.max_retries = max_retries
self.retry_delay_factor = retry_delay_factor
@ -183,7 +146,11 @@ class BeautifulSoupCrawler:
elapsed = time.time() - last
wait_for = delay - elapsed
if wait_for > 0:
logger.info(
f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
)
await asyncio.sleep(wait_for)
logger.info(f"Rate limit wait completed for {url}")
self._last_request_time_per_domain[domain] = time.time()
async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
@ -236,7 +203,16 @@ class BeautifulSoupCrawler:
crawl_delay = self.crawl_delay
if protego:
delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
crawl_delay = delay if delay else self.crawl_delay
if delay:
# Apply max_crawl_delay cap if configured
if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
logger.warning(
f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
f"capping to max_crawl_delay={self.max_crawl_delay}s"
)
crawl_delay = self.max_crawl_delay
else:
crawl_delay = delay
cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
self._robots_cache[domain_root] = cache_entry
@ -307,12 +283,16 @@ class BeautifulSoupCrawler:
attempt = 0
crawl_delay = await self._get_crawl_delay(url)
logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
while True:
try:
await self._respect_rate_limit(url, crawl_delay)
resp = await self._client.get(url)
resp.raise_for_status()
logger.info(
f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
)
return resp.text
except Exception as exc:
attempt += 1
@ -347,22 +327,35 @@ class BeautifulSoupCrawler:
raise RuntimeError(
"Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
)
timeout_val = timeout or self.timeout
logger.info(
f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
)
attempt = 0
while True:
try:
async with async_playwright() as p:
logger.info(f"Launching headless Chromium browser for {url}")
browser = await p.chromium.launch(headless=True)
try:
context = await browser.new_context()
page = await context.new_page()
logger.info(f"Navigating to {url} and waiting for network idle")
await page.goto(
url,
wait_until="networkidle",
timeout=int((timeout or self.timeout) * 1000),
timeout=int(timeout_val * 1000),
)
if js_wait:
logger.info(f"Waiting {js_wait}s for JavaScript to execute")
await asyncio.sleep(js_wait)
return await page.content()
content = await page.content()
logger.info(
f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
)
return content
finally:
await browser.close()
except Exception as exc:
@ -376,96 +369,13 @@ class BeautifulSoupCrawler:
)
await asyncio.sleep(backoff)
def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
"""Normalize an extraction rule to an ExtractionRule dataclass.
Args:
rule: A string (CSS selector) or dict with extraction parameters.
Returns:
ExtractionRule: Normalized extraction rule.
Raises:
ValueError: If the rule is invalid.
"""
if isinstance(rule, str):
return ExtractionRule(selector=rule)
if isinstance(rule, dict):
return ExtractionRule(
selector=rule.get("selector"),
xpath=rule.get("xpath"),
attr=rule.get("attr"),
all=bool(rule.get("all", False)),
join_with=rule.get("join_with", " "),
)
raise ValueError(f"Invalid extraction rule: {rule}")
def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str:
"""Extract content from HTML using BeautifulSoup or lxml XPath.
Args:
html: The HTML content to extract from.
rule: The extraction rule to apply.
Returns:
str: The extracted content.
Raises:
RuntimeError: If XPath is used but lxml is not installed.
"""
soup = BeautifulSoup(html, "html.parser")
if rule.xpath:
try:
from lxml import html as lxml_html
except ImportError:
raise RuntimeError(
"XPath requested but lxml is not available. Install lxml or use CSS selectors."
)
doc = lxml_html.fromstring(html)
nodes = doc.xpath(rule.xpath)
texts = []
for n in nodes:
if hasattr(n, "text_content"):
texts.append(n.text_content().strip())
else:
texts.append(str(n).strip())
return rule.join_with.join(t for t in texts if t)
if not rule.selector:
return ""
if rule.all:
nodes = soup.select(rule.selector)
pieces = []
for el in nodes:
if rule.attr:
val = el.get(rule.attr)
if val:
pieces.append(val.strip())
else:
text = el.get_text(strip=True)
if text:
pieces.append(text)
return rule.join_with.join(pieces).strip()
else:
el = soup.select_one(rule.selector)
if el is None:
return ""
if rule.attr:
val = el.get(rule.attr)
return (val or "").strip()
return el.get_text(strip=True)
async def fetch_urls(
    self,
    urls: Union[str, List[str]],
    *,
    use_playwright: bool = False,
    playwright_js_wait: float = 0.8,
) -> UrlsToHtmls:
    """Fetch raw HTML from one or more URLs concurrently.

    Fetching is separated from parsing: this method only retrieves HTML;
    extraction happens later in the loader step.

    Args:
        urls: A single URL or a list of URLs to fetch.
        use_playwright: Render pages with Playwright (for JS-heavy sites)
            instead of issuing a plain httpx request.
        playwright_js_wait: Seconds to let JavaScript settle when rendering
            with Playwright.

    Returns:
        UrlsToHtmls: Mapping of each URL to its fetched HTML. URLs that are
            disallowed by robots.txt or that fail map to an empty string.

    Raises:
        ValueError: If ``urls`` is neither a string nor a list.
    """
    if isinstance(urls, str):
        urls = [urls]
    elif not isinstance(urls, list):
        raise ValueError(f"Invalid urls type: {type(urls)}")

    async def _task(url: str):
        # Crawler-wide semaphore bounds concurrent fetches.
        async with self._sem:
            try:
                logger.info(f"Processing URL: {url}")
                # Honor robots.txt before fetching anything.
                allowed = await self._is_url_allowed(url)
                if not allowed:
                    logger.warning(f"URL disallowed by robots.txt: {url}")
                    return url, ""
                logger.info(f"Robots.txt check passed for {url}")
                if use_playwright:
                    logger.info(
                        f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
                    )
                    html = await self._render_with_playwright(
                        url, js_wait=playwright_js_wait, timeout=self.timeout
                    )
                else:
                    logger.info(f"Fetching {url} with httpx")
                    html = await self._fetch_httpx(url)
                logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
                return url, html
            except Exception as e:
                # One failing URL must not abort the whole batch.
                logger.error(f"Error processing {url}: {e}")
                return url, ""

    logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
    tasks = [asyncio.create_task(_task(u)) for u in urls]
    results = {}
    completed = 0
    total = len(tasks)
    for coro in asyncio.as_completed(tasks):
        url, html = await coro
        results[url] = html
        completed += 1
        logger.info(f"Progress: {completed}/{total} URLs processed")
    logger.info(f"Completed fetching all {len(results)} URL(s)")
    return results

View file

@ -0,0 +1,4 @@
from typing import TypeAlias

# Mapping of fetched URL -> raw HTML/content returned for that URL.
UrlsToHtmls: TypeAlias = dict[str, str]

View file

@ -4,21 +4,17 @@ This module provides functions to fetch and extract content from web pages, supp
both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
"""
from typing import Dict, List, Union, Optional, Literal
import os
from typing import List, Union
from cognee.shared.logging_utils import get_logger
from .bs4_crawler import BeautifulSoupCrawler
from .config import TavilyConfig, SoupCrawlerConfig
from cognee.tasks.web_scraper.types import UrlsToHtmls
from .default_url_crawler import DefaultUrlCrawler
from .config import DefaultCrawlerConfig, TavilyConfig
logger = get_logger(__name__)
async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
    """Fetch content from one or more URLs.

    Uses the Tavily API when ``TAVILY_API_KEY`` is present in the
    environment; otherwise falls back to the default BeautifulSoup-based
    crawler configured entirely from environment-driven defaults.

    Args:
        urls: A single URL or a list of URLs to fetch.

    Returns:
        UrlsToHtmls: Mapping of each URL to its fetched content.

    Raises:
        ImportError: If the selected backend's dependency is not installed.
        Exception: Propagates any error raised while crawling.
    """
    url_list = [urls] if isinstance(urls, str) else urls

    if os.getenv("TAVILY_API_KEY"):
        logger.info("Using Tavily API for url fetching")
        return await fetch_with_tavily(urls)

    logger.info("Using default crawler for content extraction")
    default_crawler_config = (
        DefaultCrawlerConfig()
    )  # We've decided to use defaults, and configure through env vars as needed
    logger.info(
        f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s"
    )
    crawler = DefaultUrlCrawler(
        concurrency=default_crawler_config.concurrency,
        crawl_delay=default_crawler_config.crawl_delay,
        max_crawl_delay=default_crawler_config.max_crawl_delay,
        timeout=default_crawler_config.timeout,
        max_retries=default_crawler_config.max_retries,
        retry_delay_factor=default_crawler_config.retry_delay_factor,
        headers=default_crawler_config.headers,
        robots_cache_ttl=default_crawler_config.robots_cache_ttl,
    )
    try:
        logger.info(
            f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})"
        )
        results = await crawler.fetch_urls(
            urls,
            use_playwright=default_crawler_config.use_playwright,
            playwright_js_wait=default_crawler_config.playwright_js_wait,
        )
        logger.info(f"Successfully fetched content from {len(results)} URL(s)")
        return results
    except Exception as e:
        logger.error(f"Error fetching page content: {str(e)}")
        raise
    finally:
        # Always release the crawler's connections, even on failure.
        logger.info("Closing BeautifulSoup crawler")
        await crawler.close()
async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
    """Fetch content from URLs using the Tavily API.

    Args:
        urls: A single URL or a list of URLs to extract content from.

    Returns:
        UrlsToHtmls: Mapping of each successfully fetched URL to its raw
            content. Failed URLs are logged and omitted.

    Raises:
        ImportError: If tavily-python is not installed.
    """
    try:
        from tavily import AsyncTavilyClient
    except ImportError:
        logger.error(
            "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
        )
        raise
    # Tavily is configured purely through environment variables.
    tavily_config = TavilyConfig()
    url_list = [urls] if isinstance(urls, str) else urls
    extract_depth = tavily_config.extract_depth
    timeout = tavily_config.timeout
    logger.info(
        f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s"
    )
    client = AsyncTavilyClient(
        api_key=tavily_config.api_key,
        proxies=tavily_config.proxies,
    )
    logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)")
    results = await client.extract(
        urls,
        format="text",
        extract_depth=extract_depth,
        timeout=timeout,
    )
    failed_count = len(results.get("failed_results", []))
    if failed_count > 0:
        logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)")
        for failed_result in results.get("failed_results", []):
            logger.warning(f"Failed to fetch {failed_result}")
    return_results = {}
    for result in results.get("results", []):
        return_results[result["url"]] = result["raw_content"]
    logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily")
    return return_results

View file

@ -19,7 +19,7 @@ from cognee.tasks.storage.index_graph_edges import index_graph_edges
from cognee.modules.engine.operations.setup import setup
from .models import WebPage, WebSite, ScrapingJob
from .config import SoupCrawlerConfig, TavilyConfig
from .config import DefaultCrawlerConfig, TavilyConfig
from .utils import fetch_page_content
try:
@ -47,7 +47,7 @@ async def cron_web_scraper_task(
schedule: str = None,
extraction_rules: dict = None,
tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
soup_crawler_config: SoupCrawlerConfig = None,
soup_crawler_config: DefaultCrawlerConfig = None,
tavily_config: TavilyConfig = None,
job_name: str = "scraping",
):
@ -121,7 +121,7 @@ async def web_scraper_task(
schedule: str = None,
extraction_rules: dict = None,
tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
soup_crawler_config: SoupCrawlerConfig = None,
soup_crawler_config: DefaultCrawlerConfig = None,
tavily_config: TavilyConfig = None,
job_name: str = None,
):
@ -341,7 +341,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
soup_crawler_config: Configuration for BeautifulSoup crawler.
Returns:
Tuple[SoupCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
tavily_config, and preferred_tool ("tavily" or "beautifulsoup").
Raises:
@ -350,7 +350,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
preferred_tool = "beautifulsoup"
if extraction_rules and not soup_crawler_config:
soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules)
if tavily_api_key:
if not tavily_config:

View file

@ -0,0 +1,13 @@
import pytest
from cognee.tasks.web_scraper import DefaultUrlCrawler
@pytest.mark.asyncio
async def test_fetch():
    """Smoke test: DefaultUrlCrawler fetches a real page into a {url: html} dict."""
    target = "https://en.wikipedia.org/wiki/Large_language_model"
    crawler = DefaultUrlCrawler()
    results = await crawler.fetch_urls(target)
    assert isinstance(results, dict)
    assert len(results) == 1
    assert isinstance(results[target], str)

View file

@ -0,0 +1,19 @@
import os
import pytest
from cognee.tasks.web_scraper.utils import fetch_with_tavily
# Disable this test on GitHub CI until a TAVILY_API_KEY secret is available.
skip_in_ci = pytest.mark.skipif(
    os.getenv("GITHUB_ACTIONS") == "true",
    reason="Skipping in Github for now - before we get TAVILY_API_KEY",
)


@skip_in_ci
@pytest.mark.asyncio
async def test_fetch():
    """Smoke test: fetch_with_tavily returns a {url: content} mapping."""
    target = "https://en.wikipedia.org/wiki/Large_language_model"
    results = await fetch_with_tavily(target)
    assert isinstance(results, dict)
    assert len(results) == 1
    assert isinstance(results[target], str)

View file

@ -0,0 +1,310 @@
import pytest
import cognee
from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
from cognee.tasks.ingestion import save_data_item_to_storage
from pathlib import Path
@pytest.mark.asyncio
async def test_url_saves_as_html_file():
    """A URL passed to save_data_item_to_storage is persisted as a non-empty .html file."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    # No try/except wrapper: letting AssertionError/Exception propagate keeps
    # pytest's assertion introspection and the real traceback.
    original_file_path = await save_data_item_to_storage(
        "https://en.wikipedia.org/wiki/Large_language_model"
    )
    file_path = get_data_file_path(original_file_path)
    assert file_path.endswith(".html")
    file = Path(file_path)
    assert file.exists()
    assert file.stat().st_size > 0
@pytest.mark.asyncio
async def test_saved_html_is_valid():
    """The stored .html file contains parseable HTML with common structural elements."""
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        pytest.fail("Test case requires bs4 installed")
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    original_file_path = await save_data_item_to_storage(
        "https://en.wikipedia.org/wiki/Large_language_model"
    )
    file_path = get_data_file_path(original_file_path)
    content = Path(file_path).read_text()
    soup = BeautifulSoup(content, "html.parser")
    assert soup.find() is not None, "File should contain parseable HTML"
    # Any one of these tags is evidence real page HTML (not junk) was saved.
    has_html_elements = any(
        [
            soup.find("html"),
            soup.find("head"),
            soup.find("body"),
            soup.find("div"),
            soup.find("p"),
        ]
    )
    assert has_html_elements, "File should contain common HTML elements"
@pytest.mark.asyncio
async def test_add_url():
    """cognee.add accepts a plain URL without raising."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    url = "https://en.wikipedia.org/wiki/Large_language_model"
    await cognee.add(url)
@pytest.mark.asyncio
async def test_add_url_without_incremental_loading():
    """cognee.add ingests a URL with incremental loading disabled."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    # Let failures propagate: the try/except + pytest.fail wrapper hid the
    # real traceback behind a generic message.
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        incremental_loading=False,
    )
@pytest.mark.asyncio
async def test_add_url_with_incremental_loading():
    """cognee.add ingests a URL with incremental loading enabled."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        incremental_loading=True,
    )
@pytest.mark.asyncio
async def test_add_url_with_extraction_rules():  # TODO: this'll fail due to not implemented `load()` yet
    """cognee.add accepts extraction rules via the beautiful_soup_loader preferred loader."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
    )
@pytest.mark.asyncio
async def test_loader_is_none_by_default():
    """Without registering BeautifulSoupLoader, the engine finds no loader for .html files."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    original_file_path = await save_data_item_to_storage(
        "https://en.wikipedia.org/wiki/Large_language_model"
    )
    file_path = get_data_file_path(original_file_path)
    assert file_path.endswith(".html")
    file = Path(file_path)
    assert file.exists()
    assert file.stat().st_size > 0
    # Fresh engine without the BS4 loader registered.
    loader_engine = LoaderEngine()
    preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
    loader = loader_engine.get_loader(
        file_path,
        preferred_loaders=preferred_loaders,
    )
    assert loader is None
@pytest.mark.asyncio
async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
    """Once registered, BeautifulSoupLoader is chosen when named in preferred_loaders."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    original_file_path = await save_data_item_to_storage(
        "https://en.wikipedia.org/wiki/Large_language_model"
    )
    file_path = get_data_file_path(original_file_path)
    assert file_path.endswith(".html")
    file = Path(file_path)
    assert file.exists()
    assert file.stat().st_size > 0
    loader_engine = LoaderEngine()
    bs_loader = BeautifulSoupLoader()
    loader_engine.register_loader(bs_loader)
    preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
    loader = loader_engine.get_loader(
        file_path,
        preferred_loaders=preferred_loaders,
    )
    assert loader == bs_loader
@pytest.mark.asyncio
async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
    """load_file raises ValueError without extraction_rules and succeeds with them."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    original_file_path = await save_data_item_to_storage(
        "https://en.wikipedia.org/wiki/Large_language_model"
    )
    file_path = get_data_file_path(original_file_path)
    assert file_path.endswith(".html")
    file = Path(file_path)
    assert file.exists()
    assert file.stat().st_size > 0
    loader_engine = LoaderEngine()
    bs_loader = BeautifulSoupLoader()
    loader_engine.register_loader(bs_loader)
    # Missing extraction_rules must be rejected.
    preferred_loaders = {"beautiful_soup_loader": {}}
    with pytest.raises(ValueError):
        await loader_engine.load_file(
            file_path,
            preferred_loaders=preferred_loaders,
        )
    # The same call succeeds once extraction_rules are supplied.
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
    await loader_engine.load_file(
        file_path,
        preferred_loaders=preferred_loaders,
    )
@pytest.mark.asyncio
async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
    """load_file completes when extraction_rules are provided."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    original_file_path = await save_data_item_to_storage(
        "https://en.wikipedia.org/wiki/Large_language_model"
    )
    file_path = get_data_file_path(original_file_path)
    assert file_path.endswith(".html")
    file = Path(file_path)
    assert file.exists()
    assert file.stat().st_size > 0
    loader_engine = LoaderEngine()
    bs_loader = BeautifulSoupLoader()
    loader_engine.register_loader(bs_loader)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
    await loader_engine.load_file(
        file_path,
        preferred_loaders=preferred_loaders,
    )
@pytest.mark.asyncio
async def test_beautiful_soup_loads_file_successfully():
    """Loading a saved .html produces a .txt sharing the same base name (hash)."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    original_file_path = await save_data_item_to_storage(
        "https://en.wikipedia.org/wiki/Large_language_model"
    )
    file_path = get_data_file_path(original_file_path)
    assert file_path.endswith(".html")
    original_file = Path(file_path)
    assert original_file.exists()
    assert original_file.stat().st_size > 0
    loader_engine = LoaderEngine()
    bs_loader = BeautifulSoupLoader()
    loader_engine.register_loader(bs_loader)
    preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
    loader = loader_engine.get_loader(
        file_path,
        preferred_loaders=preferred_loaders,
    )
    assert loader == bs_loader
    cognee_loaded_txt_path = await loader_engine.load_file(
        file_path=file_path, preferred_loaders=preferred_loaders
    )
    cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
    assert cognee_loaded_txt_path.endswith(".txt")
    extracted_file = Path(cognee_loaded_txt_path)
    assert extracted_file.exists()
    assert extracted_file.stat().st_size > 0
    # Mirrors pdf processing: html_hash.html -> html_hash.txt.
    original_basename = original_file.stem
    extracted_basename = extracted_file.stem
    assert original_basename == extracted_basename, (
        f"Expected same base name: {original_basename} vs {extracted_basename}"
    )

View file

@ -1,6 +1,6 @@
import asyncio
import cognee
from cognee.tasks.web_scraper.config import SoupCrawlerConfig
from cognee.tasks.web_scraper.config import DefaultCrawlerConfig
from cognee.tasks.web_scraper import cron_web_scraper_task
@ -14,7 +14,7 @@ async def test_web_scraping_using_bs4():
"authors": {"selector": ".quote small", "all": True},
}
soup_config = SoupCrawlerConfig(
soup_config = DefaultCrawlerConfig(
concurrency=5,
crawl_delay=0.5,
timeout=15.0,
@ -47,7 +47,7 @@ async def test_web_scraping_using_bs4_and_incremental_loading():
url = "https://books.toscrape.com/"
rules = {"titles": "article.product_pod h3 a", "prices": "article.product_pod p.price_color"}
soup_config = SoupCrawlerConfig(
soup_config = DefaultCrawlerConfig(
concurrency=1,
crawl_delay=0.1,
timeout=10.0,

View file

@ -0,0 +1,37 @@
import asyncio
import cognee
async def main():
    """Example: ingest a Wikipedia page with the BeautifulSoup loader and visualize it."""
    await cognee.prune.prune_data()
    print("Data pruned.")
    await cognee.prune.prune_system(metadata=True)

    # Rules handed to the beautiful_soup_loader during ingestion.
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        incremental_loading=False,
        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
    )
    await cognee.cognify()
    print("Knowledge graph created.")
    await cognee.visualize_graph()
    print("Data visualized")


if __name__ == "__main__":
    asyncio.run(main())