Separate BeautifulSoup crawling from fetching

commit 9d9969676f (parent a7ff188018)

11 changed files with 489 additions and 573 deletions

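The commit splits the old all-in-one BeautifulSoupCrawler into two pieces: a new DefaultUrlCrawler that owns fetching (httpx, optional Playwright rendering, robots.txt handling, per-domain rate limiting, retries), and a slimmed-down BeautifulSoupCrawler that only extracts content from HTML it is handed. A minimal usage sketch of the split, not part of the commit; the URL is illustrative, mirroring the repository's own tests:

    import asyncio

    from cognee.tasks.web_scraper import DefaultUrlCrawler


    async def main():
        # Fetching concerns (robots.txt, rate limiting, retries, optional
        # Playwright rendering) now live in DefaultUrlCrawler.
        async with DefaultUrlCrawler(concurrency=2, crawl_delay=0.5) as crawler:
            pages = await crawler.fetch_urls("https://quotes.toscrape.com/")

        # BeautifulSoupCrawler keeps only extraction (its newly public
        # extract() method), applied to HTML fetched elsewhere.
        for url, html in pages.items():
            print(url, len(html), "bytes of HTML")


    asyncio.run(main())
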
cognee/context_global_variables.py

@@ -7,18 +7,12 @@ from cognee.base_config import get_base_config
 from cognee.infrastructure.databases.utils import get_or_create_dataset_database
 from cognee.infrastructure.files.storage.config import file_storage_config
 from cognee.modules.users.methods import get_user
-from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig

 # Note: ContextVar allows us to use different graph db configurations in Cognee
 # for different async tasks, threads and processes
 vector_db_config = ContextVar("vector_db_config", default=None)
 graph_db_config = ContextVar("graph_db_config", default=None)
 session_user = ContextVar("session_user", default=None)
-soup_crawler_config: ContextVar[SoupCrawlerConfig | None] = ContextVar(
-    "soup_crawler_config", default=None
-)
-tavily_config: ContextVar[TavilyConfig | None] = ContextVar("tavily_config", default=None)


 async def set_session_user_context_variable(user):
     session_user.set(user)
cognee/tasks/ingestion/data_fetchers/__init__.py (deleted)

@@ -1,8 +0,0 @@
-__all__ = []
-
-try:
-    from .web_url_fetcher import WebUrlFetcher
-
-    __all__.append("WebUrlFetcher")
-except ImportError:
-    pass
cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py (deleted)

@@ -1,15 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Any
-
-
-class DataFetcherInterface(ABC):
-    @abstractmethod
-    def fetcher_name(self) -> str:
-        pass
-
-    @abstractmethod
-    async def fetch(self, data_item_path: str) -> str:
-        """
-        args: data_item_path - path to the data item
-        """
-        pass
cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py (deleted)

@@ -1,55 +0,0 @@
-import os
-from cognee.modules.ingestion import save_data_to_file
-from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface
-from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
-from cognee.shared.logging_utils import get_logger
-
-logger = get_logger()
-
-
-class WebUrlFetcher(DataFetcherInterface):
-    def __init__(self): ...
-
-    def fetcher_name(self):
-        return "web_url_fetcher"
-
-    async def fetch(self, data_item_path: str):
-        from cognee.context_global_variables import tavily_config, soup_crawler_config
-        from cognee.tasks.web_scraper import fetch_page_content
-
-        if os.getenv("TAVILY_API_KEY"):
-            _tavily_config = TavilyConfig()
-            _soup_config = None
-            preferred_tool = "tavily"
-        else:
-            _tavily_config = None
-            _soup_config = SoupCrawlerConfig()
-            preferred_tool = "beautifulsoup"
-
-        tavily_config.set(_tavily_config)
-        soup_crawler_config.set(_soup_config)
-
-        logger.info(f"Starting web URL crawling for: {data_item_path}")
-        logger.info(f"Using scraping tool: {preferred_tool}")
-
-        data = await fetch_page_content(
-            data_item_path,
-            preferred_tool=preferred_tool,
-        )
-
-        logger.info(f"Successfully fetched content from URL {data_item_path}")
-
-        # fetch_page_content returns a dict like {url: content}
-        # Extract the content string before saving
-        if isinstance(data, dict):
-            # Concatenate all URL contents (usually just one URL)
-            content = ""
-            for url, text in data.items():
-                content += f"{url}:\n{text}\n\n"
-            logger.info(
-                f"Extracted content from {len(data)} URL(s), total size: {len(content)} characters"
-            )
-        else:
-            content = data
-
-        return await save_data_to_file(content)
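With the fetcher abstraction deleted, the Tavily-vs-default decision collapses into fetch_page_content itself, keyed on the TAVILY_API_KEY environment variable (see the rewritten utils.py below). A hedged sketch, not part of the commit, of the equivalent call path afterwards; the URL is illustrative:

    import asyncio

    from cognee.tasks.web_scraper import fetch_page_content


    async def ingest(url: str) -> str:
        # fetch_page_content now inspects TAVILY_API_KEY itself: no ContextVars,
        # no preferred_tool argument, no per-call crawler config.
        pages = await fetch_page_content(url)  # {url: html}

        # The old WebUrlFetcher concatenated contents before saving; a caller
        # that still wants one string can perform the same join.
        return "".join(f"{u}:\n{text}\n\n" for u, text in pages.items())


    print(asyncio.run(ingest("https://example.com")))
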
cognee/tasks/web_scraper/__init__.py

@@ -8,6 +8,7 @@ BeautifulSoup or Tavily, defining data models, and handling scraping configurati
 from .bs4_crawler import BeautifulSoupCrawler
 from .utils import fetch_page_content
 from .web_scraper_task import cron_web_scraper_task, web_scraper_task
+from .default_url_crawler import DefaultUrlCrawler


 __all__ = [
@@ -15,4 +16,5 @@ __all__ = [
     "fetch_page_content",
     "cron_web_scraper_task",
     "web_scraper_task",
+    "DefaultUrlCrawler",
 ]
cognee/tasks/web_scraper/bs4_crawler.py

@@ -5,32 +5,13 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages.
 supports robots.txt handling, rate limiting, and custom extraction rules.
 """

-import asyncio
-import time
 from typing import Union, List, Dict, Any, Optional
-from urllib.parse import urlparse
-from dataclasses import dataclass, field
-from functools import lru_cache
-import httpx
+from dataclasses import dataclass
 from bs4 import BeautifulSoup
 from cognee.shared.logging_utils import get_logger

 logger = get_logger(__name__)

-try:
-    from playwright.async_api import async_playwright
-except ImportError:
-    logger.warning(
-        "Failed to import playwright, make sure to install using pip install playwright>=1.9.0"
-    )
-    async_playwright = None
-
-try:
-    from protego import Protego
-except ImportError:
-    logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
-    Protego = None
-

 @dataclass
 class ExtractionRule:
@@ -51,21 +32,6 @@ class ExtractionRule:
     join_with: str = " "


-@dataclass
-class RobotsTxtCache:
-    """Cache for robots.txt data.
-
-    Attributes:
-        protego: Parsed robots.txt object (Protego instance).
-        crawl_delay: Delay between requests (in seconds).
-        timestamp: Time when the cache entry was created.
-    """
-
-    protego: Any
-    crawl_delay: float
-    timestamp: float = field(default_factory=time.time)
-
-
 # TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler
 class BeautifulSoupCrawler:
     """Crawler for fetching and extracting web content using BeautifulSoup.
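RobotsTxtCache moves verbatim into the new default_url_crawler.py below. Its timestamp field plus the crawler's robots_cache_ttl give simple time-based expiry. A standalone illustration of the expiry rule the crawler applies, not part of the commit:

    import time
    from dataclasses import dataclass, field
    from typing import Any


    @dataclass
    class RobotsTxtCache:
        protego: Any        # parsed robots.txt, or None if none was served
        crawl_delay: float  # effective delay, possibly capped by max_crawl_delay
        timestamp: float = field(default_factory=time.time)


    entry = RobotsTxtCache(protego=None, crawl_delay=0.5)
    robots_cache_ttl = 3600.0  # the crawler's default TTL

    # _get_robots_cache returns an entry only while it is younger than the TTL.
    is_fresh = (time.time() - entry.timestamp) < robots_cache_ttl
    print(is_fresh)  # True for the first hour after creation
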
@@ -84,333 +50,6 @@ class BeautifulSoupCrawler:
         robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
     """

-    def __init__(
-        self,
-        *,
-        concurrency: int = 5,
-        crawl_delay: float = 0.5,
-        max_crawl_delay: Optional[float] = 10.0,
-        timeout: float = 15.0,
-        max_retries: int = 2,
-        retry_delay_factor: float = 0.5,
-        headers: Optional[Dict[str, str]] = None,
-        robots_cache_ttl: float = 3600.0,
-    ):
-        """Initialize the BeautifulSoupCrawler.
-
-        Args:
-            concurrency: Number of concurrent requests allowed.
-            crawl_delay: Minimum seconds between requests to the same domain.
-            max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
-            timeout: Per-request timeout in seconds.
-            max_retries: Number of retries for failed requests.
-            retry_delay_factor: Multiplier for exponential backoff on retries.
-            headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0).
-            robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
-        """
-        self.concurrency = concurrency
-        self._sem = asyncio.Semaphore(concurrency)
-        self.crawl_delay = crawl_delay
-        self.max_crawl_delay = max_crawl_delay
-        self.timeout = timeout
-        self.max_retries = max_retries
-        self.retry_delay_factor = retry_delay_factor
-        self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
-        self.robots_cache_ttl = robots_cache_ttl
-        self._last_request_time_per_domain: Dict[str, float] = {}
-        self._robots_cache: Dict[str, RobotsTxtCache] = {}
-        self._client: Optional[httpx.AsyncClient] = None
-        self._robots_lock = asyncio.Lock()
-
-    async def _ensure_client(self):
-        """Initialize the HTTP client if not already created."""
-        if self._client is None:
-            self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers)
-
-    async def close(self):
-        """Close the HTTP client."""
-        if self._client:
-            await self._client.aclose()
-            self._client = None
-
-    async def __aenter__(self):
-        """Enter the context manager, initializing the HTTP client."""
-        await self._ensure_client()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Exit the context manager, closing the HTTP client."""
-        await self.close()
-
-    @lru_cache(maxsize=1024)
-    def _domain_from_url(self, url: str) -> str:
-        """Extract the domain (netloc) from a URL.
-
-        Args:
-            url: The URL to parse.
-
-        Returns:
-            str: The domain (netloc) of the URL.
-        """
-        try:
-            return urlparse(url).netloc
-        except Exception:
-            return url
-
-    @lru_cache(maxsize=1024)
-    def _get_domain_root(self, url: str) -> str:
-        """Get the root URL (scheme and netloc) from a URL.
-
-        Args:
-            url: The URL to parse.
-
-        Returns:
-            str: The root URL (e.g., "https://example.com").
-        """
-        parsed = urlparse(url)
-        return f"{parsed.scheme}://{parsed.netloc}"
-
-    async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
-        """Enforce rate limiting for requests to the same domain.
-
-        Args:
-            url: The URL to check.
-            crawl_delay: Custom crawl delay in seconds (if any).
-        """
-        domain = self._domain_from_url(url)
-        last = self._last_request_time_per_domain.get(domain)
-        delay = crawl_delay if crawl_delay is not None else self.crawl_delay
-
-        if last is None:
-            self._last_request_time_per_domain[domain] = time.time()
-            return
-
-        elapsed = time.time() - last
-        wait_for = delay - elapsed
-        if wait_for > 0:
-            logger.info(
-                f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
-            )
-            await asyncio.sleep(wait_for)
-            logger.info(f"Rate limit wait completed for {url}")
-        self._last_request_time_per_domain[domain] = time.time()
-
-    async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
-        """Get cached robots.txt data if valid.
-
-        Args:
-            domain_root: The root URL (e.g., "https://example.com").
-
-        Returns:
-            Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found.
-        """
-        if Protego is None:
-            return None
-
-        cached = self._robots_cache.get(domain_root)
-        if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl:
-            return cached
-        return None
-
-    async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
-        """Fetch and cache robots.txt data.
-
-        Args:
-            domain_root: The root URL (e.g., "https://example.com").
-
-        Returns:
-            RobotsTxtCache: Cached robots.txt data with crawl delay.
-
-        Raises:
-            Exception: If fetching robots.txt fails.
-        """
-        async with self._robots_lock:
-            cached = await self._get_robots_cache(domain_root)
-            if cached:
-                return cached
-
-            robots_url = f"{domain_root}/robots.txt"
-            try:
-                await self._ensure_client()
-                await self._respect_rate_limit(robots_url, self.crawl_delay)
-                resp = await self._client.get(robots_url, timeout=5.0)
-                content = resp.text if resp.status_code == 200 else ""
-            except Exception as e:
-                logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}")
-                content = ""
-
-            protego = Protego.parse(content) if content.strip() else None
-            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
-
-            crawl_delay = self.crawl_delay
-            if protego:
-                delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
-                if delay:
-                    # Apply max_crawl_delay cap if configured
-                    if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
-                        logger.warning(
-                            f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
-                            f"capping to max_crawl_delay={self.max_crawl_delay}s"
-                        )
-                        crawl_delay = self.max_crawl_delay
-                    else:
-                        crawl_delay = delay
-
-            cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
-            self._robots_cache[domain_root] = cache_entry
-            return cache_entry
-
-    async def _is_url_allowed(self, url: str) -> bool:
-        """Check if a URL is allowed by robots.txt.
-
-        Args:
-            url: The URL to check.
-
-        Returns:
-            bool: True if the URL is allowed, False otherwise.
-        """
-        if Protego is None:
-            return True
-
-        try:
-            domain_root = self._get_domain_root(url)
-            cache = await self._get_robots_cache(domain_root)
-            if cache is None:
-                cache = await self._fetch_and_cache_robots(domain_root)
-
-            if cache.protego is None:
-                return True
-
-            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
-            return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url)
-        except Exception as e:
-            logger.debug(f"Error checking robots.txt for {url}: {e}")
-            return True
-
-    async def _get_crawl_delay(self, url: str) -> float:
-        """Get the crawl delay for a URL from robots.txt.
-
-        Args:
-            url: The URL to check.
-
-        Returns:
-            float: Crawl delay in seconds.
-        """
-        if Protego is None:
-            return self.crawl_delay
-
-        try:
-            domain_root = self._get_domain_root(url)
-            cache = await self._get_robots_cache(domain_root)
-            if cache is None:
-                cache = await self._fetch_and_cache_robots(domain_root)
-            return cache.crawl_delay
-        except Exception:
-            return self.crawl_delay
-
-    async def _fetch_httpx(self, url: str) -> str:
-        """Fetch a URL using HTTPX with retries.
-
-        Args:
-            url: The URL to fetch.
-
-        Returns:
-            str: The HTML content of the page.
-
-        Raises:
-            Exception: If all retry attempts fail.
-        """
-        await self._ensure_client()
-        assert self._client is not None, "HTTP client not initialized"
-
-        attempt = 0
-        crawl_delay = await self._get_crawl_delay(url)
-        logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
-
-        while True:
-            try:
-                await self._respect_rate_limit(url, crawl_delay)
-                resp = await self._client.get(url)
-                resp.raise_for_status()
-                logger.info(
-                    f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
-                )
-                return resp.text
-            except Exception as exc:
-                attempt += 1
-                if attempt > self.max_retries:
-                    logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}")
-                    raise
-
-                delay = self.retry_delay_factor * (2 ** (attempt - 1))
-                logger.warning(
-                    f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}"
-                )
-                await asyncio.sleep(delay)
-
-    async def _render_with_playwright(
-        self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None
-    ) -> str:
-        """Fetch and render a URL using Playwright for JavaScript content.
-
-        Args:
-            url: The URL to fetch.
-            js_wait: Seconds to wait for JavaScript to load.
-            timeout: Timeout for the request (in seconds, defaults to instance timeout).
-
-        Returns:
-            str: The rendered HTML content.
-
-        Raises:
-            RuntimeError: If Playwright is not installed.
-            Exception: If all retry attempts fail.
-        """
-        if async_playwright is None:
-            raise RuntimeError(
-                "Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
-            )
-
-        timeout_val = timeout or self.timeout
-        logger.info(
-            f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
-        )
-
-        attempt = 0
-        while True:
-            try:
-                async with async_playwright() as p:
-                    logger.info(f"Launching headless Chromium browser for {url}")
-                    browser = await p.chromium.launch(headless=True)
-                    try:
-                        context = await browser.new_context()
-                        page = await context.new_page()
-                        logger.info(f"Navigating to {url} and waiting for network idle")
-                        await page.goto(
-                            url,
-                            wait_until="networkidle",
-                            timeout=int(timeout_val * 1000),
-                        )
-                        if js_wait:
-                            logger.info(f"Waiting {js_wait}s for JavaScript to execute")
-                            await asyncio.sleep(js_wait)
-                        content = await page.content()
-                        logger.info(
-                            f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
-                        )
-                        return content
-                    finally:
-                        await browser.close()
-            except Exception as exc:
-                attempt += 1
-                if attempt > self.max_retries:
-                    logger.error(f"Playwright fetch failed for {url}: {exc}")
-                    raise
-                backoff = self.retry_delay_factor * (2 ** (attempt - 1))
-                logger.warning(
-                    f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})"
-                )
-                await asyncio.sleep(backoff)
-
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
@@ -435,7 +74,7 @@ class BeautifulSoupCrawler:
             )
         raise ValueError(f"Invalid extraction rule: {rule}")

-    def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str:
+    def extract(self, html: str, rule: ExtractionRule) -> str:
         """Extract content from HTML using BeautifulSoup or lxml XPath.

         Args:
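With _extract_with_bs4 promoted to a public extract(), extraction can be exercised without any network I/O. A minimal sketch, not part of the commit; it builds the ExtractionRule through _normalize_rule because the dataclass's full field list sits outside these hunks, and the rule dict mirrors the repository's tests:

    from cognee.tasks.web_scraper.bs4_crawler import BeautifulSoupCrawler

    html = (
        '<html><body>'
        '<div class="quote"><small>Albert Einstein</small></div>'
        '<div class="quote"><small>Jane Austen</small></div>'
        '</body></html>'
    )

    crawler = BeautifulSoupCrawler()  # fetching state is gone from this class
    rule = crawler._normalize_rule({"selector": ".quote small", "all": True})
    print(crawler.extract(html, rule))  # matched texts joined by rule.join_with
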
@@ -491,79 +130,3 @@ class BeautifulSoupCrawler:
                     val = el.get(rule.attr)
                     return (val or "").strip()
                 return el.get_text(strip=True)
-
-    async def fetch_urls(
-        self,
-        urls: Union[str, List[str]],
-        *,
-        use_playwright: bool = False,
-        playwright_js_wait: float = 0.8,
-    ) -> Dict[str, str]:
-        """Fetch and extract content from URLs using BeautifulSoup or Playwright.
-
-        Args:
-            urls: A single URL, list of URLs, or dict mapping URLs to extraction rules.
-            extraction_rules: Default extraction rules for string or list URLs.
-            use_playwright: If True, use Playwright for JavaScript rendering.
-            playwright_js_wait: Seconds to wait for JavaScript to load.
-            join_all_matches: If True, extract all matching elements for each rule.
-
-        Returns:
-            Dict[str, str]: A dictionary mapping URLs to their extracted content.
-
-        Raises:
-            ValueError: If extraction_rules are missing when required or if urls is invalid.
-            Exception: If fetching or extraction fails.
-        """
-        if isinstance(urls, str):
-            urls = [urls]
-        else:
-            raise ValueError(f"Invalid urls type: {type(urls)}")
-
-        async def _task(url: str):
-            async with self._sem:
-                try:
-                    logger.info(f"Processing URL: {url}")
-
-                    # Check robots.txt
-                    allowed = await self._is_url_allowed(url)
-                    if not allowed:
-                        logger.warning(f"URL disallowed by robots.txt: {url}")
-                        return url, ""
-
-                    logger.info(f"Robots.txt check passed for {url}")
-
-                    # Fetch HTML
-                    if use_playwright:
-                        logger.info(
-                            f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
-                        )
-                        html = await self._render_with_playwright(
-                            url, js_wait=playwright_js_wait, timeout=self.timeout
-                        )
-                    else:
-                        logger.info(f"Fetching {url} with httpx")
-                        html = await self._fetch_httpx(url)
-
-                    logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
-
-                    return url, html
-
-                except Exception as e:
-                    logger.error(f"Error processing {url}: {e}")
-                    return url, ""
-
-        logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
-        tasks = [asyncio.create_task(_task(u)) for u in urls]
-        results = {}
-        completed = 0
-        total = len(tasks)
-
-        for coro in asyncio.as_completed(tasks):
-            url, html = await coro
-            results[url] = html
-            completed += 1
-            logger.info(f"Progress: {completed}/{total} URLs processed")
-
-        logger.info(f"Completed fetching all {len(results)} URL(s)")
-        return results
cognee/tasks/web_scraper/config.py

@@ -10,7 +10,7 @@ class TavilyConfig(BaseModel):
     timeout: Optional[int] = Field(default=10, ge=1, le=60)


-class SoupCrawlerConfig(BaseModel):
+class DefaultCrawlerConfig(BaseModel):
     concurrency: int = 5
     crawl_delay: float = 0.5
     max_crawl_delay: Optional[float] = (
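The config model is renamed to match the new crawler; the fields shown in this hunk are unchanged. Construction mirrors the updated tests (values illustrative, not part of the commit):

    from cognee.tasks.web_scraper.config import DefaultCrawlerConfig

    config = DefaultCrawlerConfig(
        concurrency=5,     # parallel requests
        crawl_delay=0.5,   # minimum seconds between hits to one domain
        timeout=15.0,      # per-request timeout
    )
    print(config.max_crawl_delay)  # field carried over from SoupCrawlerConfig
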
cognee/tasks/web_scraper/default_url_crawler.py (new file, 446 lines)

@@ -0,0 +1,446 @@
+import asyncio
+from dataclasses import dataclass, field
+from functools import lru_cache
+import time
+from typing import Any, Union, List, Dict, Optional
+from urllib.parse import urlparse
+import httpx
+
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.utils import UrlsToHtmls
+
+logger = get_logger()
+
+try:
+    from protego import Protego
+except ImportError:
+    logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
+    Protego = None
+
+try:
+    from playwright.async_api import async_playwright
+except ImportError:
+    logger.warning(
+        "Failed to import playwright, make sure to install using pip install playwright>=1.9.0"
+    )
+    async_playwright = None
+
+
+@dataclass
+class RobotsTxtCache:
+    """Cache for robots.txt data.
+
+    Attributes:
+        protego: Parsed robots.txt object (Protego instance).
+        crawl_delay: Delay between requests (in seconds).
+        timestamp: Time when the cache entry was created.
+    """
+
+    protego: Any
+    crawl_delay: float
+    timestamp: float = field(default_factory=time.time)
+
+
+class DefaultUrlCrawler:
+    def __init__(
+        self,
+        *,
+        concurrency: int = 5,
+        crawl_delay: float = 0.5,
+        max_crawl_delay: Optional[float] = 10.0,
+        timeout: float = 15.0,
+        max_retries: int = 2,
+        retry_delay_factor: float = 0.5,
+        headers: Optional[Dict[str, str]] = None,
+        robots_cache_ttl: float = 3600.0,
+    ):
+        """Initialize the BeautifulSoupCrawler.
+
+        Args:
+            concurrency: Number of concurrent requests allowed.
+            crawl_delay: Minimum seconds between requests to the same domain.
+            max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
+            timeout: Per-request timeout in seconds.
+            max_retries: Number of retries for failed requests.
+            retry_delay_factor: Multiplier for exponential backoff on retries.
+            headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0).
+            robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
+        """
+        self.concurrency = concurrency
+        self._sem = asyncio.Semaphore(concurrency)
+        self.crawl_delay = crawl_delay
+        self.max_crawl_delay = max_crawl_delay
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.retry_delay_factor = retry_delay_factor
+        self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
+        self.robots_cache_ttl = robots_cache_ttl
+        self._last_request_time_per_domain: Dict[str, float] = {}
+        self._robots_cache: Dict[str, RobotsTxtCache] = {}
+        self._client: Optional[httpx.AsyncClient] = None
+        self._robots_lock = asyncio.Lock()
+
+    async def _ensure_client(self):
+        """Initialize the HTTP client if not already created."""
+        if self._client is None:
+            self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers)
+
+    async def close(self):
+        """Close the HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
+
+    async def __aenter__(self):
+        """Enter the context manager, initializing the HTTP client."""
+        await self._ensure_client()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context manager, closing the HTTP client."""
+        await self.close()
+
+    @lru_cache(maxsize=1024)
+    def _domain_from_url(self, url: str) -> str:
+        """Extract the domain (netloc) from a URL.
+
+        Args:
+            url: The URL to parse.
+
+        Returns:
+            str: The domain (netloc) of the URL.
+        """
+        try:
+            return urlparse(url).netloc
+        except Exception:
+            return url
+
+    @lru_cache(maxsize=1024)
+    def _get_domain_root(self, url: str) -> str:
+        """Get the root URL (scheme and netloc) from a URL.
+
+        Args:
+            url: The URL to parse.
+
+        Returns:
+            str: The root URL (e.g., "https://example.com").
+        """
+        parsed = urlparse(url)
+        return f"{parsed.scheme}://{parsed.netloc}"
+
+    async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
+        """Enforce rate limiting for requests to the same domain.
+
+        Args:
+            url: The URL to check.
+            crawl_delay: Custom crawl delay in seconds (if any).
+        """
+        domain = self._domain_from_url(url)
+        last = self._last_request_time_per_domain.get(domain)
+        delay = crawl_delay if crawl_delay is not None else self.crawl_delay
+
+        if last is None:
+            self._last_request_time_per_domain[domain] = time.time()
+            return
+
+        elapsed = time.time() - last
+        wait_for = delay - elapsed
+        if wait_for > 0:
+            logger.info(
+                f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
+            )
+            await asyncio.sleep(wait_for)
+            logger.info(f"Rate limit wait completed for {url}")
+        self._last_request_time_per_domain[domain] = time.time()
+
+    async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
+        """Get cached robots.txt data if valid.
+
+        Args:
+            domain_root: The root URL (e.g., "https://example.com").
+
+        Returns:
+            Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found.
+        """
+        if Protego is None:
+            return None
+
+        cached = self._robots_cache.get(domain_root)
+        if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl:
+            return cached
+        return None
+
+    async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
+        """Fetch and cache robots.txt data.
+
+        Args:
+            domain_root: The root URL (e.g., "https://example.com").
+
+        Returns:
+            RobotsTxtCache: Cached robots.txt data with crawl delay.
+
+        Raises:
+            Exception: If fetching robots.txt fails.
+        """
+        async with self._robots_lock:
+            cached = await self._get_robots_cache(domain_root)
+            if cached:
+                return cached
+
+            robots_url = f"{domain_root}/robots.txt"
+            try:
+                await self._ensure_client()
+                await self._respect_rate_limit(robots_url, self.crawl_delay)
+                resp = await self._client.get(robots_url, timeout=5.0)
+                content = resp.text if resp.status_code == 200 else ""
+            except Exception as e:
+                logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}")
+                content = ""
+
+            protego = Protego.parse(content) if content.strip() else None
+            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
+
+            crawl_delay = self.crawl_delay
+            if protego:
+                delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
+                if delay:
+                    # Apply max_crawl_delay cap if configured
+                    if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
+                        logger.warning(
+                            f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
+                            f"capping to max_crawl_delay={self.max_crawl_delay}s"
+                        )
+                        crawl_delay = self.max_crawl_delay
+                    else:
+                        crawl_delay = delay
+
+            cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
+            self._robots_cache[domain_root] = cache_entry
+            return cache_entry
+
+    async def _is_url_allowed(self, url: str) -> bool:
+        """Check if a URL is allowed by robots.txt.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            bool: True if the URL is allowed, False otherwise.
+        """
+        if Protego is None:
+            return True
+
+        try:
+            domain_root = self._get_domain_root(url)
+            cache = await self._get_robots_cache(domain_root)
+            if cache is None:
+                cache = await self._fetch_and_cache_robots(domain_root)
+
+            if cache.protego is None:
+                return True
+
+            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
+            return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url)
+        except Exception as e:
+            logger.debug(f"Error checking robots.txt for {url}: {e}")
+            return True
+
+    async def _get_crawl_delay(self, url: str) -> float:
+        """Get the crawl delay for a URL from robots.txt.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            float: Crawl delay in seconds.
+        """
+        if Protego is None:
+            return self.crawl_delay
+
+        try:
+            domain_root = self._get_domain_root(url)
+            cache = await self._get_robots_cache(domain_root)
+            if cache is None:
+                cache = await self._fetch_and_cache_robots(domain_root)
+            return cache.crawl_delay
+        except Exception:
+            return self.crawl_delay
+
+    async def _fetch_httpx(self, url: str) -> str:
+        """Fetch a URL using HTTPX with retries.
+
+        Args:
+            url: The URL to fetch.
+
+        Returns:
+            str: The HTML content of the page.
+
+        Raises:
+            Exception: If all retry attempts fail.
+        """
+        await self._ensure_client()
+        assert self._client is not None, "HTTP client not initialized"
+
+        attempt = 0
+        crawl_delay = await self._get_crawl_delay(url)
+        logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
+
+        while True:
+            try:
+                await self._respect_rate_limit(url, crawl_delay)
+                resp = await self._client.get(url)
+                resp.raise_for_status()
+                logger.info(
+                    f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
+                )
+                return resp.text
+            except Exception as exc:
+                attempt += 1
+                if attempt > self.max_retries:
+                    logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}")
+                    raise
+
+                delay = self.retry_delay_factor * (2 ** (attempt - 1))
+                logger.warning(
+                    f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}"
+                )
+                await asyncio.sleep(delay)
+
+    async def _render_with_playwright(
+        self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None
+    ) -> str:
+        """Fetch and render a URL using Playwright for JavaScript content.
+
+        Args:
+            url: The URL to fetch.
+            js_wait: Seconds to wait for JavaScript to load.
+            timeout: Timeout for the request (in seconds, defaults to instance timeout).
+
+        Returns:
+            str: The rendered HTML content.
+
+        Raises:
+            RuntimeError: If Playwright is not installed.
+            Exception: If all retry attempts fail.
+        """
+        if async_playwright is None:
+            raise RuntimeError(
+                "Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
+            )
+
+        timeout_val = timeout or self.timeout
+        logger.info(
+            f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
+        )
+
+        attempt = 0
+        while True:
+            try:
+                async with async_playwright() as p:
+                    logger.info(f"Launching headless Chromium browser for {url}")
+                    browser = await p.chromium.launch(headless=True)
+                    try:
+                        context = await browser.new_context()
+                        page = await context.new_page()
+                        logger.info(f"Navigating to {url} and waiting for network idle")
+                        await page.goto(
+                            url,
+                            wait_until="networkidle",
+                            timeout=int(timeout_val * 1000),
+                        )
+                        if js_wait:
+                            logger.info(f"Waiting {js_wait}s for JavaScript to execute")
+                            await asyncio.sleep(js_wait)
+                        content = await page.content()
+                        logger.info(
+                            f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
+                        )
+                        return content
+                    finally:
+                        await browser.close()
+            except Exception as exc:
+                attempt += 1
+                if attempt > self.max_retries:
+                    logger.error(f"Playwright fetch failed for {url}: {exc}")
+                    raise
+                backoff = self.retry_delay_factor * (2 ** (attempt - 1))
+                logger.warning(
+                    f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})"
+                )
+                await asyncio.sleep(backoff)
+
+    async def fetch_urls(
+        self,
+        urls: Union[str, List[str]],
+        *,
+        use_playwright: bool = False,
+        playwright_js_wait: float = 0.8,
+    ) -> UrlsToHtmls:
+        """Fetch and extract content from URLs using BeautifulSoup or Playwright.
+
+        Args:
+            urls: A single URL, list of URLs, or dict mapping URLs to extraction rules.
+            extraction_rules: Default extraction rules for string or list URLs.
+            use_playwright: If True, use Playwright for JavaScript rendering.
+            playwright_js_wait: Seconds to wait for JavaScript to load.
+            join_all_matches: If True, extract all matching elements for each rule.
+
+        Returns:
+            Dict[str, str]: A dictionary mapping URLs to their extracted content.
+
+        Raises:
+            ValueError: If extraction_rules are missing when required or if urls is invalid.
+            Exception: If fetching or extraction fails.
+        """
+        if isinstance(urls, str):
+            urls = [urls]
+        else:
+            raise ValueError(f"Invalid urls type: {type(urls)}")
+
+        async def _task(url: str):
+            async with self._sem:
+                try:
+                    logger.info(f"Processing URL: {url}")
+
+                    # Check robots.txt
+                    allowed = await self._is_url_allowed(url)
+                    if not allowed:
+                        logger.warning(f"URL disallowed by robots.txt: {url}")
+                        return url, ""
+
+                    logger.info(f"Robots.txt check passed for {url}")
+
+                    # Fetch HTML
+                    if use_playwright:
+                        logger.info(
+                            f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
+                        )
+                        html = await self._render_with_playwright(
+                            url, js_wait=playwright_js_wait, timeout=self.timeout
+                        )
+                    else:
+                        logger.info(f"Fetching {url} with httpx")
+                        html = await self._fetch_httpx(url)
+
+                    logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
+
+                    return url, html
+
+                except Exception as e:
+                    logger.error(f"Error processing {url}: {e}")
+                    return url, ""
+
+        logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
+        tasks = [asyncio.create_task(_task(u)) for u in urls]
+        results = {}
+        completed = 0
+        total = len(tasks)
+
+        for coro in asyncio.as_completed(tasks):
+            url, html = await coro
+            results[url] = html
+            completed += 1
+            logger.info(f"Progress: {completed}/{total} URLs processed")
+
+        logger.info(f"Completed fetching all {len(results)} URL(s)")
+        return results
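The new class keeps the async-context-manager protocol, so the HTTP client is opened and closed deterministically. Note that fetch_urls as committed accepts only a single string: despite the Union[str, List[str]] hint, the else branch raises ValueError for any non-str input. A minimal usage sketch, not part of the commit; the URL mirrors the repository tests:

    import asyncio

    from cognee.tasks.web_scraper import DefaultUrlCrawler


    async def main():
        async with DefaultUrlCrawler(concurrency=1, crawl_delay=0.1) as crawler:
            # Pass one URL string; a list currently trips the ValueError branch.
            results = await crawler.fetch_urls(
                "https://books.toscrape.com/",
                use_playwright=False,  # True would route through Playwright rendering
            )
        for url, html in results.items():
            print(url, len(html), "bytes of HTML")


    asyncio.run(main())
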
cognee/tasks/web_scraper/utils.py

@@ -4,19 +4,20 @@ This module provides functions to fetch and extract content from web pages, supp
 both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
 """

-from typing import Dict, List, Union, Optional, Literal
-from cognee.context_global_variables import soup_crawler_config, tavily_config
+import os
+from re import L
+from typing import List, Union, TypeAlias
 from cognee.shared.logging_utils import get_logger
+from .default_url_crawler import DefaultUrlCrawler
 from .bs4_crawler import BeautifulSoupCrawler
-from .config import TavilyConfig
+from .config import DefaultCrawlerConfig, TavilyConfig

 logger = get_logger(__name__)

+UrlsToHtmls: TypeAlias = dict[str, str]
+

-async def fetch_page_content(
-    urls: Union[str, List[str]],
-    preferred_tool: Optional[Literal["tavily", "beautifulsoup"]] = "beautifulsoup",
-) -> Dict[str, str]:
+async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from one or more URLs using the specified tool.

     This function retrieves web page content using either BeautifulSoup (with custom
@@ -29,7 +30,7 @@ async def fetch_page_content(
             Defaults to "beautifulsoup".
         tavily_config: Configuration for Tavily API, including API key.
             Required if preferred_tool is "tavily".
-        soup_crawler_config: Configuration for BeautifulSoup crawler, including
+        default_crawler_config: Configuration for BeautifulSoup crawler, including
             extraction rules. Required if preferred_tool is "beautifulsoup" and
             extraction_rules are needed.
@@ -44,51 +45,39 @@ async def fetch_page_content(
     installed.
     """
     url_list = [urls] if isinstance(urls, str) else urls
-    logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}")
-
-    _tavily_config = tavily_config.get()
-    _soup_crawler_config = soup_crawler_config.get()

-    if preferred_tool == "tavily":
-        if not tavily_config or tavily_config.api_key is None:
-            raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily")
-        logger.info("Using Tavily API for content extraction")
+    if os.getenv("TAVILY_API_KEY"):
+        logger.info("Using Tavily API for url fetching")
         return await fetch_with_tavily(urls, tavily_config)
+    else:
+        logger.info("Using default crawler for content extraction")

-    if preferred_tool == "beautifulsoup":
-        try:
-            from bs4 import BeautifulSoup as _  # noqa: F401
-        except ImportError:
-            logger.error(
-                "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
-            )
-            raise ImportError
-        if soup_crawler_config is None or soup_crawler_config.extraction_rules is None:
-            raise ValueError("soup_crawler_config must be provided when not using Tavily")
-
-        logger.info("Using BeautifulSoup for content extraction")
+        default_crawler_config = (
+            DefaultCrawlerConfig()
+        )  # We've decided to use defaults, and configure through env vars as needed
+
         logger.info(
-            f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s"
+            f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s"
         )
-        crawler = BeautifulSoupCrawler(
-            concurrency=soup_crawler_config.concurrency,
-            crawl_delay=soup_crawler_config.crawl_delay,
-            max_crawl_delay=soup_crawler_config.max_crawl_delay,
-            timeout=soup_crawler_config.timeout,
-            max_retries=soup_crawler_config.max_retries,
-            retry_delay_factor=soup_crawler_config.retry_delay_factor,
-            headers=soup_crawler_config.headers,
-            robots_cache_ttl=soup_crawler_config.robots_cache_ttl,
+        crawler = DefaultUrlCrawler(
+            concurrency=default_crawler_config.concurrency,
+            crawl_delay=default_crawler_config.crawl_delay,
+            max_crawl_delay=default_crawler_config.max_crawl_delay,
+            timeout=default_crawler_config.timeout,
+            max_retries=default_crawler_config.max_retries,
+            retry_delay_factor=default_crawler_config.retry_delay_factor,
+            headers=default_crawler_config.headers,
+            robots_cache_ttl=default_crawler_config.robots_cache_ttl,
         )
         try:
             logger.info(
-                f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})"
+                f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})"
             )
             results = await crawler.fetch_urls(
                 urls,
-                use_playwright=soup_crawler_config.use_playwright,
-                playwright_js_wait=soup_crawler_config.playwright_js_wait,
+                use_playwright=default_crawler_config.use_playwright,
+                playwright_js_wait=default_crawler_config.playwright_js_wait,
             )
             logger.info(f"Successfully fetched content from {len(results)} URL(s)")
             return results
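fetch_page_content now branches on the environment alone. Two caveats visible in the hunks above: the new import block keeps a stray "from re import L", and the Tavily branch still passes a tavily_config name that this module no longer defines, so the Tavily path deserves a second look. A sketch of the default (non-Tavily) path, not part of the commit; the URL is illustrative:

    import asyncio
    import os

    from cognee.tasks.web_scraper.utils import fetch_page_content


    async def main():
        # Without TAVILY_API_KEY set, the default crawler runs with
        # DefaultCrawlerConfig() defaults (env-var overrides as the
        # inline comment in the hunk suggests).
        os.environ.pop("TAVILY_API_KEY", None)
        pages = await fetch_page_content("https://quotes.toscrape.com/")
        print({url: len(html) for url, html in pages.items()})


    asyncio.run(main())
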
@@ -102,7 +91,7 @@ async def fetch_page_content(
 async def fetch_with_tavily(
     urls: Union[str, List[str]], tavily_config: TavilyConfig
-) -> Dict[str, str]:
+) -> UrlsToHtmls:
     """Fetch content from URLs using the Tavily API.

     Args:
cognee/tasks/web_scraper/web_scraper_task.py

@@ -19,7 +19,7 @@ from cognee.tasks.storage.index_graph_edges import index_graph_edges
 from cognee.modules.engine.operations.setup import setup

 from .models import WebPage, WebSite, ScrapingJob
-from .config import SoupCrawlerConfig, TavilyConfig
+from .config import DefaultCrawlerConfig, TavilyConfig
 from .utils import fetch_page_content

 try:
@@ -47,7 +47,7 @@ async def cron_web_scraper_task(
     schedule: str = None,
     extraction_rules: dict = None,
     tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
-    soup_crawler_config: SoupCrawlerConfig = None,
+    soup_crawler_config: DefaultCrawlerConfig = None,
     tavily_config: TavilyConfig = None,
     job_name: str = "scraping",
 ):
@@ -121,7 +121,7 @@ async def web_scraper_task(
     schedule: str = None,
     extraction_rules: dict = None,
     tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
-    soup_crawler_config: SoupCrawlerConfig = None,
+    soup_crawler_config: DefaultCrawlerConfig = None,
     tavily_config: TavilyConfig = None,
     job_name: str = None,
 ):
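Both task entry points keep the parameter name soup_crawler_config while retyping it to DefaultCrawlerConfig, so existing keyword calls keep working. A hedged call sketch, not part of the commit; the leading URL parameter sits outside these hunks and is assumed here from the tests:

    import asyncio

    from cognee.tasks.web_scraper import web_scraper_task
    from cognee.tasks.web_scraper.config import DefaultCrawlerConfig


    async def main():
        # Assumption: the first parameter is the URL to scrape, as exercised
        # by the repository tests; extraction_rules mirrors the bs4 fixtures.
        await web_scraper_task(
            "https://books.toscrape.com/",
            extraction_rules={"titles": "article.product_pod h3 a"},
            soup_crawler_config=DefaultCrawlerConfig(concurrency=1, crawl_delay=0.1),
        )


    asyncio.run(main())
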
@@ -341,7 +341,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
         soup_crawler_config: Configuration for BeautifulSoup crawler.

     Returns:
-        Tuple[SoupCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
+        Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
             tavily_config, and preferred_tool ("tavily" or "beautifulsoup").

     Raises:
@@ -350,7 +350,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
     preferred_tool = "beautifulsoup"

     if extraction_rules and not soup_crawler_config:
-        soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
+        soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules)

     if tavily_api_key:
         if not tavily_config:
@@ -1,6 +1,6 @@
 import asyncio
 import cognee
-from cognee.tasks.web_scraper.config import SoupCrawlerConfig
+from cognee.tasks.web_scraper.config import DefaultCrawlerConfig
 from cognee.tasks.web_scraper import cron_web_scraper_task

@@ -14,7 +14,7 @@ async def test_web_scraping_using_bs4():
         "authors": {"selector": ".quote small", "all": True},
     }

-    soup_config = SoupCrawlerConfig(
+    soup_config = DefaultCrawlerConfig(
         concurrency=5,
         crawl_delay=0.5,
         timeout=15.0,
@@ -47,7 +47,7 @@ async def test_web_scraping_using_bs4_and_incremental_loading():
     url = "https://books.toscrape.com/"
     rules = {"titles": "article.product_pod h3 a", "prices": "article.product_pod p.price_color"}

-    soup_config = SoupCrawlerConfig(
+    soup_config = DefaultCrawlerConfig(
         concurrency=1,
         crawl_delay=0.1,
         timeout=10.0,