"""BeautifulSoup-based web crawler for extracting content from web pages.
|
|
|
|
This module provides the BeautifulSoupCrawler class for fetching and extracting content
|
|
from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
|
|
supports robots.txt handling, rate limiting, and custom extraction rules.
|
|
"""
|
|
|
|
from typing import Union, Dict, Any, Optional, List
|
|
from dataclasses import dataclass
|
|
from bs4 import BeautifulSoup
|
|
from cognee.infrastructure.loaders import LoaderInterface
|
|
from cognee.shared.logging_utils import get_logger
|
|
|
|
logger = get_logger(__name__)


@dataclass
class ExtractionRule:
    """Normalized extraction rule for web content.

    Attributes:
        selector: CSS selector for extraction (if any).
        xpath: XPath expression for extraction (if any).
        attr: HTML attribute to extract (if any).
        all: If True, extract all matching elements; otherwise, extract only the first.
        join_with: String used to join multiple extracted elements.
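
    Example (illustrative; constructs a rule directly from the fields below):
        >>> rule = ExtractionRule(selector="h1", all=True, join_with=" | ")
        >>> rule.selector
        'h1'
        >>> rule.join_with
        ' | '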
    """

    selector: Optional[str] = None
    xpath: Optional[str] = None
    attr: Optional[str] = None
    all: bool = False
    join_with: str = " "


class BeautifulSoupLoader(LoaderInterface):
    """Crawler for fetching and extracting web content using BeautifulSoup.

    Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
    compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.

    Attributes:
        concurrency: Number of concurrent requests allowed.
        crawl_delay: Minimum seconds between requests to the same domain.
        max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
        timeout: Per-request timeout in seconds.
        max_retries: Number of retries for failed requests.
        retry_delay_factor: Multiplier for exponential backoff on retries.
        headers: HTTP headers for requests (e.g., User-Agent).
        robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
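
    Example (illustrative; assumes the loader can be constructed with defaults):
        >>> loader = BeautifulSoupLoader()
        >>> loader.loader_name
        'beautiful_soup_loader'
        >>> loader.can_handle("html", "text/html")
        True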
    """

    @property
    def supported_extensions(self) -> List[str]:
        return ["html"]

    @property
    def supported_mime_types(self) -> List[str]:
        return ["text/html"]

    @property
    def loader_name(self) -> str:
        return "beautiful_soup_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        # supported_extensions and supported_mime_types are properties, so they
        # must be accessed as attributes, not called like methods.
        return extension in self.supported_extensions and mime_type in self.supported_mime_types

    async def load(self, file_path: str, **kwargs):
        """Load HTML content from ``file_path`` (placeholder; not yet implemented)."""
        pass

    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
        """Normalize an extraction rule to an ExtractionRule dataclass.

        Args:
            rule: A string (CSS selector) or dict with extraction parameters.

        Returns:
            ExtractionRule: Normalized extraction rule.

        Raises:
            ValueError: If the rule is invalid.
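
        Example (illustrative; assumes the loader can be constructed with defaults):
            >>> loader = BeautifulSoupLoader()
            >>> loader._normalize_rule("h1.title").selector
            'h1.title'
            >>> loader._normalize_rule({"xpath": "//a/@href", "all": True}).all
            True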
        """
        if isinstance(rule, str):
            # A bare string is shorthand for a CSS selector.
            return ExtractionRule(selector=rule)
        if isinstance(rule, dict):
            return ExtractionRule(
                selector=rule.get("selector"),
                xpath=rule.get("xpath"),
                attr=rule.get("attr"),
                all=bool(rule.get("all", False)),
                join_with=rule.get("join_with", " "),
            )
        raise ValueError(f"Invalid extraction rule: {rule}")

    def extract(self, html: str, rule: ExtractionRule) -> str:
        """Extract content from HTML using BeautifulSoup or lxml XPath.

        Args:
            html: The HTML content to extract from.
            rule: The extraction rule to apply.

        Returns:
            str: The extracted content.

        Raises:
            RuntimeError: If XPath is used but lxml is not installed.
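
        Example (illustrative; exercises the CSS-selector path of this method):
            >>> loader = BeautifulSoupLoader()
            >>> html = "<div><p class='x'>Hello</p><p class='x'>World</p></div>"
            >>> loader.extract(html, ExtractionRule(selector="p.x", all=True))
            'Hello World'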
        """
        if rule.xpath:
            # lxml is an optional dependency; it is only needed for XPath rules.
            try:
                from lxml import html as lxml_html
            except ImportError:
                raise RuntimeError(
                    "XPath requested but lxml is not available. Install lxml or use CSS selectors."
                )
            doc = lxml_html.fromstring(html)
            nodes = doc.xpath(rule.xpath)
            texts = []
            for n in nodes:
                # Element nodes expose text_content(); attribute and text results
                # from XPath come back as plain strings.
                if hasattr(n, "text_content"):
                    texts.append(n.text_content().strip())
                else:
                    texts.append(str(n).strip())
            return rule.join_with.join(t for t in texts if t)

        if not rule.selector:
            return ""

        # Parse with BeautifulSoup only for the CSS-selector path; the XPath
        # branch above uses lxml directly and returns before reaching here.
        soup = BeautifulSoup(html, "html.parser")

        if rule.all:
            pieces = []
            for el in soup.select(rule.selector):
                if rule.attr:
                    val = el.get(rule.attr)
                    # Multi-valued attributes (e.g. class) come back as lists.
                    if isinstance(val, list):
                        val = " ".join(val)
                    if val:
                        pieces.append(val.strip())
                else:
                    text = el.get_text(strip=True)
                    if text:
                        pieces.append(text)
            return rule.join_with.join(pieces).strip()

        el = soup.select_one(rule.selector)
        if el is None:
            return ""
        if rule.attr:
            val = el.get(rule.attr)
            if isinstance(val, list):
                val = " ".join(val)
            return (val or "").strip()
        return el.get_text(strip=True)