Added Documentation
This commit is contained in:
parent
0f64f6804d
commit
4d5146c802
4 changed files with 305 additions and 68 deletions
|
|
@ -1,3 +1,10 @@
|
||||||
|
"""Web scraping module for cognee.
|
||||||
|
|
||||||
|
This module provides tools for scraping web content, managing scraping jobs, and storing
|
||||||
|
data in a graph database. It includes classes and functions for crawling web pages using
|
||||||
|
BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
|
||||||
|
"""
|
||||||
|
|
||||||
from .bs4_crawler import BeautifulSoupCrawler
|
from .bs4_crawler import BeautifulSoupCrawler
|
||||||
from .utils import fetch_page_content
|
from .utils import fetch_page_content
|
||||||
from .web_scraper_task import cron_web_scraper_task, web_scraper_task
|
from .web_scraper_task import cron_web_scraper_task, web_scraper_task
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,16 @@
|
||||||
|
"""BeautifulSoup-based web crawler for extracting content from web pages.
|
||||||
|
|
||||||
|
This module provides the BeautifulSoupCrawler class for fetching and extracting content
|
||||||
|
from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
|
||||||
|
supports robots.txt handling, rate limiting, and custom extraction rules.
|
||||||
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
from typing import Union, List, Dict, Any, Optional
|
from typing import Union, List, Dict, Any, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
@ -28,7 +34,15 @@ except ImportError:
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ExtractionRule:
|
class ExtractionRule:
|
||||||
"""Normalized extraction rule"""
|
"""Normalized extraction rule for web content.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
selector: CSS selector for extraction (if any).
|
||||||
|
xpath: XPath expression for extraction (if any).
|
||||||
|
attr: HTML attribute to extract (if any).
|
||||||
|
all: If True, extract all matching elements; otherwise, extract first.
|
||||||
|
join_with: String to join multiple extracted elements.
|
||||||
|
"""
|
||||||
|
|
||||||
selector: Optional[str] = None
|
selector: Optional[str] = None
|
||||||
xpath: Optional[str] = None
|
xpath: Optional[str] = None
|
||||||
|
|
@ -39,7 +53,13 @@ class ExtractionRule:
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RobotsTxtCache:
|
class RobotsTxtCache:
|
||||||
"""Cache for robots.txt data"""
|
"""Cache for robots.txt data.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
protego: Parsed robots.txt object (Protego instance).
|
||||||
|
crawl_delay: Delay between requests (in seconds).
|
||||||
|
timestamp: Time when the cache entry was created.
|
||||||
|
"""
|
||||||
|
|
||||||
protego: Any
|
protego: Any
|
||||||
crawl_delay: float
|
crawl_delay: float
|
||||||
|
|
@ -47,6 +67,21 @@ class RobotsTxtCache:
|
||||||
|
|
||||||
|
|
||||||
class BeautifulSoupCrawler:
|
class BeautifulSoupCrawler:
|
||||||
|
"""Crawler for fetching and extracting web content using BeautifulSoup.
|
||||||
|
|
||||||
|
Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
|
||||||
|
compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
concurrency: Number of concurrent requests allowed.
|
||||||
|
crawl_delay: Minimum seconds between requests to the same domain.
|
||||||
|
timeout: Per-request timeout in seconds.
|
||||||
|
max_retries: Number of retries for failed requests.
|
||||||
|
retry_delay_factor: Multiplier for exponential backoff on retries.
|
||||||
|
headers: HTTP headers for requests (e.g., User-Agent).
|
||||||
|
robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
|
|
@ -56,16 +91,18 @@ class BeautifulSoupCrawler:
|
||||||
max_retries: int = 2,
|
max_retries: int = 2,
|
||||||
retry_delay_factor: float = 0.5,
|
retry_delay_factor: float = 0.5,
|
||||||
headers: Optional[Dict[str, str]] = None,
|
headers: Optional[Dict[str, str]] = None,
|
||||||
robots_cache_ttl: float = 3600.0, # Cache robots.txt for 1 hour
|
robots_cache_ttl: float = 3600.0,
|
||||||
):
|
):
|
||||||
"""
|
"""Initialize the BeautifulSoupCrawler.
|
||||||
concurrency: number of concurrent requests allowed
|
|
||||||
crawl_delay: minimum seconds to wait between requests to the SAME domain
|
Args:
|
||||||
timeout: per-request timeout
|
concurrency: Number of concurrent requests allowed.
|
||||||
max_retries: number of retries on network errors
|
crawl_delay: Minimum seconds between requests to the same domain.
|
||||||
retry_delay_factor: multiplier for exponential retry failure delay
|
timeout: Per-request timeout in seconds.
|
||||||
headers: default headers for requests
|
max_retries: Number of retries for failed requests.
|
||||||
robots_cache_ttl: TTL for robots.txt cache in seconds
|
retry_delay_factor: Multiplier for exponential backoff on retries.
|
||||||
|
headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0).
|
||||||
|
robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
|
||||||
"""
|
"""
|
||||||
self.concurrency = concurrency
|
self.concurrency = concurrency
|
||||||
self._sem = asyncio.Semaphore(concurrency)
|
self._sem = asyncio.Semaphore(concurrency)
|
||||||
|
|
@ -75,33 +112,42 @@ class BeautifulSoupCrawler:
|
||||||
self.retry_delay_factor = retry_delay_factor
|
self.retry_delay_factor = retry_delay_factor
|
||||||
self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
|
self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
|
||||||
self.robots_cache_ttl = robots_cache_ttl
|
self.robots_cache_ttl = robots_cache_ttl
|
||||||
|
|
||||||
self._last_request_time_per_domain: Dict[str, float] = {}
|
self._last_request_time_per_domain: Dict[str, float] = {}
|
||||||
self._robots_cache: Dict[str, RobotsTxtCache] = {}
|
self._robots_cache: Dict[str, RobotsTxtCache] = {}
|
||||||
self._client: Optional[httpx.AsyncClient] = None
|
self._client: Optional[httpx.AsyncClient] = None
|
||||||
self._robots_lock = asyncio.Lock()
|
self._robots_lock = asyncio.Lock()
|
||||||
|
|
||||||
# ---------- lifecycle helpers ----------
|
|
||||||
async def _ensure_client(self):
|
async def _ensure_client(self):
|
||||||
|
"""Initialize the HTTP client if not already created."""
|
||||||
if self._client is None:
|
if self._client is None:
|
||||||
self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers)
|
self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
|
"""Close the HTTP client."""
|
||||||
if self._client:
|
if self._client:
|
||||||
await self._client.aclose()
|
await self._client.aclose()
|
||||||
self._client = None
|
self._client = None
|
||||||
|
|
||||||
async def __aenter__(self):
|
async def __aenter__(self):
|
||||||
|
"""Enter the context manager, initializing the HTTP client."""
|
||||||
await self._ensure_client()
|
await self._ensure_client()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||||
|
"""Exit the context manager, closing the HTTP client."""
|
||||||
await self.close()
|
await self.close()
|
||||||
|
|
||||||
# ---------- rate limiting ----------
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=1024)
|
@lru_cache(maxsize=1024)
|
||||||
def _domain_from_url(url: str) -> str:
|
def _domain_from_url(url: str) -> str:
|
||||||
|
"""Extract the domain (netloc) from a URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to parse.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The domain (netloc) of the URL.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
return urlparse(url).netloc
|
return urlparse(url).netloc
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
@ -110,10 +156,24 @@ class BeautifulSoupCrawler:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=1024)
|
@lru_cache(maxsize=1024)
|
||||||
def _get_domain_root(url: str) -> str:
|
def _get_domain_root(url: str) -> str:
|
||||||
|
"""Get the root URL (scheme and netloc) from a URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to parse.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The root URL (e.g., "https://example.com").
|
||||||
|
"""
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
return f"{parsed.scheme}://{parsed.netloc}"
|
return f"{parsed.scheme}://{parsed.netloc}"
|
||||||
|
|
||||||
async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
|
async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
|
||||||
|
"""Enforce rate limiting for requests to the same domain.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to check.
|
||||||
|
crawl_delay: Custom crawl delay in seconds (if any).
|
||||||
|
"""
|
||||||
domain = self._domain_from_url(url)
|
domain = self._domain_from_url(url)
|
||||||
last = self._last_request_time_per_domain.get(domain)
|
last = self._last_request_time_per_domain.get(domain)
|
||||||
delay = crawl_delay if crawl_delay is not None else self.crawl_delay
|
delay = crawl_delay if crawl_delay is not None else self.crawl_delay
|
||||||
|
|
@ -128,9 +188,15 @@ class BeautifulSoupCrawler:
|
||||||
await asyncio.sleep(wait_for)
|
await asyncio.sleep(wait_for)
|
||||||
self._last_request_time_per_domain[domain] = time.time()
|
self._last_request_time_per_domain[domain] = time.time()
|
||||||
|
|
||||||
# ----------- robots.txt handling -----------
|
|
||||||
async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
|
async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
|
||||||
"""Get cached robots.txt data if valid"""
|
"""Get cached robots.txt data if valid.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
domain_root: The root URL (e.g., "https://example.com").
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found.
|
||||||
|
"""
|
||||||
if Protego is None:
|
if Protego is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -140,9 +206,18 @@ class BeautifulSoupCrawler:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
|
async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
|
||||||
"""Fetch and cache robots.txt data"""
|
"""Fetch and cache robots.txt data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
domain_root: The root URL (e.g., "https://example.com").
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
RobotsTxtCache: Cached robots.txt data with crawl delay.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If fetching robots.txt fails.
|
||||||
|
"""
|
||||||
async with self._robots_lock:
|
async with self._robots_lock:
|
||||||
# Check again after acquiring lock
|
|
||||||
cached = await self._get_robots_cache(domain_root)
|
cached = await self._get_robots_cache(domain_root)
|
||||||
if cached:
|
if cached:
|
||||||
return cached
|
return cached
|
||||||
|
|
@ -170,6 +245,14 @@ class BeautifulSoupCrawler:
|
||||||
return cache_entry
|
return cache_entry
|
||||||
|
|
||||||
async def _is_url_allowed(self, url: str) -> bool:
|
async def _is_url_allowed(self, url: str) -> bool:
|
||||||
|
"""Check if a URL is allowed by robots.txt.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to check.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if the URL is allowed, False otherwise.
|
||||||
|
"""
|
||||||
if Protego is None:
|
if Protego is None:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
@ -189,6 +272,14 @@ class BeautifulSoupCrawler:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
async def _get_crawl_delay(self, url: str) -> float:
|
async def _get_crawl_delay(self, url: str) -> float:
|
||||||
|
"""Get the crawl delay for a URL from robots.txt.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to check.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
float: Crawl delay in seconds.
|
||||||
|
"""
|
||||||
if Protego is None:
|
if Protego is None:
|
||||||
return self.crawl_delay
|
return self.crawl_delay
|
||||||
|
|
||||||
|
|
@ -201,8 +292,18 @@ class BeautifulSoupCrawler:
|
||||||
except Exception:
|
except Exception:
|
||||||
return self.crawl_delay
|
return self.crawl_delay
|
||||||
|
|
||||||
# ---------- low-level fetchers ----------
|
|
||||||
async def _fetch_httpx(self, url: str) -> str:
|
async def _fetch_httpx(self, url: str) -> str:
|
||||||
|
"""Fetch a URL using HTTPX with retries.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to fetch.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The HTML content of the page.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If all retry attempts fail.
|
||||||
|
"""
|
||||||
await self._ensure_client()
|
await self._ensure_client()
|
||||||
assert self._client is not None, "HTTP client not initialized"
|
assert self._client is not None, "HTTP client not initialized"
|
||||||
|
|
||||||
|
|
@ -230,6 +331,20 @@ class BeautifulSoupCrawler:
|
||||||
async def _render_with_playwright(
|
async def _render_with_playwright(
|
||||||
self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None
|
self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None
|
||||||
) -> str:
|
) -> str:
|
||||||
|
"""Fetch and render a URL using Playwright for JavaScript content.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to fetch.
|
||||||
|
js_wait: Seconds to wait for JavaScript to load.
|
||||||
|
timeout: Timeout for the request (in seconds, defaults to instance timeout).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The rendered HTML content.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If Playwright is not installed.
|
||||||
|
Exception: If all retry attempts fail.
|
||||||
|
"""
|
||||||
if async_playwright is None:
|
if async_playwright is None:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
|
"Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
|
||||||
|
|
@ -263,10 +378,19 @@ class BeautifulSoupCrawler:
|
||||||
)
|
)
|
||||||
await asyncio.sleep(backoff)
|
await asyncio.sleep(backoff)
|
||||||
|
|
||||||
# ---------- extraction helpers ----------
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
|
def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
|
||||||
"""Normalize extraction rule to ExtractionRule dataclass"""
|
"""Normalize an extraction rule to an ExtractionRule dataclass.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rule: A string (CSS selector) or dict with extraction parameters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ExtractionRule: Normalized extraction rule.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the rule is invalid.
|
||||||
|
"""
|
||||||
if isinstance(rule, str):
|
if isinstance(rule, str):
|
||||||
return ExtractionRule(selector=rule)
|
return ExtractionRule(selector=rule)
|
||||||
if isinstance(rule, dict):
|
if isinstance(rule, dict):
|
||||||
|
|
@ -280,7 +404,18 @@ class BeautifulSoupCrawler:
|
||||||
raise ValueError(f"Invalid extraction rule: {rule}")
|
raise ValueError(f"Invalid extraction rule: {rule}")
|
||||||
|
|
||||||
def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str:
|
def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str:
|
||||||
"""Extract content using BeautifulSoup or lxml xpath"""
|
"""Extract content from HTML using BeautifulSoup or lxml XPath.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
html: The HTML content to extract from.
|
||||||
|
rule: The extraction rule to apply.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The extracted content.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If XPath is used but lxml is not installed.
|
||||||
|
"""
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
if rule.xpath:
|
if rule.xpath:
|
||||||
|
|
@ -325,7 +460,6 @@ class BeautifulSoupCrawler:
|
||||||
return (val or "").strip()
|
return (val or "").strip()
|
||||||
return el.get_text(strip=True)
|
return el.get_text(strip=True)
|
||||||
|
|
||||||
# ---------- public methods ----------
|
|
||||||
async def fetch_with_bs4(
|
async def fetch_with_bs4(
|
||||||
self,
|
self,
|
||||||
urls: Union[str, List[str], Dict[str, Dict[str, Any]]],
|
urls: Union[str, List[str], Dict[str, Dict[str, Any]]],
|
||||||
|
|
@ -335,23 +469,22 @@ class BeautifulSoupCrawler:
|
||||||
playwright_js_wait: float = 0.8,
|
playwright_js_wait: float = 0.8,
|
||||||
join_all_matches: bool = False,
|
join_all_matches: bool = False,
|
||||||
) -> Dict[str, str]:
|
) -> Dict[str, str]:
|
||||||
"""
|
"""Fetch and extract content from URLs using BeautifulSoup or Playwright.
|
||||||
Fetch one or more URLs and extract text using BeautifulSoup (or lxml xpath).
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls: Can be:
|
urls: A single URL, list of URLs, or dict mapping URLs to extraction rules.
|
||||||
- A single URL string
|
extraction_rules: Default extraction rules for string or list URLs.
|
||||||
- A list of URLs (uses extraction_rules for all)
|
use_playwright: If True, use Playwright for JavaScript rendering.
|
||||||
- A dict mapping URL -> extraction_rules (URL-specific rules)
|
playwright_js_wait: Seconds to wait for JavaScript to load.
|
||||||
extraction_rules: Default rules when urls is a string or list
|
join_all_matches: If True, extract all matching elements for each rule.
|
||||||
use_playwright: Whether to use Playwright for JS rendering
|
|
||||||
playwright_js_wait: Wait time after page load for JS
|
|
||||||
join_all_matches: Force all rules to extract all matching elements
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict[url] -> concatenated string of extracted content
|
Dict[str, str]: A dictionary mapping URLs to their extracted content.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If extraction_rules are missing when required or if urls is invalid.
|
||||||
|
Exception: If fetching or extraction fails.
|
||||||
"""
|
"""
|
||||||
# Handle different input formats
|
|
||||||
url_rules_map: Dict[str, Dict[str, Any]] = {}
|
url_rules_map: Dict[str, Dict[str, Any]] = {}
|
||||||
|
|
||||||
if isinstance(urls, str):
|
if isinstance(urls, str):
|
||||||
|
|
@ -364,12 +497,10 @@ class BeautifulSoupCrawler:
|
||||||
for url in urls:
|
for url in urls:
|
||||||
url_rules_map[url] = extraction_rules
|
url_rules_map[url] = extraction_rules
|
||||||
elif isinstance(urls, dict):
|
elif isinstance(urls, dict):
|
||||||
# URL-specific rules
|
|
||||||
url_rules_map = urls
|
url_rules_map = urls
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid urls type: {type(urls)}")
|
raise ValueError(f"Invalid urls type: {type(urls)}")
|
||||||
|
|
||||||
# Normalize all rules
|
|
||||||
normalized_url_rules: Dict[str, List[ExtractionRule]] = {}
|
normalized_url_rules: Dict[str, List[ExtractionRule]] = {}
|
||||||
for url, rules in url_rules_map.items():
|
for url, rules in url_rules_map.items():
|
||||||
normalized_rules = []
|
normalized_rules = []
|
||||||
|
|
@ -388,7 +519,6 @@ class BeautifulSoupCrawler:
|
||||||
logger.warning(f"URL disallowed by robots.txt: {url}")
|
logger.warning(f"URL disallowed by robots.txt: {url}")
|
||||||
return url, ""
|
return url, ""
|
||||||
|
|
||||||
# Fetch (rendered or not)
|
|
||||||
if use_playwright:
|
if use_playwright:
|
||||||
html = await self._render_with_playwright(
|
html = await self._render_with_playwright(
|
||||||
url, js_wait=playwright_js_wait, timeout=self.timeout
|
url, js_wait=playwright_js_wait, timeout=self.timeout
|
||||||
|
|
@ -396,7 +526,6 @@ class BeautifulSoupCrawler:
|
||||||
else:
|
else:
|
||||||
html = await self._fetch_httpx(url)
|
html = await self._fetch_httpx(url)
|
||||||
|
|
||||||
# Extract content using URL-specific rules
|
|
||||||
pieces = []
|
pieces = []
|
||||||
for rule in normalized_url_rules[url]:
|
for rule in normalized_url_rules[url]:
|
||||||
text = self._extract_with_bs4(html, rule)
|
text = self._extract_with_bs4(html, rule)
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,13 @@
|
||||||
from tavily import AsyncTavilyClient
|
"""Utilities for fetching web content using BeautifulSoup or Tavily.
|
||||||
from .bs4_crawler import BeautifulSoupCrawler
|
|
||||||
import os
|
This module provides functions to fetch and extract content from web pages, supporting
|
||||||
from .config import TavilyConfig, SoupCrawlerConfig
|
both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
|
||||||
from typing import Dict, Any, List, Union, Optional, Literal
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, List, Union, Optional, Literal
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
import asyncio
|
from .bs4_crawler import BeautifulSoupCrawler
|
||||||
|
from .config import TavilyConfig, SoupCrawlerConfig
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
@ -16,20 +19,31 @@ async def fetch_page_content(
|
||||||
tavily_config: Optional[TavilyConfig] = None,
|
tavily_config: Optional[TavilyConfig] = None,
|
||||||
soup_crawler_config: Optional[SoupCrawlerConfig] = None,
|
soup_crawler_config: Optional[SoupCrawlerConfig] = None,
|
||||||
) -> Dict[str, Union[str, Dict[str, str]]]:
|
) -> Dict[str, Union[str, Dict[str, str]]]:
|
||||||
"""
|
"""Fetch content from one or more URLs using the specified tool.
|
||||||
Fetch page content using Tavily API if TAVILY_API_KEY is set,
|
|
||||||
otherwise fetch using BeautifulSoupCrawler directly.
|
|
||||||
|
|
||||||
Parameters:
|
This function retrieves web page content using either BeautifulSoup (with custom
|
||||||
urls: single URL or list of URLs
|
extraction rules) or Tavily (API-based scraping). It handles single URLs or lists of
|
||||||
extraction_rules: dict mapping field names -> CSS selector or rule
|
URLs and returns a dictionary mapping URLs to their extracted content.
|
||||||
use_playwright: whether to render JS (BeautifulSoupCrawler)
|
|
||||||
playwright_js_wait: seconds to wait for JS to load
|
Args:
|
||||||
join_all_matches: join all matching elements per rule
|
urls: A single URL (str) or a list of URLs (List[str]) to scrape.
|
||||||
structured: if True, returns structured dict instead of concatenated string (based on extraction_rules field names)
|
preferred_tool: The scraping tool to use ("tavily" or "beautifulsoup").
|
||||||
|
Defaults to "beautifulsoup".
|
||||||
|
tavily_config: Configuration for Tavily API, including API key.
|
||||||
|
Required if preferred_tool is "tavily".
|
||||||
|
soup_crawler_config: Configuration for BeautifulSoup crawler, including
|
||||||
|
extraction rules. Required if preferred_tool is "beautifulsoup" and
|
||||||
|
extraction_rules are needed.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict mapping URL -> extracted string or structured dict
|
Dict[str, Union[str, Dict[str, str]]]: A dictionary mapping each URL to its
|
||||||
|
extracted content (as a string for BeautifulSoup or a dict for Tavily).
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If Tavily API key is missing when using Tavily, or if
|
||||||
|
extraction_rules are not provided when using BeautifulSoup.
|
||||||
|
ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not
|
||||||
|
installed.
|
||||||
"""
|
"""
|
||||||
if preferred_tool == "tavily":
|
if preferred_tool == "tavily":
|
||||||
if tavily_config.api_key is None:
|
if tavily_config.api_key is None:
|
||||||
|
|
@ -43,6 +57,7 @@ async def fetch_page_content(
|
||||||
logger.error(
|
logger.error(
|
||||||
"Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
|
"Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
|
||||||
)
|
)
|
||||||
|
raise
|
||||||
crawler = BeautifulSoupCrawler()
|
crawler = BeautifulSoupCrawler()
|
||||||
extraction_rules = soup_crawler_config.extraction_rules
|
extraction_rules = soup_crawler_config.extraction_rules
|
||||||
if extraction_rules is None:
|
if extraction_rules is None:
|
||||||
|
|
@ -58,20 +73,34 @@ async def fetch_page_content(
|
||||||
return results
|
return results
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error fetching page content: {str(e)}")
|
logger.error(f"Error fetching page content: {str(e)}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
async def fetch_with_tavily(urls: Union[str, List[str]]) -> Dict[str, str]:
|
async def fetch_with_tavily(urls: Union[str, List[str]]) -> Dict[str, str]:
|
||||||
|
"""Fetch content from URLs using the Tavily API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
urls: A single URL (str) or a list of URLs (List[str]) to scrape.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, str]: A dictionary mapping each URL to its raw content as a string.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ImportError: If tavily-python is not installed.
|
||||||
|
Exception: If the Tavily API request fails.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from tavily import AsyncTavilyClient
|
from tavily import AsyncTavilyClient
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.error(
|
logger.error(
|
||||||
"Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
|
"Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
|
||||||
)
|
)
|
||||||
|
raise
|
||||||
client = AsyncTavilyClient()
|
client = AsyncTavilyClient()
|
||||||
results = await client.extract(urls)
|
results = await client.extract(urls)
|
||||||
for failed_result in results.get("failed_results", []):
|
for failed_result in results.get("failed_results", []):
|
||||||
logger.warning(f"Failed to fetch {failed_result}")
|
logger.warning(f"Failed to fetch {failed_result}")
|
||||||
return_results = {}
|
return_results = {}
|
||||||
for results in results.get("results", []):
|
for result in results.get("results", []):
|
||||||
return_results[results["url"]] = results["raw_content"]
|
return_results[result["url"]] = result["raw_content"]
|
||||||
return return_results
|
return return_results
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,10 @@
|
||||||
|
"""Web scraping tasks for storing scraped data in a graph database.
|
||||||
|
|
||||||
|
This module provides functions to scrape web content, create or update WebPage, WebSite,
|
||||||
|
and ScrapingJob data points, and store them in a Kuzu graph database. It supports
|
||||||
|
scheduled scraping tasks and ensures that node updates preserve existing graph edges.
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import hashlib
|
import hashlib
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
@ -22,7 +29,7 @@ try:
|
||||||
|
|
||||||
scheduler = BackgroundScheduler()
|
scheduler = BackgroundScheduler()
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("Please install apscheduler by pip install APScheduler >=3.10")
|
raise ImportError("Please install apscheduler by pip install APScheduler>=3.10")
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
@ -37,6 +44,28 @@ async def cron_web_scraper_task(
|
||||||
tavily_config: TavilyConfig = None,
|
tavily_config: TavilyConfig = None,
|
||||||
job_name: str = "scraping",
|
job_name: str = "scraping",
|
||||||
):
|
):
|
||||||
|
"""Schedule or run a web scraping task.
|
||||||
|
|
||||||
|
This function schedules a recurring web scraping task using APScheduler or runs it
|
||||||
|
immediately if no schedule is provided. It delegates to web_scraper_task for actual
|
||||||
|
scraping and graph storage.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: A single URL or list of URLs to scrape.
|
||||||
|
schedule: A cron expression for scheduling (e.g., "0 0 * * *"). If None, runs immediately.
|
||||||
|
extraction_rules: Dictionary of extraction rules for BeautifulSoup (e.g., CSS selectors).
|
||||||
|
tavily_api_key: API key for Tavily. Defaults to TAVILY_API_KEY environment variable.
|
||||||
|
soup_crawler_config: Configuration for BeautifulSoup crawler.
|
||||||
|
tavily_config: Configuration for Tavily API.
|
||||||
|
job_name: Name of the scraping job. Defaults to "scraping".
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Any: The result of web_scraper_task if run immediately, or None if scheduled.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the schedule is an invalid cron expression.
|
||||||
|
ImportError: If APScheduler is not installed.
|
||||||
|
"""
|
||||||
now = datetime.now()
|
now = datetime.now()
|
||||||
job_name = job_name or f"scrape_{now.strftime('%Y%m%d_%H%M%S')}"
|
job_name = job_name or f"scrape_{now.strftime('%Y%m%d_%H%M%S')}"
|
||||||
if schedule:
|
if schedule:
|
||||||
|
|
@ -89,10 +118,29 @@ async def web_scraper_task(
|
||||||
tavily_config: TavilyConfig = None,
|
tavily_config: TavilyConfig = None,
|
||||||
job_name: str = None,
|
job_name: str = None,
|
||||||
):
|
):
|
||||||
"""
|
"""Scrape URLs and store data points in a Kuzu graph database.
|
||||||
Scrapes one or more URLs and returns WebPage, WebSite, and ScrapingJob data points.
|
|
||||||
Unique IDs are assigned to each WebPage, WebSite, and ScrapingJob.
|
This function scrapes content from the provided URLs, creates or updates WebPage,
|
||||||
Includes a description field summarizing other fields for each data point.
|
WebSite, and ScrapingJob data points, and stores them in a Kuzu graph database.
|
||||||
|
Each data point includes a description field summarizing its attributes. It creates
|
||||||
|
'is_scraping' (ScrapingJob to WebSite) and 'is_part_of' (WebPage to WebSite)
|
||||||
|
relationships, preserving existing edges during node updates.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: A single URL or list of URLs to scrape.
|
||||||
|
schedule: A cron expression for scheduling (e.g., "0 0 * * *"). If None, runs once.
|
||||||
|
extraction_rules: Dictionary of extraction rules for BeautifulSoup (e.g., CSS selectors).
|
||||||
|
tavily_api_key: API key for Tavily. Defaults to TAVILY_API_KEY environment variable.
|
||||||
|
soup_crawler_config: Configuration for BeautifulSoup crawler.
|
||||||
|
tavily_config: Configuration for Tavily API.
|
||||||
|
job_name: Name of the scraping job. Defaults to a timestamp-based name.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Any: The graph data returned by the graph database.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
TypeError: If neither tavily_config nor soup_crawler_config is provided.
|
||||||
|
Exception: If fetching content or database operations fail.
|
||||||
"""
|
"""
|
||||||
await setup()
|
await setup()
|
||||||
graph_db = await get_graph_engine()
|
graph_db = await get_graph_engine()
|
||||||
|
|
@ -260,7 +308,7 @@ async def web_scraper_task(
|
||||||
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||||
edge_mapping.append(
|
edge_mapping.append(
|
||||||
(
|
(
|
||||||
webpage.id, # Corrected: WebPage is the source, WebSite is the target
|
webpage.id,
|
||||||
websites_dict[base_url].id,
|
websites_dict[base_url].id,
|
||||||
"is_part_of",
|
"is_part_of",
|
||||||
{
|
{
|
||||||
|
|
@ -280,8 +328,20 @@ async def web_scraper_task(
|
||||||
|
|
||||||
|
|
||||||
def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawler_config):
|
def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawler_config):
|
||||||
"""
|
"""Validate and configure arguments for web_scraper_task.
|
||||||
Checking if the right argument are given, if not TypeError will be raised.
|
|
||||||
|
Args:
|
||||||
|
tavily_api_key: API key for Tavily.
|
||||||
|
extraction_rules: Extraction rules for BeautifulSoup.
|
||||||
|
tavily_config: Configuration for Tavily API.
|
||||||
|
soup_crawler_config: Configuration for BeautifulSoup crawler.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[SoupCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
|
||||||
|
tavily_config, and preferred_tool ("tavily" or "beautifulsoup").
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
TypeError: If neither tavily_config nor soup_crawler_config is provided.
|
||||||
"""
|
"""
|
||||||
preferred_tool = "beautifulsoup"
|
preferred_tool = "beautifulsoup"
|
||||||
|
|
||||||
|
|
@ -302,7 +362,19 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
|
||||||
return soup_crawler_config, tavily_config, preferred_tool
|
return soup_crawler_config, tavily_config, preferred_tool
|
||||||
|
|
||||||
|
|
||||||
def get_path_after_base(base_url, url):
|
def get_path_after_base(base_url: str, url: str) -> str:
|
||||||
|
"""Extract the path after the base URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
base_url: The base URL (e.g., "https://example.com").
|
||||||
|
url: The full URL to extract the path from.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The path after the base URL, with leading slashes removed.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the base URL and target URL are from different domains.
|
||||||
|
"""
|
||||||
parsed_base = urlparse(base_url)
|
parsed_base = urlparse(base_url)
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue