Added support for multiple base_url extraction

This commit is contained in:
Geoff-Robin 2025-10-05 20:13:44 +05:30
parent 77ea7c4b1d
commit f148b1df89
2 changed files with 228 additions and 122 deletions

View file

@ -1,6 +1,6 @@
from uuid import UUID from uuid import UUID
import os import os
from typing import Union, BinaryIO, List, Optional, Dict, Literal from typing import Union, BinaryIO, List, Optional, Dict, Any
from cognee.modules.users.models import User from cognee.modules.users.models import User
from cognee.modules.pipelines import Task, run_pipeline from cognee.modules.pipelines import Task, run_pipeline
@ -30,7 +30,7 @@ async def add(
dataset_id: Optional[UUID] = None, dataset_id: Optional[UUID] = None,
preferred_loaders: List[str] = None, preferred_loaders: List[str] = None,
incremental_loading: bool = True, incremental_loading: bool = True,
extraction_rules: Optional[Dict[str, str]] = None, extraction_rules: Optional[Dict[str, Any]] = None,
tavily_config: Optional[TavilyConfig] = None, tavily_config: Optional[TavilyConfig] = None,
soup_crawler_config: Optional[SoupCrawlerConfig] = None, soup_crawler_config: Optional[SoupCrawlerConfig] = None,
): ):

View file

@ -2,6 +2,8 @@ import asyncio
import time import time
from typing import Union, List, Dict, Any, Optional from typing import Union, List, Dict, Any, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
from dataclasses import dataclass, field
from functools import lru_cache
import httpx import httpx
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -24,6 +26,26 @@ except ImportError:
Protego = None Protego = None
@dataclass
class ExtractionRule:
    """Normalized extraction rule describing how to pull one field from an HTML page.

    Either a CSS ``selector`` or an ``xpath`` expression identifies the target
    elements; ``attr`` optionally switches extraction from element text to an
    attribute value.
    """

    # CSS selector matched with BeautifulSoup's select()/select_one().
    selector: Optional[str] = None
    # XPath expression evaluated with lxml — alternative to `selector`.
    xpath: Optional[str] = None
    # If set, extract this attribute's value instead of the element text.
    attr: Optional[str] = None
    # True -> collect every match; False -> first match only.
    # NOTE: field name shadows the builtin `all` but is scoped to the dataclass.
    all: bool = False
    # Separator used when joining multiple extracted pieces into one string.
    join_with: str = " "
@dataclass
class RobotsTxtCache:
    """Cached, parsed robots.txt data for one domain root."""

    # Parsed Protego robots.txt object; None means "no usable robots.txt".
    protego: Any
    # Effective crawl delay (seconds) derived from robots.txt or the crawler default.
    crawl_delay: float
    # Creation time (epoch seconds); compared against the TTL for expiry.
    timestamp: float = field(default_factory=time.time)
class BeautifulSoupCrawler: class BeautifulSoupCrawler:
def __init__( def __init__(
self, self,
@ -34,6 +56,7 @@ class BeautifulSoupCrawler:
max_retries: int = 2, max_retries: int = 2,
retry_delay_factor: float = 0.5, retry_delay_factor: float = 0.5,
headers: Optional[Dict[str, str]] = None, headers: Optional[Dict[str, str]] = None,
robots_cache_ttl: float = 3600.0, # Cache robots.txt for 1 hour
): ):
""" """
concurrency: number of concurrent requests allowed concurrency: number of concurrent requests allowed
@ -42,6 +65,7 @@ class BeautifulSoupCrawler:
max_retries: number of retries on network errors max_retries: number of retries on network errors
retry_delay_factor: multiplier for exponential retry failure delay retry_delay_factor: multiplier for exponential retry failure delay
headers: default headers for requests headers: default headers for requests
robots_cache_ttl: TTL for robots.txt cache in seconds
""" """
self.concurrency = concurrency self.concurrency = concurrency
self._sem = asyncio.Semaphore(concurrency) self._sem = asyncio.Semaphore(concurrency)
@ -50,8 +74,12 @@ class BeautifulSoupCrawler:
self.max_retries = max_retries self.max_retries = max_retries
self.retry_delay_factor = retry_delay_factor self.retry_delay_factor = retry_delay_factor
self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"} self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
self.robots_cache_ttl = robots_cache_ttl
self._last_request_time_per_domain: Dict[str, float] = {} self._last_request_time_per_domain: Dict[str, float] = {}
self._robots_cache: Dict[str, RobotsTxtCache] = {}
self._client: Optional[httpx.AsyncClient] = None self._client: Optional[httpx.AsyncClient] = None
self._robots_lock = asyncio.Lock()
# ---------- lifecycle helpers ---------- # ---------- lifecycle helpers ----------
async def _ensure_client(self): async def _ensure_client(self):
@ -63,20 +91,37 @@ class BeautifulSoupCrawler:
await self._client.aclose() await self._client.aclose()
self._client = None self._client = None
async def __aenter__(self):
    """Async context manager entry: lazily create the shared HTTP client."""
    await self._ensure_client()
    return self

async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Async context manager exit: close the shared HTTP client."""
    await self.close()
# ---------- rate limiting ---------- # ---------- rate limiting ----------
@staticmethod
@lru_cache(maxsize=1024)
def _domain_from_url(url: str) -> str:
    """Return the network location (netloc) of *url*.

    Memoized because the crawler resolves the domain of every request when
    enforcing per-domain rate limits. Falls back to the raw URL string when
    parsing fails, so each unparseable URL still gets its own rate bucket.
    """
    try:
        return urlparse(url).netloc
    except Exception:
        # urlparse rarely raises; returning the input keeps rate limiting usable.
        return url
@staticmethod
@lru_cache(maxsize=1024)
def _get_domain_root(url: str) -> str:
    """Return the scheme+netloc root of *url*, e.g. "https://example.com".

    Memoized; the result is used as the per-domain key for the robots.txt
    cache. NOTE(review): unlike `_domain_from_url` there is no try/except
    here, so a malformed URL propagates the parse error — confirm intended.
    """
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}"
async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None): async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
domain = self._domain_from_url(url) domain = self._domain_from_url(url)
last = self._last_request_time_per_domain.get(domain) last = self._last_request_time_per_domain.get(domain)
delay = crawl_delay if crawl_delay is not None else self.crawl_delay delay = crawl_delay if crawl_delay is not None else self.crawl_delay
if last is None: if last is None:
self._last_request_time_per_domain[domain] = time.time() self._last_request_time_per_domain[domain] = time.time()
return return
elapsed = time.time() - last elapsed = time.time() - last
wait_for = delay - elapsed wait_for = delay - elapsed
if wait_for > 0: if wait_for > 0:
@ -84,30 +129,75 @@ class BeautifulSoupCrawler:
self._last_request_time_per_domain[domain] = time.time() self._last_request_time_per_domain[domain] = time.time()
# ----------- robots.txt handling ----------- # ----------- robots.txt handling -----------
async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
    """Return cached robots.txt data for *domain_root* if still within TTL.

    Returns None when Protego is unavailable, when no entry exists, or when
    the entry is older than `self.robots_cache_ttl` seconds.
    """
    if Protego is None:
        return None
    cached = self._robots_cache.get(domain_root)
    if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl:
        return cached
    return None
async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
    """Fetch, parse, and cache robots.txt for *domain_root*.

    Serialized behind `self._robots_lock` so concurrent tasks for the same
    domain fetch robots.txt only once; the cache is re-checked after the
    lock is acquired (double-checked pattern). Any fetch failure is treated
    as "no robots.txt" (empty content), i.e. everything allowed.
    """
    async with self._robots_lock:
        # Another task may have populated the cache while we waited on the lock.
        cached = await self._get_robots_cache(domain_root)
        if cached:
            return cached

        robots_url = f"{domain_root}/robots.txt"
        try:
            await self._ensure_client()
            # Rate-limit with the crawler default since the site's own
            # crawl-delay is not known until robots.txt is parsed.
            await self._respect_rate_limit(robots_url, self.crawl_delay)
            resp = await self._client.get(robots_url, timeout=5.0)
            content = resp.text if resp.status_code == 200 else ""
        except Exception as e:
            logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}")
            content = ""

        # protego=None marks "no usable robots.txt" -> callers allow everything.
        protego = Protego.parse(content) if content.strip() else None
        agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
        crawl_delay = self.crawl_delay
        if protego:
            # Prefer the agent-specific delay, then the wildcard rule;
            # fall back to the crawler default when robots.txt sets none.
            delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
            crawl_delay = delay if delay else self.crawl_delay

        cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
        self._robots_cache[domain_root] = cache_entry
        return cache_entry
async def _is_url_allowed(self, url: str) -> bool:
    """Return True if robots.txt permits fetching *url*.

    Fails open: when Protego is not installed, when the site has no usable
    robots.txt, or when any error occurs, the URL is treated as allowed.
    """
    if Protego is None:
        return True
    try:
        domain_root = self._get_domain_root(url)
        cache = await self._get_robots_cache(domain_root)
        if cache is None:
            cache = await self._fetch_and_cache_robots(domain_root)
        if cache.protego is None:
            # No robots.txt (or unfetchable) -> allowed by convention.
            return True
        agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
        # Allowed if either our specific user agent or the wildcard may fetch.
        return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url)
    except Exception as e:
        logger.debug(f"Error checking robots.txt for {url}: {e}")
        return True
async def _get_crawl_delay(self, url: str) -> float:
    """Return the effective crawl delay (seconds) for *url*'s domain.

    Reads the cached robots.txt data (fetching and caching it on a miss);
    any failure falls back to the crawler-wide default `self.crawl_delay`.
    """
    if Protego is None:
        return self.crawl_delay
    try:
        domain_root = self._get_domain_root(url)
        cache = await self._get_robots_cache(domain_root)
        if cache is None:
            cache = await self._fetch_and_cache_robots(domain_root)
        return cache.crawl_delay
    except Exception:
        return self.crawl_delay
@ -115,34 +205,17 @@ class BeautifulSoupCrawler:
async def _fetch_httpx(self, url: str) -> str: async def _fetch_httpx(self, url: str) -> str:
await self._ensure_client() await self._ensure_client()
assert self._client is not None, "HTTP client not initialized" assert self._client is not None, "HTTP client not initialized"
attempt = 0
parsed = urlparse(url)
domain_root = f"{parsed.scheme}://{parsed.netloc}"
# Handle robots.txt separately (no recursive crawl delay call) attempt = 0
is_robot = url.lower().endswith("/robots.txt") crawl_delay = await self._get_crawl_delay(url)
while True: while True:
try: try:
# Only get crawl delay for non-robots.txt pages
crawl_delay = self.crawl_delay
if not is_robot:
try:
crawl_delay = await self._get_crawl_delay(domain_root)
except Exception as e:
logger.debug(f"Failed to fetch crawl delay for {domain_root}: {e}")
await self._respect_rate_limit(url, crawl_delay) await self._respect_rate_limit(url, crawl_delay)
resp = await self._client.get(url) resp = await self._client.get(url)
resp.raise_for_status() resp.raise_for_status()
return resp.text return resp.text
except Exception as exc: except Exception as exc:
# Special case: if robots.txt failed, just return empty string
if is_robot:
logger.warning(f"Robots.txt not found or inaccessible at {url}: {exc}")
return ""
attempt += 1 attempt += 1
if attempt > self.max_retries: if attempt > self.max_retries:
logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}") logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}")
@ -191,73 +264,72 @@ class BeautifulSoupCrawler:
await asyncio.sleep(backoff) await asyncio.sleep(backoff)
# ---------- extraction helpers ---------- # ---------- extraction helpers ----------
@staticmethod
def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
    """Normalize a user-supplied extraction rule into an ExtractionRule.

    A bare string is shorthand for a CSS selector; a dict may provide
    `selector`, `xpath`, `attr`, `all`, and `join_with` keys (unset keys
    take the dataclass defaults).

    Raises:
        ValueError: if *rule* is neither a string nor a dict.
    """
    if isinstance(rule, str):
        return ExtractionRule(selector=rule)
    if isinstance(rule, dict):
        return ExtractionRule(
            selector=rule.get("selector"),
            xpath=rule.get("xpath"),
            attr=rule.get("attr"),
            all=bool(rule.get("all", False)),
            join_with=rule.get("join_with", " "),
        )
    raise ValueError(f"Invalid extraction rule: {rule}")
def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str:
    """Extract the text for one rule from *html*.

    XPath rules are evaluated with lxml; CSS-selector rules with
    BeautifulSoup. Returns "" when nothing matches or when the rule has
    neither an xpath nor a selector.

    Raises:
        RuntimeError: if an xpath rule is given but lxml is not installed.
    """
    if rule.xpath:
        try:
            from lxml import html as lxml_html
        except ImportError:
            raise RuntimeError(
                "XPath requested but lxml is not available. Install lxml or use CSS selectors."
            )
        doc = lxml_html.fromstring(html)
        nodes = doc.xpath(rule.xpath)
        texts = []
        for n in nodes:
            # Element nodes expose text_content(); attribute/string results do not.
            if hasattr(n, "text_content"):
                texts.append(n.text_content().strip())
            else:
                texts.append(str(n).strip())
        return rule.join_with.join(t for t in texts if t)

    if not rule.selector:
        return ""

    # Parse with BeautifulSoup only on the CSS path — the XPath branch above
    # never uses the soup, so building it up front was wasted work.
    soup = BeautifulSoup(html, "html.parser")
    if rule.all:
        pieces = []
        for el in soup.select(rule.selector):
            if rule.attr:
                val = el.get(rule.attr)
                if val:
                    pieces.append(val.strip())
            else:
                text = el.get_text(strip=True)
                if text:
                    pieces.append(text)
        return rule.join_with.join(pieces).strip()

    el = soup.select_one(rule.selector)
    if el is None:
        return ""
    if rule.attr:
        val = el.get(rule.attr)
        return (val or "").strip()
    return el.get_text(strip=True)
# ---------- public methods ---------- # ---------- public methods ----------
async def fetch_with_bs4( async def fetch_with_bs4(
self, self,
urls: Union[str, List[str]], urls: Union[str, List[str], Dict[str, Dict[str, Any]]],
extraction_rules: Dict[str, Any], extraction_rules: Optional[Dict[str, Any]] = None,
*, *,
use_playwright: bool = False, use_playwright: bool = False,
playwright_js_wait: float = 0.8, playwright_js_wait: float = 0.8,
@ -265,50 +337,84 @@ class BeautifulSoupCrawler:
) -> Dict[str, str]: ) -> Dict[str, str]:
""" """
Fetch one or more URLs and extract text using BeautifulSoup (or lxml xpath). Fetch one or more URLs and extract text using BeautifulSoup (or lxml xpath).
Returns: dict[url] -> concatenated string of extracted content.
"""
if isinstance(urls, str):
urls = [urls]
# normalize rules Args:
normalized_rules = {} urls: Can be:
for field, rule in extraction_rules.items(): - A single URL string
r = self._normalize_rule(rule) - A list of URLs (uses extraction_rules for all)
if join_all_matches: - A dict mapping URL -> extraction_rules (URL-specific rules)
r["all"] = True extraction_rules: Default rules when urls is a string or list
normalized_rules[field] = r use_playwright: Whether to use Playwright for JS rendering
playwright_js_wait: Wait time after page load for JS
join_all_matches: Force all rules to extract all matching elements
Returns:
dict[url] -> concatenated string of extracted content
"""
# Handle different input formats
url_rules_map: Dict[str, Dict[str, Any]] = {}
if isinstance(urls, str):
if not extraction_rules:
raise ValueError("extraction_rules required when urls is a string")
url_rules_map[urls] = extraction_rules
elif isinstance(urls, list):
if not extraction_rules:
raise ValueError("extraction_rules required when urls is a list")
for url in urls:
url_rules_map[url] = extraction_rules
elif isinstance(urls, dict):
# URL-specific rules
url_rules_map = urls
else:
raise ValueError(f"Invalid urls type: {type(urls)}")
# Normalize all rules
normalized_url_rules: Dict[str, List[ExtractionRule]] = {}
for url, rules in url_rules_map.items():
normalized_rules = []
for field, rule in rules.items():
r = self._normalize_rule(rule)
if join_all_matches:
r.all = True
normalized_rules.append(r)
normalized_url_rules[url] = normalized_rules
async def _task(url: str): async def _task(url: str):
async with self._sem: async with self._sem:
allowed = await self._is_url_allowed(url) try:
if not allowed: allowed = await self._is_url_allowed(url)
logger.warning(f"URL disallowed by robots.txt: {url}") if not allowed:
logger.warning(f"URL disallowed by robots.txt: {url}")
return url, ""
# Fetch (rendered or not)
if use_playwright:
html = await self._render_with_playwright(
url, js_wait=playwright_js_wait, timeout=self.timeout
)
else:
html = await self._fetch_httpx(url)
# Extract content using URL-specific rules
pieces = []
for rule in normalized_url_rules[url]:
text = self._extract_with_bs4(html, rule)
if text:
pieces.append(text)
concatenated = " ".join(pieces).strip()
return url, concatenated
except Exception as e:
logger.error(f"Error processing {url}: {e}")
return url, "" return url, ""
# fetch (rendered or not) tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()]
if use_playwright:
html = await self._render_with_playwright(
url, js_wait=playwright_js_wait, timeout=self.timeout
)
else:
html = await self._fetch_httpx(url)
pieces = []
for field, rule in normalized_rules.items():
text = self._extract_with_bs4(html, rule)
if text:
pieces.append(text)
concatenated = " ".join(pieces).strip()
return url, concatenated
tasks = [asyncio.create_task(_task(u)) for u in urls]
results = {} results = {}
for coro in asyncio.as_completed(tasks): for coro in asyncio.as_completed(tasks):
try: url, text = await coro
url, text = await coro
except Exception as e:
results[url] = ""
logger.error(f"Error processing {url}: {e}")
continue
results[url] = text results[url] = text
return results
return results