From b5190c90f1efc8256ae7405728e766c06b97e963 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 16:51:36 +0100 Subject: [PATCH] add logging for crawling status; add cap to the crawl_delay from robots.txt - Not advising to use the cap, but giving an option to be able to configure it --- .../loaders/external/web_url_loader.py | 13 ++++ cognee/tasks/web_scraper/bs4_crawler.py | 66 ++++++++++++++++++- cognee/tasks/web_scraper/config.py | 3 + cognee/tasks/web_scraper/utils.py | 41 ++++++++++-- 4 files changed, 116 insertions(+), 7 deletions(-) diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 491428c82..1ecf82171 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -4,6 +4,9 @@ from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError from cognee.modules.ingestion import save_data_to_file from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig +from cognee.shared.logging_utils import get_logger + +logger = get_logger() class WebUrlLoader(LoaderInterface): @@ -100,16 +103,26 @@ class WebUrlLoader(LoaderInterface): message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." ) + logger.info(f"Starting web URL crawling for: {file_path}") + logger.info(f"Using scraping tool: {preferred_tool}") + data = await fetch_page_content( file_path, preferred_tool=preferred_tool, tavily_config=_tavily_config, soup_crawler_config=_soup_config, ) + + logger.info(f"Successfully fetched content from {len(data)} URL(s)") + logger.info("Processing and concatenating fetched content") + content = "" for key, value in data.items(): content += f"{key}:\n{value}\n\n" + + logger.info(f"Saving content to file (total size: {len(content)} characters)") stored_path = await save_data_to_file(content) + logger.info(f"Successfully saved content to: {stored_path}") return stored_path except IngestionError: diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py index 0fbff4808..400287e08 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/tasks/web_scraper/bs4_crawler.py @@ -75,6 +75,7 @@ class BeautifulSoupCrawler: Attributes: concurrency: Number of concurrent requests allowed. crawl_delay: Minimum seconds between requests to the same domain. + max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). timeout: Per-request timeout in seconds. max_retries: Number of retries for failed requests. retry_delay_factor: Multiplier for exponential backoff on retries. @@ -87,6 +88,7 @@ class BeautifulSoupCrawler: *, concurrency: int = 5, crawl_delay: float = 0.5, + max_crawl_delay: Optional[float] = 10.0, timeout: float = 15.0, max_retries: int = 2, retry_delay_factor: float = 0.5, @@ -98,6 +100,7 @@ class BeautifulSoupCrawler: Args: concurrency: Number of concurrent requests allowed. crawl_delay: Minimum seconds between requests to the same domain. + max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). timeout: Per-request timeout in seconds. max_retries: Number of retries for failed requests. retry_delay_factor: Multiplier for exponential backoff on retries. @@ -107,6 +110,7 @@ class BeautifulSoupCrawler: self.concurrency = concurrency self._sem = asyncio.Semaphore(concurrency) self.crawl_delay = crawl_delay + self.max_crawl_delay = max_crawl_delay self.timeout = timeout self.max_retries = max_retries self.retry_delay_factor = retry_delay_factor @@ -183,7 +187,11 @@ class BeautifulSoupCrawler: elapsed = time.time() - last wait_for = delay - elapsed if wait_for > 0: + logger.info( + f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)" + ) await asyncio.sleep(wait_for) + logger.info(f"Rate limit wait completed for {url}") self._last_request_time_per_domain[domain] = time.time() async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]: @@ -236,7 +244,16 @@ class BeautifulSoupCrawler: crawl_delay = self.crawl_delay if protego: delay = protego.crawl_delay(agent) or protego.crawl_delay("*") - crawl_delay = delay if delay else self.crawl_delay + if delay: + # Apply max_crawl_delay cap if configured + if self.max_crawl_delay is not None and delay > self.max_crawl_delay: + logger.warning( + f"robots.txt specifies crawl_delay={delay}s for {domain_root}, " + f"capping to max_crawl_delay={self.max_crawl_delay}s" + ) + crawl_delay = self.max_crawl_delay + else: + crawl_delay = delay cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay) self._robots_cache[domain_root] = cache_entry @@ -307,12 +324,16 @@ class BeautifulSoupCrawler: attempt = 0 crawl_delay = await self._get_crawl_delay(url) + logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}") while True: try: await self._respect_rate_limit(url, crawl_delay) resp = await self._client.get(url) resp.raise_for_status() + logger.info( + f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)" + ) return resp.text except Exception as exc: attempt += 1 @@ -347,22 +368,35 @@ class BeautifulSoupCrawler: raise RuntimeError( "Playwright is not installed. Install with `pip install playwright` and run `playwright install`." ) + + timeout_val = timeout or self.timeout + logger.info( + f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}" + ) + attempt = 0 while True: try: async with async_playwright() as p: + logger.info(f"Launching headless Chromium browser for {url}") browser = await p.chromium.launch(headless=True) try: context = await browser.new_context() page = await context.new_page() + logger.info(f"Navigating to {url} and waiting for network idle") await page.goto( url, wait_until="networkidle", - timeout=int((timeout or self.timeout) * 1000), + timeout=int(timeout_val * 1000), ) if js_wait: + logger.info(f"Waiting {js_wait}s for JavaScript to execute") await asyncio.sleep(js_wait) - return await page.content() + content = await page.content() + logger.info( + f"Successfully rendered {url} with Playwright (size={len(content)} bytes)" + ) + return content finally: await browser.close() except Exception as exc: @@ -498,6 +532,10 @@ class BeautifulSoupCrawler: else: raise ValueError(f"Invalid urls type: {type(urls)}") + logger.info( + f"Preparing to fetch {len(url_rules_map)} URL(s) with {len(extraction_rules) if extraction_rules else 0} extraction rule(s)" + ) + normalized_url_rules: Dict[str, List[ExtractionRule]] = {} for url, rules in url_rules_map.items(): normalized_rules = [] @@ -508,21 +546,36 @@ class BeautifulSoupCrawler: normalized_rules.append(r) normalized_url_rules[url] = normalized_rules + logger.info(f"Normalized extraction rules for {len(normalized_url_rules)} URL(s)") + async def _task(url: str): async with self._sem: try: + logger.info(f"Processing URL: {url}") + + # Check robots.txt allowed = await self._is_url_allowed(url) if not allowed: logger.warning(f"URL disallowed by robots.txt: {url}") return url, "" + logger.info(f"Robots.txt check passed for {url}") + + # Fetch HTML if use_playwright: + logger.info( + f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)" + ) html = await self._render_with_playwright( url, js_wait=playwright_js_wait, timeout=self.timeout ) else: + logger.info(f"Fetching {url} with httpx") html = await self._fetch_httpx(url) + logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") + + # Extract content pieces = [] for rule in normalized_url_rules[url]: text = self._extract_with_bs4(html, rule) @@ -530,17 +583,24 @@ class BeautifulSoupCrawler: pieces.append(text) concatenated = " ".join(pieces).strip() + logger.info(f"Extracted {len(concatenated)} characters from {url}") return url, concatenated except Exception as e: logger.error(f"Error processing {url}: {e}") return url, "" + logger.info(f"Creating {len(url_rules_map)} async tasks for concurrent fetching") tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()] results = {} + completed = 0 + total = len(tasks) for coro in asyncio.as_completed(tasks): url, text = await coro results[url] = text + completed += 1 + logger.info(f"Progress: {completed}/{total} URLs processed") + logger.info(f"Completed fetching all {len(results)} URL(s)") return results diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index 2ee43ed32..ac470daa9 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -13,6 +13,9 @@ class TavilyConfig(BaseModel): class SoupCrawlerConfig(BaseModel): concurrency: int = 5 crawl_delay: float = 0.5 + max_crawl_delay: Optional[float] = ( + 10.0 # Maximum crawl delay to respect from robots.txt (None = no limit) + ) timeout: float = 15.0 max_retries: int = 2 retry_delay_factor: float = 0.5 diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 6d094f423..a32b6848c 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -45,9 +45,13 @@ async def fetch_page_content( ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not installed. """ + url_list = [urls] if isinstance(urls, str) else urls + logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}") + if preferred_tool == "tavily": if not tavily_config or tavily_config.api_key is None: raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily") + logger.info("Using Tavily API for content extraction") return await fetch_with_tavily(urls, tavily_config) if preferred_tool == "beautifulsoup": @@ -60,10 +64,17 @@ async def fetch_page_content( raise ImportError if not soup_crawler_config or soup_crawler_config.extraction_rules is None: raise ValueError("extraction_rules must be provided when not using Tavily") + + logger.info("Using BeautifulSoup for content extraction") extraction_rules = soup_crawler_config.extraction_rules + logger.info( + f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s" + ) + crawler = BeautifulSoupCrawler( concurrency=soup_crawler_config.concurrency, crawl_delay=soup_crawler_config.crawl_delay, + max_crawl_delay=soup_crawler_config.max_crawl_delay, timeout=soup_crawler_config.timeout, max_retries=soup_crawler_config.max_retries, retry_delay_factor=soup_crawler_config.retry_delay_factor, @@ -71,6 +82,9 @@ async def fetch_page_content( robots_cache_ttl=soup_crawler_config.robots_cache_ttl, ) try: + logger.info( + f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})" + ) results = await crawler.fetch_with_bs4( urls, extraction_rules, @@ -78,11 +92,13 @@ async def fetch_page_content( playwright_js_wait=soup_crawler_config.playwright_js_wait, join_all_matches=soup_crawler_config.join_all_matches, ) + logger.info(f"Successfully fetched content from {len(results)} URL(s)") return results except Exception as e: logger.error(f"Error fetching page content: {str(e)}") raise finally: + logger.info("Closing BeautifulSoup crawler") await crawler.close() @@ -108,19 +124,36 @@ async def fetch_with_tavily( "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0" ) raise + + url_list = [urls] if isinstance(urls, str) else urls + extract_depth = tavily_config.extract_depth if tavily_config else "basic" + timeout = tavily_config.timeout if tavily_config else 10 + + logger.info( + f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s" + ) client = AsyncTavilyClient( api_key=tavily_config.api_key if tavily_config else None, proxies=tavily_config.proxies if tavily_config else None, ) + + logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)") results = await client.extract( urls, format="text", - extract_depth=tavily_config.extract_depth if tavily_config else "basic", - timeout=tavily_config.timeout if tavily_config else 10, + extract_depth=extract_depth, + timeout=timeout, ) - for failed_result in results.get("failed_results", []): - logger.warning(f"Failed to fetch {failed_result}") + + failed_count = len(results.get("failed_results", [])) + if failed_count > 0: + logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)") + for failed_result in results.get("failed_results", []): + logger.warning(f"Failed to fetch {failed_result}") + return_results = {} for result in results.get("results", []): return_results[result["url"]] = result["raw_content"] + + logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily") return return_results