From b5190c90f1efc8256ae7405728e766c06b97e963 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Fri, 17 Oct 2025 16:51:36 +0100
Subject: [PATCH] add logging for crawling status; add cap to the crawl_delay
 from robots.txt

- Using the cap is not recommended, but it is exposed as a configurable option (max_crawl_delay; None = no limit)
---
 .../loaders/external/web_url_loader.py        | 13 ++++
 cognee/tasks/web_scraper/bs4_crawler.py       | 66 ++++++++++++++++++-
 cognee/tasks/web_scraper/config.py            |  3 +
 cognee/tasks/web_scraper/utils.py             | 41 ++++++++++--
 4 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py
index 491428c82..1ecf82171 100644
--- a/cognee/infrastructure/loaders/external/web_url_loader.py
+++ b/cognee/infrastructure/loaders/external/web_url_loader.py
@@ -4,6 +4,9 @@ from typing import List
 from cognee.modules.ingestion.exceptions.exceptions import IngestionError
 from cognee.modules.ingestion import save_data_to_file
 from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
 
 
 class WebUrlLoader(LoaderInterface):
@@ -100,16 +103,26 @@ class WebUrlLoader(LoaderInterface):
                     message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
                 )
 
+            logger.info(f"Starting web URL crawling for: {file_path}")
+            logger.info(f"Using scraping tool: {preferred_tool}")
+
             data = await fetch_page_content(
                 file_path,
                 preferred_tool=preferred_tool,
                 tavily_config=_tavily_config,
                 soup_crawler_config=_soup_config,
             )
+
+            logger.info(f"Successfully fetched content from {len(data)} URL(s)")
+            logger.info("Processing and concatenating fetched content")
+
             content = ""
             for key, value in data.items():
                 content += f"{key}:\n{value}\n\n"
+
+            logger.info(f"Saving content to file (total size: {len(content)} characters)")
             stored_path = await save_data_to_file(content)
+            logger.info(f"Successfully saved content to: {stored_path}")
 
             return stored_path
         except IngestionError:
diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py
index 0fbff4808..400287e08 100644
--- a/cognee/tasks/web_scraper/bs4_crawler.py
+++ b/cognee/tasks/web_scraper/bs4_crawler.py
@@ -75,6 +75,7 @@ class BeautifulSoupCrawler:
     Attributes:
         concurrency: Number of concurrent requests allowed.
         crawl_delay: Minimum seconds between requests to the same domain.
+        max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
         timeout: Per-request timeout in seconds.
         max_retries: Number of retries for failed requests.
         retry_delay_factor: Multiplier for exponential backoff on retries.
@@ -87,6 +88,7 @@ class BeautifulSoupCrawler:
         *,
         concurrency: int = 5,
         crawl_delay: float = 0.5,
+        max_crawl_delay: Optional[float] = 10.0,
         timeout: float = 15.0,
         max_retries: int = 2,
         retry_delay_factor: float = 0.5,
@@ -98,6 +100,7 @@ class BeautifulSoupCrawler:
         Args:
             concurrency: Number of concurrent requests allowed.
             crawl_delay: Minimum seconds between requests to the same domain.
+            max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
             timeout: Per-request timeout in seconds.
             max_retries: Number of retries for failed requests.
             retry_delay_factor: Multiplier for exponential backoff on retries.
@@ -107,6 +110,7 @@ class BeautifulSoupCrawler:
         self.concurrency = concurrency
         self._sem = asyncio.Semaphore(concurrency)
         self.crawl_delay = crawl_delay
+        self.max_crawl_delay = max_crawl_delay
         self.timeout = timeout
         self.max_retries = max_retries
         self.retry_delay_factor = retry_delay_factor
@@ -183,7 +187,11 @@ class BeautifulSoupCrawler:
         elapsed = time.time() - last
         wait_for = delay - elapsed
         if wait_for > 0:
+            logger.info(
+                f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
+            )
             await asyncio.sleep(wait_for)
+            logger.info(f"Rate limit wait completed for {url}")
         self._last_request_time_per_domain[domain] = time.time()
 
     async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
@@ -236,7 +244,16 @@ class BeautifulSoupCrawler:
             crawl_delay = self.crawl_delay
             if protego:
                 delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
-                crawl_delay = delay if delay else self.crawl_delay
+                if delay:
+                    # Apply max_crawl_delay cap if configured
+                    if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
+                        logger.warning(
+                            f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
+                            f"capping to max_crawl_delay={self.max_crawl_delay}s"
+                        )
+                        crawl_delay = self.max_crawl_delay
+                    else:
+                        crawl_delay = delay
 
             cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
             self._robots_cache[domain_root] = cache_entry
@@ -307,12 +324,16 @@ class BeautifulSoupCrawler:
 
         attempt = 0
         crawl_delay = await self._get_crawl_delay(url)
+        logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
 
         while True:
             try:
                 await self._respect_rate_limit(url, crawl_delay)
                 resp = await self._client.get(url)
                 resp.raise_for_status()
+                logger.info(
+                    f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
+                )
                 return resp.text
             except Exception as exc:
                 attempt += 1
@@ -347,22 +368,35 @@ class BeautifulSoupCrawler:
             raise RuntimeError(
                 "Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
             )
+
+        timeout_val = timeout or self.timeout
+        logger.info(
+            f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
+        )
+
         attempt = 0
         while True:
             try:
                 async with async_playwright() as p:
+                    logger.info(f"Launching headless Chromium browser for {url}")
                     browser = await p.chromium.launch(headless=True)
                     try:
                         context = await browser.new_context()
                         page = await context.new_page()
+                        logger.info(f"Navigating to {url} and waiting for network idle")
                         await page.goto(
                             url,
                             wait_until="networkidle",
-                            timeout=int((timeout or self.timeout) * 1000),
+                            timeout=int(timeout_val * 1000),
                         )
                         if js_wait:
+                            logger.info(f"Waiting {js_wait}s for JavaScript to execute")
                             await asyncio.sleep(js_wait)
-                        return await page.content()
+                        content = await page.content()
+                        logger.info(
+                            f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
+                        )
+                        return content
                     finally:
                         await browser.close()
             except Exception as exc:
@@ -498,6 +532,10 @@ class BeautifulSoupCrawler:
         else:
             raise ValueError(f"Invalid urls type: {type(urls)}")
 
+        logger.info(
+            f"Preparing to fetch {len(url_rules_map)} URL(s) with {len(extraction_rules) if extraction_rules else 0} extraction rule(s)"
+        )
+
         normalized_url_rules: Dict[str, List[ExtractionRule]] = {}
         for url, rules in url_rules_map.items():
             normalized_rules = []
@@ -508,21 +546,36 @@ class BeautifulSoupCrawler:
                 normalized_rules.append(r)
             normalized_url_rules[url] = normalized_rules
 
+        logger.info(f"Normalized extraction rules for {len(normalized_url_rules)} URL(s)")
+
         async def _task(url: str):
             async with self._sem:
                 try:
+                    logger.info(f"Processing URL: {url}")
+
+                    # Check robots.txt
                     allowed = await self._is_url_allowed(url)
                     if not allowed:
                         logger.warning(f"URL disallowed by robots.txt: {url}")
                         return url, ""
 
+                    logger.info(f"Robots.txt check passed for {url}")
+
+                    # Fetch HTML
                     if use_playwright:
+                        logger.info(
+                            f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
+                        )
                         html = await self._render_with_playwright(
                             url, js_wait=playwright_js_wait, timeout=self.timeout
                         )
                     else:
+                        logger.info(f"Fetching {url} with httpx")
                         html = await self._fetch_httpx(url)
 
+                    logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
+
+                    # Extract content
                     pieces = []
                     for rule in normalized_url_rules[url]:
                         text = self._extract_with_bs4(html, rule)
@@ -530,17 +583,24 @@ class BeautifulSoupCrawler:
                             pieces.append(text)
 
                     concatenated = " ".join(pieces).strip()
+                    logger.info(f"Extracted {len(concatenated)} characters from {url}")
                     return url, concatenated
 
                 except Exception as e:
                     logger.error(f"Error processing {url}: {e}")
                     return url, ""
 
+        logger.info(f"Creating {len(url_rules_map)} async tasks for concurrent fetching")
         tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()]
         results = {}
+        completed = 0
+        total = len(tasks)
 
         for coro in asyncio.as_completed(tasks):
             url, text = await coro
             results[url] = text
+            completed += 1
+            logger.info(f"Progress: {completed}/{total} URLs processed")
 
+        logger.info(f"Completed fetching all {len(results)} URL(s)")
         return results
diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py
index 2ee43ed32..ac470daa9 100644
--- a/cognee/tasks/web_scraper/config.py
+++ b/cognee/tasks/web_scraper/config.py
@@ -13,6 +13,9 @@ class TavilyConfig(BaseModel):
 class SoupCrawlerConfig(BaseModel):
     concurrency: int = 5
     crawl_delay: float = 0.5
+    max_crawl_delay: Optional[float] = (
+        10.0  # Maximum crawl delay to respect from robots.txt (None = no limit)
+    )
     timeout: float = 15.0
     max_retries: int = 2
     retry_delay_factor: float = 0.5
diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py
index 6d094f423..a32b6848c 100644
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@@ -45,9 +45,13 @@ async def fetch_page_content(
         ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not
             installed.
     """
+    url_list = [urls] if isinstance(urls, str) else urls
+    logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}")
+
     if preferred_tool == "tavily":
         if not tavily_config or tavily_config.api_key is None:
             raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily")
+        logger.info("Using Tavily API for content extraction")
         return await fetch_with_tavily(urls, tavily_config)
 
     if preferred_tool == "beautifulsoup":
@@ -60,10 +64,17 @@ async def fetch_page_content(
             raise ImportError
         if not soup_crawler_config or soup_crawler_config.extraction_rules is None:
             raise ValueError("extraction_rules must be provided when not using Tavily")
+
+        logger.info("Using BeautifulSoup for content extraction")
         extraction_rules = soup_crawler_config.extraction_rules
+        logger.info(
+            f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s"
+        )
+
         crawler = BeautifulSoupCrawler(
             concurrency=soup_crawler_config.concurrency,
             crawl_delay=soup_crawler_config.crawl_delay,
+            max_crawl_delay=soup_crawler_config.max_crawl_delay,
             timeout=soup_crawler_config.timeout,
             max_retries=soup_crawler_config.max_retries,
             retry_delay_factor=soup_crawler_config.retry_delay_factor,
@@ -71,6 +82,9 @@ async def fetch_page_content(
             robots_cache_ttl=soup_crawler_config.robots_cache_ttl,
         )
         try:
+            logger.info(
+                f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})"
+            )
             results = await crawler.fetch_with_bs4(
                 urls,
                 extraction_rules,
@@ -78,11 +92,13 @@ async def fetch_page_content(
                 playwright_js_wait=soup_crawler_config.playwright_js_wait,
                 join_all_matches=soup_crawler_config.join_all_matches,
             )
+            logger.info(f"Successfully fetched content from {len(results)} URL(s)")
             return results
         except Exception as e:
             logger.error(f"Error fetching page content: {str(e)}")
             raise
         finally:
+            logger.info("Closing BeautifulSoup crawler")
             await crawler.close()
 
 
@@ -108,19 +124,36 @@ async def fetch_with_tavily(
             "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
         )
         raise
+
+    url_list = [urls] if isinstance(urls, str) else urls
+    extract_depth = tavily_config.extract_depth if tavily_config else "basic"
+    timeout = tavily_config.timeout if tavily_config else 10
+
+    logger.info(
+        f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s"
+    )
     client = AsyncTavilyClient(
         api_key=tavily_config.api_key if tavily_config else None,
         proxies=tavily_config.proxies if tavily_config else None,
     )
+
+    logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)")
     results = await client.extract(
         urls,
         format="text",
-        extract_depth=tavily_config.extract_depth if tavily_config else "basic",
-        timeout=tavily_config.timeout if tavily_config else 10,
+        extract_depth=extract_depth,
+        timeout=timeout,
     )
-    for failed_result in results.get("failed_results", []):
-        logger.warning(f"Failed to fetch {failed_result}")
+
+    failed_count = len(results.get("failed_results", []))
+    if failed_count > 0:
+        logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)")
+        for failed_result in results.get("failed_results", []):
+            logger.warning(f"Failed to fetch {failed_result}")
+
     return_results = {}
     for result in results.get("results", []):
         return_results[result["url"]] = result["raw_content"]
+
+    logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily")
     return return_results