From d91ffa2ad6711073fc71aa954a3cf40d24225115 Mon Sep 17 00:00:00 2001
From: Geoff-Robin
Date: Tue, 7 Oct 2025 20:56:23 +0530
Subject: [PATCH] Clean up web scraper and ingestion tasks

- Remove the @staticmethod decorators from the URL helpers and
  _normalize_rule in bs4_crawler.py
- Drop the unused **kwargs from save_data_item_to_storage()
- Remove unused typing and web_scraper imports from ingest_data.py
- Add robots_cache_ttl as a SoupCrawlerConfig field and pass it through
  to BeautifulSoupCrawler in fetch_page_content()
---
 cognee/tasks/ingestion/ingest_data.py               | 3 +--
 cognee/tasks/ingestion/save_data_item_to_storage.py | 2 +-
 cognee/tasks/web_scraper/bs4_crawler.py             | 3 ---
 cognee/tasks/web_scraper/config.py                  | 1 +
 cognee/tasks/web_scraper/utils.py                   | 1 +
 5 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py
index b86bada19..3c20a2b13 100644
--- a/cognee/tasks/ingestion/ingest_data.py
+++ b/cognee/tasks/ingestion/ingest_data.py
@@ -1,7 +1,7 @@
 import json
 import inspect
 from uuid import UUID
-from typing import Union, BinaryIO, Any, List, Optional, Dict, Literal
+from typing import Union, BinaryIO, Any, List, Optional
 
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
@@ -16,7 +16,6 @@ from cognee.modules.data.methods import (
     get_dataset_data,
     load_or_create_datasets,
 )
-from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig
 
 from .save_data_item_to_storage import save_data_item_to_storage
 from .data_item_to_text_file import data_item_to_text_file
diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py
index c947d35c5..105f17c0d 100644
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -28,7 +28,7 @@ class HTMLContent(str):
 settings = SaveDataSettings()
 
 
-async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], **kwargs) -> str:
+async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str:
     if "llama_index" in str(type(data_item)):
         # Dynamic import is used because the llama_index module is optional.
         from .transform_data import get_data_from_llama_index
diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py
index 7ccf06a3d..f1efca3d6 100644
--- a/cognee/tasks/web_scraper/bs4_crawler.py
+++ b/cognee/tasks/web_scraper/bs4_crawler.py
@@ -137,7 +137,6 @@ class BeautifulSoupCrawler:
         """Exit the context manager, closing the HTTP client."""
         await self.close()
 
-    @staticmethod
     @lru_cache(maxsize=1024)
     def _domain_from_url(url: str) -> str:
         """Extract the domain (netloc) from a URL.
@@ -153,7 +152,6 @@ class BeautifulSoupCrawler:
         except Exception:
             return url
 
-    @staticmethod
     @lru_cache(maxsize=1024)
     def _get_domain_root(url: str) -> str:
         """Get the root URL (scheme and netloc) from a URL.
@@ -378,7 +376,6 @@ class BeautifulSoupCrawler:
             )
             await asyncio.sleep(backoff)
 
-    @staticmethod
     def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
 
diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py
index e28f412e5..2f7f6b3f6 100644
--- a/cognee/tasks/web_scraper/config.py
+++ b/cognee/tasks/web_scraper/config.py
@@ -20,4 +20,5 @@ class SoupCrawlerConfig(BaseModel):
     extraction_rules: Dict[str, Any]
     use_playwright: bool = False
     playwright_js_wait: float = 0.8
+    robots_cache_ttl: float = 3600.0
     join_all_matches: bool = False
diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py
index 1a300a37b..5af95a2fe 100644
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@@ -68,6 +68,7 @@ async def fetch_page_content(
         max_retries=soup_crawler_config.max_retries,
         retry_delay_factor=soup_crawler_config.retry_delay_factor,
         headers=soup_crawler_config.headers,
+        robots_cache_ttl=soup_crawler_config.robots_cache_ttl,
     )
     try:
         results = await crawler.fetch_with_bs4(
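
Note (editorial, not part of the patch): a minimal usage sketch of the new
robots_cache_ttl knob, which is presumably in seconds given the 3600.0
default. It assumes fetch_page_content() accepts a target URL alongside the
SoupCrawlerConfig referenced in the utils.py hunk -- the function's full
signature is not visible in this diff -- and the extraction rule and URL
below are placeholders, not project defaults.

    import asyncio

    from cognee.tasks.web_scraper.config import SoupCrawlerConfig
    from cognee.tasks.web_scraper.utils import fetch_page_content


    async def main():
        # extraction_rules is a required field on SoupCrawlerConfig; the
        # "title" -> "h1" selector is purely illustrative.
        config = SoupCrawlerConfig(
            extraction_rules={"title": "h1"},
            robots_cache_ttl=600.0,  # reuse robots.txt verdicts for 10 minutes
        )
        # Assumed call shape: fetch_page_content() builds a BeautifulSoupCrawler
        # internally and, with this patch, forwards config.robots_cache_ttl to it.
        results = await fetch_page_content(
            "https://example.com", soup_crawler_config=config
        )
        print(results)


    asyncio.run(main())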