From d91ffa2ad6711073fc71aa954a3cf40d24225115 Mon Sep 17 00:00:00 2001
From: Geoff-Robin
Date: Tue, 7 Oct 2025 20:56:23 +0530
Subject: [PATCH] Clean up web scraper and ingestion tasks

- Remove the @staticmethod decorators from the URL helpers and
  _normalize_rule in bs4_crawler.py
- Drop the unused **kwargs from save_data_item_to_storage()
- Remove unused typing and web_scraper imports from ingest_data.py
- Add robots_cache_ttl as a SoupCrawlerConfig field and pass it through
  to BeautifulSoupCrawler in fetch_page_content()
---
 cognee/tasks/ingestion/ingest_data.py               | 3 +--
 cognee/tasks/ingestion/save_data_item_to_storage.py | 2 +-
 cognee/tasks/web_scraper/bs4_crawler.py             | 3 ---
 cognee/tasks/web_scraper/config.py                  | 1 +
 cognee/tasks/web_scraper/utils.py                   | 1 +
 5 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py
index b86bada19..3c20a2b13 100644
--- a/cognee/tasks/ingestion/ingest_data.py
+++ b/cognee/tasks/ingestion/ingest_data.py
@@ -1,7 +1,7 @@
 import json
 import inspect
 from uuid import UUID
-from typing import Union, BinaryIO, Any, List, Optional, Dict, Literal
+from typing import Union, BinaryIO, Any, List, Optional
 
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
@@ -16,7 +16,6 @@ from cognee.modules.data.methods import (
     get_dataset_data,
     load_or_create_datasets,
 )
-from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig
 
 from .save_data_item_to_storage import save_data_item_to_storage
 from .data_item_to_text_file import data_item_to_text_file
diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py
index c947d35c5..105f17c0d 100644
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -28,7 +28,7 @@ class HTMLContent(str):
 settings = SaveDataSettings()
 
 
-async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], **kwargs) -> str:
+async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str:
     if "llama_index" in str(type(data_item)):
         # Dynamic import is used because the llama_index module is optional.
         from .transform_data import get_data_from_llama_index
diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py
index 7ccf06a3d..f1efca3d6 100644
--- a/cognee/tasks/web_scraper/bs4_crawler.py
+++ b/cognee/tasks/web_scraper/bs4_crawler.py
@@ -137,7 +137,6 @@ class BeautifulSoupCrawler:
         """Exit the context manager, closing the HTTP client."""
         await self.close()
 
-    @staticmethod
     @lru_cache(maxsize=1024)
     def _domain_from_url(url: str) -> str:
         """Extract the domain (netloc) from a URL.
@@ -153,7 +152,6 @@ class BeautifulSoupCrawler:
         except Exception:
             return url
 
-    @staticmethod
     @lru_cache(maxsize=1024)
     def _get_domain_root(url: str) -> str:
         """Get the root URL (scheme and netloc) from a URL.
@@ -378,7 +376,6 @@ class BeautifulSoupCrawler:
             )
             await asyncio.sleep(backoff)
 
-    @staticmethod
     def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
 
diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py
index e28f412e5..2f7f6b3f6 100644
--- a/cognee/tasks/web_scraper/config.py
+++ b/cognee/tasks/web_scraper/config.py
@@ -20,4 +20,5 @@ class SoupCrawlerConfig(BaseModel):
     extraction_rules: Dict[str, Any]
     use_playwright: bool = False
     playwright_js_wait: float = 0.8
+    robots_cache_ttl: float = 3600.0
     join_all_matches: bool = False
diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py
index 1a300a37b..5af95a2fe 100644
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@@ -68,6 +68,7 @@ async def fetch_page_content(
         max_retries=soup_crawler_config.max_retries,
         retry_delay_factor=soup_crawler_config.retry_delay_factor,
         headers=soup_crawler_config.headers,
+        robots_cache_ttl=soup_crawler_config.robots_cache_ttl,
     )
     try:
         results = await crawler.fetch_with_bs4(
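
Note (editorial, not part of the patch): a minimal usage sketch of the new
robots_cache_ttl knob, which is presumably in seconds given the 3600.0
default. It assumes fetch_page_content() accepts a target URL alongside the
SoupCrawlerConfig referenced in the utils.py hunk -- the function's full
signature is not visible in this diff -- and the extraction rule and URL
below are placeholders, not project defaults.

    import asyncio

    from cognee.tasks.web_scraper.config import SoupCrawlerConfig
    from cognee.tasks.web_scraper.utils import fetch_page_content


    async def main():
        # extraction_rules is a required field on SoupCrawlerConfig; the
        # "title" -> "h1" selector is purely illustrative.
        config = SoupCrawlerConfig(
            extraction_rules={"title": "h1"},
            robots_cache_ttl=600.0,  # reuse robots.txt verdicts for 10 minutes
        )
        # Assumed call shape: fetch_page_content() builds a BeautifulSoupCrawler
        # internally and, with this patch, forwards config.robots_cache_ttl to it.
        results = await fetch_page_content(
            "https://example.com", soup_crawler_config=config
        )
        print(results)


    asyncio.run(main())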