Removed the staticmethod decorator from bs4_crawler.py, removed kwargs from the function signature in save_data_item_to_storage.py, removed unused imports in ingest_data.py, and added robots_cache_ttl as a config field in BeautifulSoupCrawler.

This commit is contained in:
Geoff-Robin 2025-10-07 20:56:23 +05:30
parent fdf85628c7
commit d91ffa2ad6
5 changed files with 4 additions and 6 deletions

View file

@@ -1,7 +1,7 @@
import json
import inspect
from uuid import UUID
from typing import Union, BinaryIO, Any, List, Optional, Dict, Literal
from typing import Union, BinaryIO, Any, List, Optional
import cognee.modules.ingestion as ingestion
from cognee.infrastructure.databases.relational import get_relational_engine
@@ -16,7 +16,6 @@ from cognee.modules.data.methods import (
get_dataset_data,
load_or_create_datasets,
)
from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig
from .save_data_item_to_storage import save_data_item_to_storage
from .data_item_to_text_file import data_item_to_text_file

View file

@@ -28,7 +28,7 @@ class HTMLContent(str):
settings = SaveDataSettings()
async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], **kwargs) -> str:
async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str:
if "llama_index" in str(type(data_item)):
# Dynamic import is used because the llama_index module is optional.
from .transform_data import get_data_from_llama_index

View file

@@ -137,7 +137,6 @@ class BeautifulSoupCrawler:
"""Exit the context manager, closing the HTTP client."""
await self.close()
@staticmethod
@lru_cache(maxsize=1024)
def _domain_from_url(url: str) -> str:
"""Extract the domain (netloc) from a URL.
@@ -153,7 +152,6 @@ class BeautifulSoupCrawler:
except Exception:
return url
@staticmethod
@lru_cache(maxsize=1024)
def _get_domain_root(url: str) -> str:
"""Get the root URL (scheme and netloc) from a URL.
@@ -378,7 +376,6 @@ class BeautifulSoupCrawler:
)
await asyncio.sleep(backoff)
@staticmethod
def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
"""Normalize an extraction rule to an ExtractionRule dataclass.

View file

@@ -20,4 +20,5 @@ class SoupCrawlerConfig(BaseModel):
extraction_rules: Dict[str, Any]
use_playwright: bool = False
playwright_js_wait: float = 0.8
robots_cache_ttl: float = 3600.0
join_all_matches: bool = False

View file

@@ -68,6 +68,7 @@ async def fetch_page_content(
max_retries=soup_crawler_config.max_retries,
retry_delay_factor=soup_crawler_config.retry_delay_factor,
headers=soup_crawler_config.headers,
robots_cache_ttl=soup_crawler_config.robots_cache_ttl
)
try:
results = await crawler.fetch_with_bs4(