Removed the staticmethod decorator from bs4_crawler.py, removed kwargs from the function signature in save_data_item_to_storage.py, removed unused imports in ingest_data.py, and added robots_cache_ttl as a config field in BeautifulSoupCrawler.

This commit is contained in:
Geoff-Robin 2025-10-07 20:56:23 +05:30
parent fdf85628c7
commit d91ffa2ad6
5 changed files with 4 additions and 6 deletions

View file

@@ -1,7 +1,7 @@
import json
import inspect
from uuid import UUID
from typing import Union, BinaryIO, Any, List, Optional, Dict, Literal
from typing import Union, BinaryIO, Any, List, Optional
import cognee.modules.ingestion as ingestion
from cognee.infrastructure.databases.relational import get_relational_engine
@@ -16,7 +16,6 @@ from cognee.modules.data.methods import (
get_dataset_data,
load_or_create_datasets,
)
from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig
from .save_data_item_to_storage import save_data_item_to_storage
from .data_item_to_text_file import data_item_to_text_file

View file

@@ -28,7 +28,7 @@ class HTMLContent(str):
settings = SaveDataSettings()
async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], **kwargs) -> str:
async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str:
if "llama_index" in str(type(data_item)):
# Dynamic import is used because the llama_index module is optional.
from .transform_data import get_data_from_llama_index

View file

@@ -137,7 +137,6 @@ class BeautifulSoupCrawler:
"""Exit the context manager, closing the HTTP client."""
await self.close()
@staticmethod
@lru_cache(maxsize=1024)
def _domain_from_url(url: str) -> str:
"""Extract the domain (netloc) from a URL.
@@ -153,7 +152,6 @@ class BeautifulSoupCrawler:
except Exception:
return url
@staticmethod
@lru_cache(maxsize=1024)
def _get_domain_root(url: str) -> str:
"""Get the root URL (scheme and netloc) from a URL.
@@ -378,7 +376,6 @@ class BeautifulSoupCrawler:
)
await asyncio.sleep(backoff)
@staticmethod
def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
"""Normalize an extraction rule to an ExtractionRule dataclass.

View file

@@ -20,4 +20,5 @@ class SoupCrawlerConfig(BaseModel):
extraction_rules: Dict[str, Any]
use_playwright: bool = False
playwright_js_wait: float = 0.8
robots_cache_ttl: float = 3600.0
join_all_matches: bool = False

View file

@@ -68,6 +68,7 @@ async def fetch_page_content(
max_retries=soup_crawler_config.max_retries,
retry_delay_factor=soup_crawler_config.retry_delay_factor,
headers=soup_crawler_config.headers,
robots_cache_ttl=soup_crawler_config.robots_cache_ttl
)
try:
results = await crawler.fetch_with_bs4(