Removed the @staticmethod decorator from bs4_crawler.py, removed **kwargs from the function signature in save_data_item_to_storage.py, removed unused imports in ingest_data.py, and added robots_cache_ttl as a config field for BeautifulSoupCrawler.

This commit is contained in:
Geoff-Robin 2025-10-07 20:56:23 +05:30
parent fdf85628c7
commit d91ffa2ad6
5 changed files with 4 additions and 6 deletions

View file

@@ -1,7 +1,7 @@
 import json
 import inspect
 from uuid import UUID
-from typing import Union, BinaryIO, Any, List, Optional, Dict, Literal
+from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
@@ -16,7 +16,6 @@ from cognee.modules.data.methods import (
     get_dataset_data,
     load_or_create_datasets,
 )
-from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig
 from .save_data_item_to_storage import save_data_item_to_storage
 from .data_item_to_text_file import data_item_to_text_file

View file

@@ -28,7 +28,7 @@ class HTMLContent(str):
 settings = SaveDataSettings()
-async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], **kwargs) -> str:
+async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str:
     if "llama_index" in str(type(data_item)):
         # Dynamic import is used because the llama_index module is optional.
         from .transform_data import get_data_from_llama_index

View file

@@ -137,7 +137,6 @@ class BeautifulSoupCrawler:
         """Exit the context manager, closing the HTTP client."""
         await self.close()
-    @staticmethod
     @lru_cache(maxsize=1024)
     def _domain_from_url(url: str) -> str:
         """Extract the domain (netloc) from a URL.
@@ -153,7 +152,6 @@ class BeautifulSoupCrawler:
         except Exception:
             return url
-    @staticmethod
     @lru_cache(maxsize=1024)
     def _get_domain_root(url: str) -> str:
         """Get the root URL (scheme and netloc) from a URL.
@@ -378,7 +376,6 @@ class BeautifulSoupCrawler:
         )
         await asyncio.sleep(backoff)
-    @staticmethod
     def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.

View file

@@ -20,4 +20,5 @@ class SoupCrawlerConfig(BaseModel):
     extraction_rules: Dict[str, Any]
     use_playwright: bool = False
     playwright_js_wait: float = 0.8
+    robots_cache_ttl: float = 3600.0
     join_all_matches: bool = False

View file

@@ -68,6 +68,7 @@ async def fetch_page_content(
         max_retries=soup_crawler_config.max_retries,
         retry_delay_factor=soup_crawler_config.retry_delay_factor,
         headers=soup_crawler_config.headers,
+        robots_cache_ttl=soup_crawler_config.robots_cache_ttl
     )
     try:
         results = await crawler.fetch_with_bs4(