Removed the staticmethod decorators in bs4_crawler.py, removed **kwargs from the function signature in save_data_item_to_storage.py, removed unused imports in ingest_data.py, and added robots_cache_ttl as a SoupCrawlerConfig field that fetch_page_content passes to the crawler.
This commit is contained in:
parent
fdf85628c7
commit
d91ffa2ad6
5 changed files with 4 additions and 6 deletions
|
|
@ -1,7 +1,7 @@
|
|||
import json
|
||||
import inspect
|
||||
from uuid import UUID
|
||||
from typing import Union, BinaryIO, Any, List, Optional, Dict, Literal
|
||||
from typing import Union, BinaryIO, Any, List, Optional
|
||||
|
||||
import cognee.modules.ingestion as ingestion
|
||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||
|
|
@ -16,7 +16,6 @@ from cognee.modules.data.methods import (
|
|||
get_dataset_data,
|
||||
load_or_create_datasets,
|
||||
)
|
||||
from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig
|
||||
|
||||
from .save_data_item_to_storage import save_data_item_to_storage
|
||||
from .data_item_to_text_file import data_item_to_text_file
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ class HTMLContent(str):
|
|||
settings = SaveDataSettings()
|
||||
|
||||
|
||||
async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], **kwargs) -> str:
|
||||
async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str:
|
||||
if "llama_index" in str(type(data_item)):
|
||||
# Dynamic import is used because the llama_index module is optional.
|
||||
from .transform_data import get_data_from_llama_index
|
||||
|
|
|
|||
|
|
@ -137,7 +137,6 @@ class BeautifulSoupCrawler:
|
|||
"""Exit the context manager, closing the HTTP client."""
|
||||
await self.close()
|
||||
|
||||
@staticmethod
|
||||
@lru_cache(maxsize=1024)
|
||||
def _domain_from_url(url: str) -> str:
|
||||
"""Extract the domain (netloc) from a URL.
|
||||
|
|
@ -153,7 +152,6 @@ class BeautifulSoupCrawler:
|
|||
except Exception:
|
||||
return url
|
||||
|
||||
@staticmethod
|
||||
@lru_cache(maxsize=1024)
|
||||
def _get_domain_root(url: str) -> str:
|
||||
"""Get the root URL (scheme and netloc) from a URL.
|
||||
|
|
@ -378,7 +376,6 @@ class BeautifulSoupCrawler:
|
|||
)
|
||||
await asyncio.sleep(backoff)
|
||||
|
||||
@staticmethod
|
||||
def _normalize_rule(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
|
||||
"""Normalize an extraction rule to an ExtractionRule dataclass.
|
||||
|
||||
|
|
|
|||
|
|
@ -20,4 +20,5 @@ class SoupCrawlerConfig(BaseModel):
|
|||
extraction_rules: Dict[str, Any]
|
||||
use_playwright: bool = False
|
||||
playwright_js_wait: float = 0.8
|
||||
robots_cache_ttl: float = 3600.0
|
||||
join_all_matches: bool = False
|
||||
|
|
|
|||
|
|
@ -68,6 +68,7 @@ async def fetch_page_content(
|
|||
max_retries=soup_crawler_config.max_retries,
|
||||
retry_delay_factor=soup_crawler_config.retry_delay_factor,
|
||||
headers=soup_crawler_config.headers,
|
||||
robots_cache_ttl=soup_crawler_config.robots_cache_ttl
|
||||
)
|
||||
try:
|
||||
results = await crawler.fetch_with_bs4(
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue