move bs4 html parsing into bs4_loader

Daulet Amirkhanov 2025-10-21 16:43:56 +01:00
parent 9d9969676f
commit 16e1c60925
4 changed files with 26 additions and 11 deletions

View file

@@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages.
supports robots.txt handling, rate limiting, and custom extraction rules.
"""
from typing import Union, List, Dict, Any, Optional
from typing import Union, Dict, Any, Optional, List
from dataclasses import dataclass
from bs4 import BeautifulSoup
from cognee.infrastructure.loaders import LoaderInterface
from cognee.shared.logging_utils import get_logger
logger = get_logger(__name__)
@@ -32,8 +33,7 @@ class ExtractionRule:
    join_with: str = " "
# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler
class BeautifulSoupCrawler:
class BeautifulSoupLoader(LoaderInterface):
"""Crawler for fetching and extracting web content using BeautifulSoup.
Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
@@ -50,6 +50,24 @@ class BeautifulSoupCrawler:
        robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
    """

    @property
    def supported_extensions(self) -> List[str]:
        return ["html"]

    @property
    def supported_mime_types(self) -> List[str]:
        pass

    @property
    def loader_name(self) -> str:
        return "beautiful_soup_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        pass

    async def load(self, file_path: str, **kwargs):
        pass

    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
        """Normalize an extraction rule to an ExtractionRule dataclass.

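The new BeautifulSoupLoader leaves supported_mime_types, can_handle, and load as pass stubs. Below is a minimal sketch of how those stubs might later be filled in, assuming LoaderInterface only requires the members shown in the hunk above; the "text/html" MIME type, the can_handle logic, and the load return value are illustrative guesses, not part of this commit.

from typing import List
from bs4 import BeautifulSoup
from cognee.infrastructure.loaders import LoaderInterface

class BeautifulSoupLoader(LoaderInterface):
    @property
    def supported_extensions(self) -> List[str]:
        return ["html"]

    @property
    def supported_mime_types(self) -> List[str]:
        # Illustrative value; the commit leaves this as a stub.
        return ["text/html"]

    @property
    def loader_name(self) -> str:
        return "beautiful_soup_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        # Accept a file when either the extension or the MIME type matches.
        return extension in self.supported_extensions or mime_type in self.supported_mime_types

    async def load(self, file_path: str, **kwargs):
        # Parse the HTML file with BeautifulSoup; the required return type is
        # defined by LoaderInterface, so returning the soup is an assumption.
        with open(file_path, "r", encoding="utf-8") as f:
            return BeautifulSoup(f.read(), "html.parser")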
View file

@@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag
BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
"""
from .bs4_crawler import BeautifulSoupCrawler
from .utils import fetch_page_content
from .web_scraper_task import cron_web_scraper_task, web_scraper_task
from .default_url_crawler import DefaultUrlCrawler

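Since the package __init__ no longer re-exports BeautifulSoupCrawler, downstream code imports through the names that remain; a quick check of the surviving public surface (names taken from the imports kept above and the test below):

from cognee.tasks.web_scraper import (
    DefaultUrlCrawler,
    cron_web_scraper_task,
    fetch_page_content,
    web_scraper_task,
)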
View file

@@ -9,7 +9,6 @@ from re import L
from typing import List, Union, TypeAlias
from cognee.shared.logging_utils import get_logger
from .default_url_crawler import DefaultUrlCrawler
from .bs4_crawler import BeautifulSoupCrawler
from .config import DefaultCrawlerConfig, TavilyConfig
logger = get_logger(__name__)
@@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
    if os.getenv("TAVILY_API_KEY"):
        logger.info("Using Tavily API for url fetching")
        return await fetch_with_tavily(urls, tavily_config)
        return await fetch_with_tavily(urls)
    else:
        logger.info("Using default crawler for content extraction")
@@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
        await crawler.close()
async def fetch_with_tavily(
    urls: Union[str, List[str]], tavily_config: TavilyConfig
) -> UrlsToHtmls:
async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
    """Fetch content from URLs using the Tavily API.
    Args:
@@ -112,6 +109,7 @@ async def fetch_with_tavily(
        )
        raise

    tavily_config = TavilyConfig()
    url_list = [urls] if isinstance(urls, str) else urls
    extract_depth = tavily_config.extract_depth if tavily_config else "basic"
    timeout = tavily_config.timeout if tavily_config else 10

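With TavilyConfig now constructed inside fetch_with_tavily, callers pass only the URLs, and the Tavily path is selected by the TAVILY_API_KEY environment variable. A usage sketch, assuming UrlsToHtmls is a mapping of URL to HTML (the example URL is illustrative):

import asyncio
from cognee.tasks.web_scraper import fetch_page_content

async def main():
    # Tavily is used when TAVILY_API_KEY is set; otherwise the default crawler runs.
    pages = await fetch_page_content("https://example.com")
    for url, html in pages.items():
        print(url, len(html))

asyncio.run(main())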
View file

@@ -1,10 +1,10 @@
import pytest
from cognee.tasks.web_scraper import BeautifulSoupCrawler
from cognee.tasks.web_scraper import DefaultUrlCrawler
@pytest.mark.asyncio
async def test_fetch():
    crawler = BeautifulSoupCrawler()
    crawler = DefaultUrlCrawler()
    url = "https://en.wikipedia.org/wiki/Large_language_model"
    results = await crawler.fetch_urls(url)
    assert len(results) == 1
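
The updated test exercises DefaultUrlCrawler only; a companion check of the new loader's metadata could look like the sketch below. The import path for BeautifulSoupLoader is a guess based on the commit title, and only properties defined in the diff above are asserted.

from cognee.tasks.web_scraper.bs4_loader import BeautifulSoupLoader  # hypothetical module path

def test_loader_metadata():
    loader = BeautifulSoupLoader()
    assert loader.loader_name == "beautiful_soup_loader"
    assert "html" in loader.supported_extensions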