From 16e1c609253f74a36061b49e3ef533e9b5490272 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 16:43:56 +0100 Subject: [PATCH] move bs4 html parsing into `bs4_loader` --- .../loaders/external/bs4_loader.py} | 24 ++++++++++++++++--- cognee/tasks/web_scraper/__init__.py | 1 - cognee/tasks/web_scraper/utils.py | 8 +++---- .../web_url_crawler/test_bs4_crawler.py | 4 ++-- 4 files changed, 26 insertions(+), 11 deletions(-) rename cognee/{tasks/web_scraper/bs4_crawler.py => infrastructure/loaders/external/bs4_loader.py} (89%) diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/infrastructure/loaders/external/bs4_loader.py similarity index 89% rename from cognee/tasks/web_scraper/bs4_crawler.py rename to cognee/infrastructure/loaders/external/bs4_loader.py index 171a76633..8022de04f 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/infrastructure/loaders/external/bs4_loader.py @@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. supports robots.txt handling, rate limiting, and custom extraction rules. """ -from typing import Union, List, Dict, Any, Optional +from typing import Union, Dict, Any, Optional, List from dataclasses import dataclass from bs4 import BeautifulSoup +from cognee.infrastructure.loaders import LoaderInterface from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) @@ -32,8 +33,7 @@ class ExtractionRule: join_with: str = " " -# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler -class BeautifulSoupCrawler: +class BeautifulSoupLoader(LoaderInterface): """Crawler for fetching and extracting web content using BeautifulSoup. Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt @@ -50,6 +50,24 @@ class BeautifulSoupCrawler: robots_cache_ttl: Time-to-live for robots.txt cache in seconds. """ + @property + def supported_extensions(self) -> List[str]: + return ["html"] + + @property + def supported_mime_types(self) -> List[str]: + pass + + @property + def loader_name(self) -> str: + return "beautiful_soup_loader" + + def can_handle(self, extension: str, mime_type: str) -> bool: + pass + + async def load(self, file_path: str, **kwargs): + pass + def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. diff --git a/cognee/tasks/web_scraper/__init__.py b/cognee/tasks/web_scraper/__init__.py index f4d6677c7..26c3e68cf 100644 --- a/cognee/tasks/web_scraper/__init__.py +++ b/cognee/tasks/web_scraper/__init__.py @@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag BeautifulSoup or Tavily, defining data models, and handling scraping configurations. """ -from .bs4_crawler import BeautifulSoupCrawler from .utils import fetch_page_content from .web_scraper_task import cron_web_scraper_task, web_scraper_task from .default_url_crawler import DefaultUrlCrawler diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 0cbd355a3..b1cbf82e9 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -9,7 +9,6 @@ from re import L from typing import List, Union, TypeAlias from cognee.shared.logging_utils import get_logger from .default_url_crawler import DefaultUrlCrawler -from .bs4_crawler import BeautifulSoupCrawler from .config import DefaultCrawlerConfig, TavilyConfig logger = get_logger(__name__) @@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: if os.getenv("TAVILY_API_KEY"): logger.info("Using Tavily API for url fetching") - return await fetch_with_tavily(urls, tavily_config) + return await fetch_with_tavily(urls) else: logger.info("Using default crawler for content extraction") @@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: await crawler.close() -async def fetch_with_tavily( - urls: Union[str, List[str]], tavily_config: TavilyConfig -) -> UrlsToHtmls: +async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls: """Fetch content from URLs using the Tavily API. Args: @@ -112,6 +109,7 @@ async def fetch_with_tavily( ) raise + tavily_config = TavilyConfig() url_list = [urls] if isinstance(urls, str) else urls extract_depth = tavily_config.extract_depth if tavily_config else "basic" timeout = tavily_config.timeout if tavily_config else 10 diff --git a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py index 0e7637d86..156cc87a4 100644 --- a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py @@ -1,10 +1,10 @@ import pytest -from cognee.tasks.web_scraper import BeautifulSoupCrawler +from cognee.tasks.web_scraper import DefaultUrlCrawler @pytest.mark.asyncio async def test_fetch(): - crawler = BeautifulSoupCrawler() + crawler = DefaultUrlCrawler() url = "https://en.wikipedia.org/wiki/Large_language_model" results = await crawler.fetch_urls(url) assert len(results) == 1