move bs4 html parsing into bs4_loader

2025-10-21 16:43:56 +01:00 · 2025-10-21 16:43:56 +01:00 · 16e1c60925
commit 16e1c60925
parent 9d9969676f
4 changed files with 26 additions and 11 deletions
--- a/cognee/infrastructure/loaders/external/bs4_loader.py
+++ b/cognee/infrastructure/loaders/external/bs4_loader.py
@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages.
 supports robots.txt handling, rate limiting, and custom extraction rules.
 """
-from typing import Union, List, Dict, Any, Optional
+from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
 from cognee.infrastructure.loaders import LoaderInterface
 from cognee.shared.logging_utils import get_logger
 logger = get_logger(__name__)
@ -32,8 +33,7 @@ class ExtractionRule:
    join_with: str = " "
-# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler
+class BeautifulSoupLoader(LoaderInterface):
 class BeautifulSoupCrawler:
    """Crawler for fetching and extracting web content using BeautifulSoup.
    Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
@ -50,6 +50,24 @@ class BeautifulSoupCrawler:
        robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
    """
    @property
    def supported_extensions(self) -> List[str]:
        """File extensions this loader declares support for.

        Returns:
            A new list containing the single extension ``"html"``.
        """
        extensions: List[str] = ["html"]
        return extensions
    @property
    def supported_mime_types(self) -> List[str]:
        """MIME types this loader declares support for.

        Returns:
            A new list containing the HTML media type ``"text/html"``,
            mirroring the ``"html"`` extension this loader handles.
        """
        # Previously a bare ``pass`` (returning None), which broke the
        # declared ``List[str]`` contract and any membership test on it.
        return ["text/html"]
    @property
    def loader_name(self) -> str:
        """Identifier string reported for this loader.

        Returns:
            The constant ``"beautiful_soup_loader"``.
        """
        name = "beautiful_soup_loader"
        return name
    def can_handle(self, extension: str, mime_type: str) -> bool:
        """Report whether this loader accepts the given extension and MIME type.

        Args:
            extension: File extension, compared as-is against
                ``supported_extensions``.
            mime_type: MIME type, compared as-is against
                ``supported_mime_types``.

        Returns:
            True only when both the extension and the MIME type are listed
            as supported; False otherwise.
        """
        # Treat an unset (None) MIME-type list as empty so the result is
        # always a real bool instead of raising on the membership test.
        mime_types = self.supported_mime_types or []
        return extension in self.supported_extensions and mime_type in mime_types
    async def load(self, file_path: str, **kwargs):
        """Load the file at ``file_path``.

        NOTE(review): placeholder — the body is a bare ``pass``, so awaiting
        this coroutine currently yields ``None`` and both ``file_path`` and
        ``kwargs`` are ignored. TODO: implement the actual HTML loading.
        """
        pass
    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
        """Normalize an extraction rule to an ExtractionRule dataclass.
--- a/cognee/tasks/web_scraper/init.py
+++ b/cognee/tasks/web_scraper/init.py
@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag
 BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
 """
 from .bs4_crawler import BeautifulSoupCrawler
 from .utils import fetch_page_content
 from .web_scraper_task import cron_web_scraper_task, web_scraper_task
 from .default_url_crawler import DefaultUrlCrawler
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@ -9,7 +9,6 @@ from re import L
 from typing import List, Union, TypeAlias
 from cognee.shared.logging_utils import get_logger
 from .default_url_crawler import DefaultUrlCrawler
 from .bs4_crawler import BeautifulSoupCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig
 logger = get_logger(__name__)
@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
    if os.getenv("TAVILY_API_KEY"):
        logger.info("Using Tavily API for url fetching")
-        return await fetch_with_tavily(urls, tavily_config)
+        return await fetch_with_tavily(urls)
    else:
        logger.info("Using default crawler for content extraction")
@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
            await crawler.close()
-async def fetch_with_tavily(
+async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
    urls: Union[str, List[str]], tavily_config: TavilyConfig
 ) -> UrlsToHtmls:
    """Fetch content from URLs using the Tavily API.
    Args:
@ -112,6 +109,7 @@ async def fetch_with_tavily(
        )
        raise
    tavily_config = TavilyConfig()
    url_list = [urls] if isinstance(urls, str) else urls
    extract_depth = tavily_config.extract_depth if tavily_config else "basic"
    timeout = tavily_config.timeout if tavily_config else 10
--- a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py
+++ b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py
@ -1,10 +1,10 @@
 import pytest
-from cognee.tasks.web_scraper import BeautifulSoupCrawler
+from cognee.tasks.web_scraper import DefaultUrlCrawler
@pytest.mark.asyncio
 async def test_fetch():
-    crawler = BeautifulSoupCrawler()
+    crawler = DefaultUrlCrawler()
    url = "https://en.wikipedia.org/wiki/Large_language_model"
    results = await crawler.fetch_urls(url)
    assert len(results) == 1