move bs4 html parsing into bs4_loader
parent 9d9969676f
commit 16e1c60925

4 changed files with 26 additions and 11 deletions
@@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages.
 supports robots.txt handling, rate limiting, and custom extraction rules.
 """

-from typing import Union, List, Dict, Any, Optional
+from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
+from cognee.infrastructure.loaders import LoaderInterface
 from cognee.shared.logging_utils import get_logger

 logger = get_logger(__name__)
@@ -32,8 +33,7 @@ class ExtractionRule:
     join_with: str = " "


-# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler
-class BeautifulSoupCrawler:
+class BeautifulSoupLoader(LoaderInterface):
     """Crawler for fetching and extracting web content using BeautifulSoup.

     Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
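
Aside (not part of the diff): judging from the properties and methods added in the next hunk, the LoaderInterface contract from cognee.infrastructure.loaders appears to look roughly like the sketch below. This is inferred from the diff, not the library's actual definition.

    # Inferred sketch only; the real LoaderInterface lives in cognee.infrastructure.loaders.
    from abc import ABC, abstractmethod
    from typing import List

    class LoaderInterface(ABC):
        @property
        @abstractmethod
        def supported_extensions(self) -> List[str]: ...

        @property
        @abstractmethod
        def supported_mime_types(self) -> List[str]: ...

        @property
        @abstractmethod
        def loader_name(self) -> str: ...

        @abstractmethod
        def can_handle(self, extension: str, mime_type: str) -> bool: ...

        @abstractmethod
        async def load(self, file_path: str, **kwargs): ...
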
@@ -50,6 +50,24 @@ class BeautifulSoupCrawler:
         robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
     """

+    @property
+    def supported_extensions(self) -> List[str]:
+        return ["html"]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        pass
+
+    @property
+    def loader_name(self) -> str:
+        return "beautiful_soup_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        pass
+
+    async def load(self, file_path: str, **kwargs):
+        pass
+
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
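
Aside (not part of the commit): supported_mime_types, can_handle, and load are left as pass stubs in this change. A minimal sketch of how they might later be filled in for HTML input, assuming load should return the page's visible text and that the MIME types and dot-less extensions shown here are what the loader registry passes in:

    from typing import List
    from bs4 import BeautifulSoup

    class _HtmlLoaderSketch:
        # Stand-in for BeautifulSoupLoader; only the methods left as `pass` above.

        @property
        def supported_mime_types(self) -> List[str]:
            # Assumed values; the real list depends on cognee's loader registry.
            return ["text/html", "application/xhtml+xml"]

        def can_handle(self, extension: str, mime_type: str) -> bool:
            # Assumes extensions arrive without a leading dot, matching "html" above.
            return extension == "html" or mime_type in self.supported_mime_types

        async def load(self, file_path: str, **kwargs):
            # Read a local HTML file and return its visible text via BeautifulSoup.
            with open(file_path, "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f.read(), "html.parser")
            return soup.get_text(separator=" ", strip=True)
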
@@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag
 BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
 """

-from .bs4_crawler import BeautifulSoupCrawler
 from .utils import fetch_page_content
 from .web_scraper_task import cron_web_scraper_task, web_scraper_task
 from .default_url_crawler import DefaultUrlCrawler
@@ -9,7 +9,6 @@ from re import L
 from typing import List, Union, TypeAlias
 from cognee.shared.logging_utils import get_logger
 from .default_url_crawler import DefaultUrlCrawler
-from .bs4_crawler import BeautifulSoupCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig

 logger = get_logger(__name__)
@@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:

     if os.getenv("TAVILY_API_KEY"):
         logger.info("Using Tavily API for url fetching")
-        return await fetch_with_tavily(urls, tavily_config)
+        return await fetch_with_tavily(urls)
     else:
         logger.info("Using default crawler for content extraction")
@@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
         await crawler.close()


-async def fetch_with_tavily(
-    urls: Union[str, List[str]], tavily_config: TavilyConfig
-) -> UrlsToHtmls:
+async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from URLs using the Tavily API.

     Args:
@@ -112,6 +109,7 @@ async def fetch_with_tavily(
             )
             raise

+    tavily_config = TavilyConfig()
     url_list = [urls] if isinstance(urls, str) else urls
     extract_depth = tavily_config.extract_depth if tavily_config else "basic"
     timeout = tavily_config.timeout if tavily_config else 10
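
Aside (not part of the commit): after this change fetch_with_tavily builds its TavilyConfig internally, so callers only pass URLs. A hypothetical usage sketch going through fetch_page_content, which uses Tavily when TAVILY_API_KEY is set and the default crawler otherwise (the URL is borrowed from the test below; the shape of the result is assumed from the UrlsToHtmls alias):

    import asyncio
    from cognee.tasks.web_scraper import fetch_page_content

    async def main():
        pages = await fetch_page_content("https://en.wikipedia.org/wiki/Large_language_model")
        print(pages)  # UrlsToHtmls, presumably mapping each URL to its fetched HTML

    asyncio.run(main())
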
@@ -1,10 +1,10 @@
 import pytest
-from cognee.tasks.web_scraper import BeautifulSoupCrawler
+from cognee.tasks.web_scraper import DefaultUrlCrawler


 @pytest.mark.asyncio
 async def test_fetch():
-    crawler = BeautifulSoupCrawler()
+    crawler = DefaultUrlCrawler()
     url = "https://en.wikipedia.org/wiki/Large_language_model"
     results = await crawler.fetch_urls(url)
     assert len(results) == 1