From 16e1c609253f74a36061b49e3ef533e9b5490272 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Tue, 21 Oct 2025 16:43:56 +0100
Subject: [PATCH] move bs4 html parsing into `bs4_loader`

---
 .../loaders/external/bs4_loader.py}           | 24 ++++++++++++++++---
 cognee/tasks/web_scraper/__init__.py          |  1 -
 cognee/tasks/web_scraper/utils.py             |  8 +++----
 .../web_url_crawler/test_bs4_crawler.py       |  4 ++--
 4 files changed, 26 insertions(+), 11 deletions(-)
 rename cognee/{tasks/web_scraper/bs4_crawler.py => infrastructure/loaders/external/bs4_loader.py} (89%)

diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/infrastructure/loaders/external/bs4_loader.py
similarity index 89%
rename from cognee/tasks/web_scraper/bs4_crawler.py
rename to cognee/infrastructure/loaders/external/bs4_loader.py
index 171a76633..8022de04f 100644
--- a/cognee/tasks/web_scraper/bs4_crawler.py
+++ b/cognee/infrastructure/loaders/external/bs4_loader.py
@@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages.
 supports robots.txt handling, rate limiting, and custom extraction rules.
 """
 
-from typing import Union, List, Dict, Any, Optional
+from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
+from cognee.infrastructure.loaders import LoaderInterface
 from cognee.shared.logging_utils import get_logger
 
 logger = get_logger(__name__)
@@ -32,8 +33,7 @@ class ExtractionRule:
     join_with: str = " "
 
 
-# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler
-class BeautifulSoupCrawler:
+class BeautifulSoupLoader(LoaderInterface):
     """Crawler for fetching and extracting web content using BeautifulSoup.
 
     Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
@@ -50,6 +50,24 @@ class BeautifulSoupCrawler:
         robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
     """
 
+    @property
+    def supported_extensions(self) -> List[str]:
+        return ["html"]  # extension this loader advertises (no leading dot)
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        return ["text/html"]  # MIME type for the "html" extension; a list, as annotated
+
+    @property
+    def loader_name(self) -> str:
+        return "beautiful_soup_loader"  # identifier string reported for this loader
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        return extension in self.supported_extensions and mime_type in self.supported_mime_types
+
+    async def load(self, file_path: str, **kwargs):
+        raise NotImplementedError("BeautifulSoupLoader.load is not implemented yet")  # fail loudly
+
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
 
diff --git a/cognee/tasks/web_scraper/__init__.py b/cognee/tasks/web_scraper/__init__.py
index f4d6677c7..26c3e68cf 100644
--- a/cognee/tasks/web_scraper/__init__.py
+++ b/cognee/tasks/web_scraper/__init__.py
@@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag
 BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
 """
 
-from .bs4_crawler import BeautifulSoupCrawler
 from .utils import fetch_page_content
 from .web_scraper_task import cron_web_scraper_task, web_scraper_task
 from .default_url_crawler import DefaultUrlCrawler
diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py
index 0cbd355a3..b1cbf82e9 100644
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@@ -9,7 +9,6 @@ from re import L
 from typing import List, Union, TypeAlias
 from cognee.shared.logging_utils import get_logger
 from .default_url_crawler import DefaultUrlCrawler
-from .bs4_crawler import BeautifulSoupCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig
 
 logger = get_logger(__name__)
@@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
 
     if os.getenv("TAVILY_API_KEY"):
         logger.info("Using Tavily API for url fetching")
-        return await fetch_with_tavily(urls, tavily_config)
+        return await fetch_with_tavily(urls)
     else:
         logger.info("Using default crawler for content extraction")
 
@@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
             await crawler.close()
 
 
-async def fetch_with_tavily(
-    urls: Union[str, List[str]], tavily_config: TavilyConfig
-) -> UrlsToHtmls:
+async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from URLs using the Tavily API.
 
     Args:
@@ -112,6 +109,7 @@ async def fetch_with_tavily(
         )
         raise
 
+    tavily_config = TavilyConfig()
     url_list = [urls] if isinstance(urls, str) else urls
     extract_depth = tavily_config.extract_depth if tavily_config else "basic"
     timeout = tavily_config.timeout if tavily_config else 10
diff --git a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py
index 0e7637d86..156cc87a4 100644
--- a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py
+++ b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py
@@ -1,10 +1,10 @@
 import pytest
-from cognee.tasks.web_scraper import BeautifulSoupCrawler
+from cognee.tasks.web_scraper import DefaultUrlCrawler
 
 
 @pytest.mark.asyncio
 async def test_fetch():
-    crawler = BeautifulSoupCrawler()
+    crawler = DefaultUrlCrawler()
     url = "https://en.wikipedia.org/wiki/Large_language_model"
     results = await crawler.fetch_urls(url)
     assert len(results) == 1