move bs4 html parsing into bs4_loader
parent 9d9969676f
commit 16e1c60925

4 changed files with 26 additions and 11 deletions
@@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages.
 supports robots.txt handling, rate limiting, and custom extraction rules.
 """

-from typing import Union, List, Dict, Any, Optional
+from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
+from cognee.infrastructure.loaders import LoaderInterface
 from cognee.shared.logging_utils import get_logger

 logger = get_logger(__name__)
@@ -32,8 +33,7 @@ class ExtractionRule:
     join_with: str = " "


-# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler
-class BeautifulSoupCrawler:
+class BeautifulSoupLoader(LoaderInterface):
     """Crawler for fetching and extracting web content using BeautifulSoup.

     Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
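
Aside (not part of the diff): judging from the properties and methods added in the next hunk, the LoaderInterface contract from cognee.infrastructure.loaders appears to look roughly like the sketch below. This is inferred from the diff, not the library's actual definition.

    # Inferred sketch only; the real LoaderInterface lives in cognee.infrastructure.loaders.
    from abc import ABC, abstractmethod
    from typing import List

    class LoaderInterface(ABC):
        @property
        @abstractmethod
        def supported_extensions(self) -> List[str]: ...

        @property
        @abstractmethod
        def supported_mime_types(self) -> List[str]: ...

        @property
        @abstractmethod
        def loader_name(self) -> str: ...

        @abstractmethod
        def can_handle(self, extension: str, mime_type: str) -> bool: ...

        @abstractmethod
        async def load(self, file_path: str, **kwargs): ...
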
@@ -50,6 +50,24 @@ class BeautifulSoupCrawler:
         robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
     """

+    @property
+    def supported_extensions(self) -> List[str]:
+        return ["html"]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        pass
+
+    @property
+    def loader_name(self) -> str:
+        return "beautiful_soup_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        pass
+
+    async def load(self, file_path: str, **kwargs):
+        pass
+
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
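
Aside (not part of the commit): supported_mime_types, can_handle, and load are left as pass stubs in this change. A minimal sketch of how they might later be filled in for HTML input, assuming load should return the page's visible text and that the MIME types and dot-less extensions shown here are what the loader registry passes in:

    from typing import List
    from bs4 import BeautifulSoup

    class _HtmlLoaderSketch:
        # Stand-in for BeautifulSoupLoader; only the methods left as `pass` above.

        @property
        def supported_mime_types(self) -> List[str]:
            # Assumed values; the real list depends on cognee's loader registry.
            return ["text/html", "application/xhtml+xml"]

        def can_handle(self, extension: str, mime_type: str) -> bool:
            # Assumes extensions arrive without a leading dot, matching "html" above.
            return extension == "html" or mime_type in self.supported_mime_types

        async def load(self, file_path: str, **kwargs):
            # Read a local HTML file and return its visible text via BeautifulSoup.
            with open(file_path, "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f.read(), "html.parser")
            return soup.get_text(separator=" ", strip=True)
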
@@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag
 BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
 """

-from .bs4_crawler import BeautifulSoupCrawler
 from .utils import fetch_page_content
 from .web_scraper_task import cron_web_scraper_task, web_scraper_task
 from .default_url_crawler import DefaultUrlCrawler
@@ -9,7 +9,6 @@ from re import L
 from typing import List, Union, TypeAlias
 from cognee.shared.logging_utils import get_logger
 from .default_url_crawler import DefaultUrlCrawler
-from .bs4_crawler import BeautifulSoupCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig

 logger = get_logger(__name__)
@@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:

     if os.getenv("TAVILY_API_KEY"):
         logger.info("Using Tavily API for url fetching")
-        return await fetch_with_tavily(urls, tavily_config)
+        return await fetch_with_tavily(urls)
     else:
         logger.info("Using default crawler for content extraction")
@@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
         await crawler.close()


-async def fetch_with_tavily(
-    urls: Union[str, List[str]], tavily_config: TavilyConfig
-) -> UrlsToHtmls:
+async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
     """Fetch content from URLs using the Tavily API.

     Args:
@@ -112,6 +109,7 @@ async def fetch_with_tavily(
             )
             raise

+    tavily_config = TavilyConfig()
     url_list = [urls] if isinstance(urls, str) else urls
     extract_depth = tavily_config.extract_depth if tavily_config else "basic"
     timeout = tavily_config.timeout if tavily_config else 10
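
Aside (not part of the commit): after this change fetch_with_tavily builds its TavilyConfig internally, so callers only pass URLs. A hypothetical usage sketch going through fetch_page_content, which uses Tavily when TAVILY_API_KEY is set and the default crawler otherwise (the URL is borrowed from the test below; the shape of the result is assumed from the UrlsToHtmls alias):

    import asyncio
    from cognee.tasks.web_scraper import fetch_page_content

    async def main():
        pages = await fetch_page_content("https://en.wikipedia.org/wiki/Large_language_model")
        print(pages)  # UrlsToHtmls, presumably mapping each URL to its fetched HTML

    asyncio.run(main())
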
@@ -1,10 +1,10 @@
 import pytest
-from cognee.tasks.web_scraper import BeautifulSoupCrawler
+from cognee.tasks.web_scraper import DefaultUrlCrawler


 @pytest.mark.asyncio
 async def test_fetch():
-    crawler = BeautifulSoupCrawler()
+    crawler = DefaultUrlCrawler()
     url = "https://en.wikipedia.org/wiki/Large_language_model"
     results = await crawler.fetch_urls(url)
     assert len(results) == 1