move bs4 html parsing into bs4_loader
This commit is contained in:
parent
9d9969676f
commit
16e1c60925
4 changed files with 26 additions and 11 deletions
|
|
@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages.
|
||||||
supports robots.txt handling, rate limiting, and custom extraction rules.
|
supports robots.txt handling, rate limiting, and custom extraction rules.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Union, List, Dict, Any, Optional
|
from typing import Union, Dict, Any, Optional, List
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from cognee.infrastructure.loaders import LoaderInterface
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
@ -32,8 +33,7 @@ class ExtractionRule:
|
||||||
join_with: str = " "
|
join_with: str = " "
|
||||||
|
|
||||||
|
|
||||||
# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler
|
class BeautifulSoupLoader(LoaderInterface):
|
||||||
class BeautifulSoupCrawler:
|
|
||||||
"""Crawler for fetching and extracting web content using BeautifulSoup.
|
"""Crawler for fetching and extracting web content using BeautifulSoup.
|
||||||
|
|
||||||
Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
|
Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
|
||||||
|
|
@ -50,6 +50,24 @@ class BeautifulSoupCrawler:
|
||||||
robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
|
robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supported_extensions(self) -> List[str]:
|
||||||
|
return ["html"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supported_mime_types(self) -> List[str]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@property
|
||||||
|
def loader_name(self) -> str:
|
||||||
|
return "beautiful_soup_loader"
|
||||||
|
|
||||||
|
def can_handle(self, extension: str, mime_type: str) -> bool:
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def load(self, file_path: str, **kwargs):
|
||||||
|
pass
|
||||||
|
|
||||||
def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
|
def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
|
||||||
"""Normalize an extraction rule to an ExtractionRule dataclass.
|
"""Normalize an extraction rule to an ExtractionRule dataclass.
|
||||||
|
|
||||||
|
|
@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag
|
||||||
BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
|
BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from .bs4_crawler import BeautifulSoupCrawler
|
|
||||||
from .utils import fetch_page_content
|
from .utils import fetch_page_content
|
||||||
from .web_scraper_task import cron_web_scraper_task, web_scraper_task
|
from .web_scraper_task import cron_web_scraper_task, web_scraper_task
|
||||||
from .default_url_crawler import DefaultUrlCrawler
|
from .default_url_crawler import DefaultUrlCrawler
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from re import L
|
||||||
from typing import List, Union, TypeAlias
|
from typing import List, Union, TypeAlias
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
from .default_url_crawler import DefaultUrlCrawler
|
from .default_url_crawler import DefaultUrlCrawler
|
||||||
from .bs4_crawler import BeautifulSoupCrawler
|
|
||||||
from .config import DefaultCrawlerConfig, TavilyConfig
|
from .config import DefaultCrawlerConfig, TavilyConfig
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
|
||||||
|
|
||||||
if os.getenv("TAVILY_API_KEY"):
|
if os.getenv("TAVILY_API_KEY"):
|
||||||
logger.info("Using Tavily API for url fetching")
|
logger.info("Using Tavily API for url fetching")
|
||||||
return await fetch_with_tavily(urls, tavily_config)
|
return await fetch_with_tavily(urls)
|
||||||
else:
|
else:
|
||||||
logger.info("Using default crawler for content extraction")
|
logger.info("Using default crawler for content extraction")
|
||||||
|
|
||||||
|
|
@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
|
||||||
await crawler.close()
|
await crawler.close()
|
||||||
|
|
||||||
|
|
||||||
async def fetch_with_tavily(
|
async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
|
||||||
urls: Union[str, List[str]], tavily_config: TavilyConfig
|
|
||||||
) -> UrlsToHtmls:
|
|
||||||
"""Fetch content from URLs using the Tavily API.
|
"""Fetch content from URLs using the Tavily API.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -112,6 +109,7 @@ async def fetch_with_tavily(
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
tavily_config = TavilyConfig()
|
||||||
url_list = [urls] if isinstance(urls, str) else urls
|
url_list = [urls] if isinstance(urls, str) else urls
|
||||||
extract_depth = tavily_config.extract_depth if tavily_config else "basic"
|
extract_depth = tavily_config.extract_depth if tavily_config else "basic"
|
||||||
timeout = tavily_config.timeout if tavily_config else 10
|
timeout = tavily_config.timeout if tavily_config else 10
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
import pytest
|
import pytest
|
||||||
from cognee.tasks.web_scraper import BeautifulSoupCrawler
|
from cognee.tasks.web_scraper import DefaultUrlCrawler
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_fetch():
|
async def test_fetch():
|
||||||
crawler = BeautifulSoupCrawler()
|
crawler = DefaultUrlCrawler()
|
||||||
url = "https://en.wikipedia.org/wiki/Large_language_model"
|
url = "https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
results = await crawler.fetch_urls(url)
|
results = await crawler.fetch_urls(url)
|
||||||
assert len(results) == 1
|
assert len(results) == 1
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue