Switched to httpx AsyncClient to fetch webpages

This commit is contained in:
Geoff-Robin 2025-10-02 02:01:46 +05:30
parent 60499c439c
commit c283977035

View file

@@ -1,7 +1,7 @@
from tavily import AsyncTavilyClient from tavily import AsyncTavilyClient
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import os import os
import requests import httpx
from typing import Dict, Any, List, Union from typing import Dict, Any, List, Union
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
@@ -42,15 +42,17 @@ async def fetch_with_bs4(urls: Union[str, List[str]], extraction_rules: Dict) ->
result_dict = {} result_dict = {}
if isinstance(urls, str): if isinstance(urls, str):
urls = [urls] urls = [urls]
for url in urls: async with httpx.AsyncClient(headers={"User-Agent": "Cognee-Scraper"}) as client:
response = requests.get(url, headers={"User-Agent": "Cognee-Scraper"}) for url in urls:
response.raise_for_status() response = await client.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
extracted_data = "" extracted_data = ""
for field, selector in extraction_rules.items():
element = soup.select_one(selector)
extracted_data += (element.get_text(strip=True) + "\n") if element else ""
for field, selector in extraction_rules.items(): result_dict[url] = extracted_data.strip()
element = soup.select_one(selector)
extracted_data += element.get_text(strip=True) if element else ""
return result_dict return result_dict