Switched to httpx AsyncClient to fetch webpages
This commit is contained in:
parent
60499c439c
commit
c283977035
1 changed file with 11 additions and 9 deletions
|
|
@@ -1,7 +1,7 @@
|
||||||
from tavily import AsyncTavilyClient
|
from tavily import AsyncTavilyClient
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import os
|
import os
|
||||||
import requests
|
import httpx
|
||||||
from typing import Dict, Any, List, Union
|
from typing import Dict, Any, List, Union
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
|
@@ -42,15 +42,17 @@ async def fetch_with_bs4(urls: Union[str, List[str]], extraction_rules: Dict) ->
|
||||||
result_dict = {}
|
result_dict = {}
|
||||||
if isinstance(urls, str):
|
if isinstance(urls, str):
|
||||||
urls = [urls]
|
urls = [urls]
|
||||||
for url in urls:
|
async with httpx.AsyncClient(headers={"User-Agent": "Cognee-Scraper"}) as client:
|
||||||
response = requests.get(url, headers={"User-Agent": "Cognee-Scraper"})
|
for url in urls:
|
||||||
response.raise_for_status()
|
response = await client.get(url)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
extracted_data = ""
|
extracted_data = ""
|
||||||
|
for field, selector in extraction_rules.items():
|
||||||
|
element = soup.select_one(selector)
|
||||||
|
extracted_data += (element.get_text(strip=True) + "\n") if element else ""
|
||||||
|
|
||||||
for field, selector in extraction_rules.items():
|
result_dict[url] = extracted_data.strip()
|
||||||
element = soup.select_one(selector)
|
|
||||||
extracted_data += element.get_text(strip=True) if element else ""
|
|
||||||
|
|
||||||
return result_dict
|
return result_dict
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue