From c2839770356ff09e6621a3ce27e9cd2f068a78e9 Mon Sep 17 00:00:00 2001 From: Geoff-Robin Date: Thu, 2 Oct 2025 02:01:46 +0530 Subject: [PATCH] switched to httpx AsyncClient to fetch webpage --- cognee/tasks/web_scraper/utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 7c90a87a1..5ff105ec9 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -1,7 +1,7 @@ from tavily import AsyncTavilyClient from bs4 import BeautifulSoup import os -import requests +import httpx from typing import Dict, Any, List, Union from cognee.shared.logging_utils import get_logger @@ -42,15 +42,17 @@ async def fetch_with_bs4(urls: Union[str, List[str]], extraction_rules: Dict) -> result_dict = {} if isinstance(urls, str): urls = [urls] - for url in urls: - response = requests.get(url, headers={"User-Agent": "Cognee-Scraper"}) - response.raise_for_status() + async with httpx.AsyncClient(headers={"User-Agent": "Cognee-Scraper"}) as client: + for url in urls: + response = await client.get(url) + response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - extracted_data = "" + soup = BeautifulSoup(response.text, "html.parser") + extracted_data = "" + for field, selector in extraction_rules.items(): + element = soup.select_one(selector) + extracted_data += (element.get_text(strip=True) + "\n") if element else "" - for field, selector in extraction_rules.items(): - element = soup.select_one(selector) - extracted_data += element.get_text(strip=True) if element else "" + result_dict[url] = extracted_data.strip() return result_dict