From c2aa95521c2f98fe1a8ff501652503958b4b8ac6 Mon Sep 17 00:00:00 2001 From: Geoff-Robin Date: Sun, 5 Oct 2025 20:00:19 +0530 Subject: [PATCH] removed structured argument --- cognee/tasks/web_scraper/bs4_crawler.py | 17 +++++------------ cognee/tasks/web_scraper/config.py | 1 - cognee/tasks/web_scraper/utils.py | 1 - 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py index 514567317..19631a9fa 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/tasks/web_scraper/bs4_crawler.py @@ -262,11 +262,10 @@ class BeautifulSoupCrawler: use_playwright: bool = False, playwright_js_wait: float = 0.8, join_all_matches: bool = False, - structured: bool = False, # return structured output instead of concatenated string - ) -> Dict[str, Union[str, Dict[str, str]]]: + ) -> Dict[str, str]: """ Fetch one or more URLs and extract text using BeautifulSoup (or lxml xpath). - Returns: dict[url] -> concatenated string OR structured dict depending on `structured`. + Returns: dict[url] -> concatenated string of extracted content. """ if isinstance(urls, str): urls = [urls] @@ -284,7 +283,7 @@ class BeautifulSoupCrawler: allowed = await self._is_url_allowed(url) if not allowed: logger.warning(f"URL disallowed by robots.txt: {url}") - return url, "" if not structured else {} + return url, "" # fetch (rendered or not) if use_playwright: @@ -294,12 +293,6 @@ class BeautifulSoupCrawler: else: html = await self._fetch_httpx(url) - if structured: - return url, { - field: self._extract_with_bs4(html, rule) - for field, rule in normalized_rules.items() - } - pieces = [] for field, rule in normalized_rules.items(): text = self._extract_with_bs4(html, rule) @@ -314,8 +307,8 @@ class BeautifulSoupCrawler: try: url, text = await coro except Exception as e: - results[url] = {} if structured else "" + results[url] = "" logger.error(f"Error processing {url}: {e}") continue results[url] = text - return results + return results \ No newline at end of file diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index 505cef1df..4819800ab 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -21,4 +21,3 @@ class SoupCrawlerConfig(BaseModel): use_playwright: bool = False playwright_js_wait: float = 0.8 join_all_matches: bool = False - structured: bool = False diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index d6b57d615..c1b6ecb53 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -54,7 +54,6 @@ async def fetch_page_content( use_playwright=soup_crawler_config.use_playwright, playwright_js_wait=soup_crawler_config.playwright_js_wait, join_all_matches=soup_crawler_config.join_all_matches, - structured=soup_crawler_config.structured, ) return results except Exception as e: