removed structured argument

2025-10-05 20:00:19 +05:30 · 2025-10-05 20:00:19 +05:30 · c2aa95521c
commit c2aa95521c
parent 2cba31a086
3 changed files with 5 additions and 14 deletions
--- a/cognee/tasks/web_scraper/bs4_crawler.py
+++ b/cognee/tasks/web_scraper/bs4_crawler.py
@@ -262,11 +262,10 @@ class BeautifulSoupCrawler:
        use_playwright: bool = False,
        playwright_js_wait: float = 0.8,
        join_all_matches: bool = False,
-        structured: bool = False,  # return structured output instead of concatenated string
+    ) -> Dict[str, str]:
-    ) -> Dict[str, Union[str, Dict[str, str]]]:
        """
        Fetch one or more URLs and extract text using BeautifulSoup (or lxml xpath).
-        Returns: dict[url] -> concatenated string OR structured dict depending on `structured`.
+        Returns: dict[url] -> concatenated string of extracted content.
        """
        if isinstance(urls, str):
            urls = [urls]
@@ -284,7 +283,7 @@ class BeautifulSoupCrawler:
                allowed = await self._is_url_allowed(url)
                if not allowed:
                    logger.warning(f"URL disallowed by robots.txt: {url}")
-                    return url, "" if not structured else {}
+                    return url, ""
                # fetch (rendered or not)
                if use_playwright:
@@ -294,12 +293,6 @@ class BeautifulSoupCrawler:
                else:
                    html = await self._fetch_httpx(url)
-                if structured:
-                    return url, {
-                        field: self._extract_with_bs4(html, rule)
-                        for field, rule in normalized_rules.items()
-                    }
                pieces = []
                for field, rule in normalized_rules.items():
                    text = self._extract_with_bs4(html, rule)
@@ -314,8 +307,8 @@ class BeautifulSoupCrawler:
            try:
                url, text = await coro
            except Exception as e:
-                results[url] = {} if structured else ""
+                results[url] = ""
                logger.error(f"Error processing {url}: {e}")
                continue
            results[url] = text
-        return results
+        return results
--- a/cognee/tasks/web_scraper/config.py
+++ b/cognee/tasks/web_scraper/config.py
@@ -21,4 +21,3 @@ class SoupCrawlerConfig(BaseModel):
    use_playwright: bool = False
    playwright_js_wait: float = 0.8
    join_all_matches: bool = False
-    structured: bool = False
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@@ -54,7 +54,6 @@ async def fetch_page_content(
                use_playwright=soup_crawler_config.use_playwright,
                playwright_js_wait=soup_crawler_config.playwright_js_wait,
                join_all_matches=soup_crawler_config.join_all_matches,
-                structured=soup_crawler_config.structured,
            )
            return results
        except Exception as e: