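Remove the `structured` argument from the BeautifulSoupCrawler fetch method, the SoupCrawlerConfig model, and the fetch_page_content call; the crawler now always returns a concatenated string per URL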

This commit is contained in:
Geoff-Robin 2025-10-05 20:00:19 +05:30
parent 2cba31a086
commit c2aa95521c
3 changed files with 5 additions and 14 deletions

View file

@ -262,11 +262,10 @@ class BeautifulSoupCrawler:
use_playwright: bool = False,
playwright_js_wait: float = 0.8,
join_all_matches: bool = False,
structured: bool = False, # return structured output instead of concatenated string
) -> Dict[str, Union[str, Dict[str, str]]]:
) -> Dict[str, str]:
"""
Fetch one or more URLs and extract text using BeautifulSoup (or lxml xpath).
Returns: dict[url] -> concatenated string OR structured dict depending on `structured`.
Returns: dict[url] -> concatenated string of extracted content.
"""
if isinstance(urls, str):
urls = [urls]
@ -284,7 +283,7 @@ class BeautifulSoupCrawler:
allowed = await self._is_url_allowed(url)
if not allowed:
logger.warning(f"URL disallowed by robots.txt: {url}")
return url, "" if not structured else {}
return url, ""
# fetch (rendered or not)
if use_playwright:
@ -294,12 +293,6 @@ class BeautifulSoupCrawler:
else:
html = await self._fetch_httpx(url)
if structured:
return url, {
field: self._extract_with_bs4(html, rule)
for field, rule in normalized_rules.items()
}
pieces = []
for field, rule in normalized_rules.items():
text = self._extract_with_bs4(html, rule)
@ -314,8 +307,8 @@ class BeautifulSoupCrawler:
try:
url, text = await coro
except Exception as e:
results[url] = {} if structured else ""
results[url] = ""
logger.error(f"Error processing {url}: {e}")
continue
results[url] = text
return results
return results

View file

@ -21,4 +21,3 @@ class SoupCrawlerConfig(BaseModel):
use_playwright: bool = False
playwright_js_wait: float = 0.8
join_all_matches: bool = False
structured: bool = False

View file

@ -54,7 +54,6 @@ async def fetch_page_content(
use_playwright=soup_crawler_config.use_playwright,
playwright_js_wait=soup_crawler_config.playwright_js_wait,
join_all_matches=soup_crawler_config.join_all_matches,
structured=soup_crawler_config.structured,
)
return results
except Exception as e: