Remove the `structured` argument from the crawler, its config, and the fetch call
This commit is contained in:
parent
2cba31a086
commit
c2aa95521c
3 changed files with 5 additions and 14 deletions
|
|
@@ -262,11 +262,10 @@ class BeautifulSoupCrawler:
|
||||||
use_playwright: bool = False,
|
use_playwright: bool = False,
|
||||||
playwright_js_wait: float = 0.8,
|
playwright_js_wait: float = 0.8,
|
||||||
join_all_matches: bool = False,
|
join_all_matches: bool = False,
|
||||||
structured: bool = False, # return structured output instead of concatenated string
|
) -> Dict[str, str]:
|
||||||
) -> Dict[str, Union[str, Dict[str, str]]]:
|
|
||||||
"""
|
"""
|
||||||
Fetch one or more URLs and extract text using BeautifulSoup (or lxml xpath).
|
Fetch one or more URLs and extract text using BeautifulSoup (or lxml xpath).
|
||||||
Returns: dict[url] -> concatenated string OR structured dict depending on `structured`.
|
Returns: dict[url] -> concatenated string of extracted content.
|
||||||
"""
|
"""
|
||||||
if isinstance(urls, str):
|
if isinstance(urls, str):
|
||||||
urls = [urls]
|
urls = [urls]
|
||||||
|
|
@@ -284,7 +283,7 @@ class BeautifulSoupCrawler:
|
||||||
allowed = await self._is_url_allowed(url)
|
allowed = await self._is_url_allowed(url)
|
||||||
if not allowed:
|
if not allowed:
|
||||||
logger.warning(f"URL disallowed by robots.txt: {url}")
|
logger.warning(f"URL disallowed by robots.txt: {url}")
|
||||||
return url, "" if not structured else {}
|
return url, ""
|
||||||
|
|
||||||
# fetch (rendered or not)
|
# fetch (rendered or not)
|
||||||
if use_playwright:
|
if use_playwright:
|
||||||
|
|
@@ -294,12 +293,6 @@ class BeautifulSoupCrawler:
|
||||||
else:
|
else:
|
||||||
html = await self._fetch_httpx(url)
|
html = await self._fetch_httpx(url)
|
||||||
|
|
||||||
if structured:
|
|
||||||
return url, {
|
|
||||||
field: self._extract_with_bs4(html, rule)
|
|
||||||
for field, rule in normalized_rules.items()
|
|
||||||
}
|
|
||||||
|
|
||||||
pieces = []
|
pieces = []
|
||||||
for field, rule in normalized_rules.items():
|
for field, rule in normalized_rules.items():
|
||||||
text = self._extract_with_bs4(html, rule)
|
text = self._extract_with_bs4(html, rule)
|
||||||
|
|
@@ -314,8 +307,8 @@ class BeautifulSoupCrawler:
|
||||||
try:
|
try:
|
||||||
url, text = await coro
|
url, text = await coro
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
results[url] = {} if structured else ""
|
results[url] = ""
|
||||||
logger.error(f"Error processing {url}: {e}")
|
logger.error(f"Error processing {url}: {e}")
|
||||||
continue
|
continue
|
||||||
results[url] = text
|
results[url] = text
|
||||||
return results
|
return results
|
||||||
|
|
@@ -21,4 +21,3 @@ class SoupCrawlerConfig(BaseModel):
|
||||||
use_playwright: bool = False
|
use_playwright: bool = False
|
||||||
playwright_js_wait: float = 0.8
|
playwright_js_wait: float = 0.8
|
||||||
join_all_matches: bool = False
|
join_all_matches: bool = False
|
||||||
structured: bool = False
|
|
||||||
|
|
|
||||||
|
|
@@ -54,7 +54,6 @@ async def fetch_page_content(
|
||||||
use_playwright=soup_crawler_config.use_playwright,
|
use_playwright=soup_crawler_config.use_playwright,
|
||||||
playwright_js_wait=soup_crawler_config.playwright_js_wait,
|
playwright_js_wait=soup_crawler_config.playwright_js_wait,
|
||||||
join_all_matches=soup_crawler_config.join_all_matches,
|
join_all_matches=soup_crawler_config.join_all_matches,
|
||||||
structured=soup_crawler_config.structured,
|
|
||||||
)
|
)
|
||||||
return results
|
return results
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue