Make BS4 loader compatible with Tavily fetcher
This commit is contained in:
parent
20c9e5498b
commit
10e4fd7681
1 changed file with 17 additions and 0 deletions
|
|
@@ -115,6 +115,23 @@ class BeautifulSoupLoader(LoaderInterface):
|
|||
|
||||
full_content = " ".join(pieces).strip()
|
||||
|
||||
# Fallback: If no content extracted, check if the file is plain text (not HTML)
|
||||
if not full_content:
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
# If there are no HTML tags, treat as plain text
|
||||
if not soup.find():
|
||||
logger.warning(
|
||||
f"No HTML tags found in {file_path}. Treating as plain text. "
|
||||
"This may happen when content is pre-extracted (e.g., via Tavily with text format)."
|
||||
)
|
||||
full_content = html.decode("utf-8") if isinstance(html, bytes) else html
|
||||
full_content = full_content.strip()
|
||||
|
||||
if not full_content:
|
||||
logger.warning(f"No content extracted from HTML file: {file_path}")
|
||||
|
||||
# Store the extracted content
|
||||
storage_config = get_storage_config()
|
||||
data_root_directory = storage_config["data_root_directory"]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue