Make BS4 loader compatible with Tavily fetcher

This commit is contained in:
Daulet Amirkhanov 2025-10-21 23:46:21 +01:00
parent 20c9e5498b
commit 10e4fd7681

View file

@ -115,6 +115,23 @@ class BeautifulSoupLoader(LoaderInterface):
full_content = " ".join(pieces).strip()
# Fallback: If no content extracted, check if the file is plain text (not HTML)
if not full_content:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
# If there are no HTML tags, treat as plain text
if not soup.find():
logger.warning(
f"No HTML tags found in {file_path}. Treating as plain text. "
"This may happen when content is pre-extracted (e.g., via Tavily with text format)."
)
full_content = html.decode("utf-8") if isinstance(html, bytes) else html
full_content = full_content.strip()
if not full_content:
logger.warning(f"No content extracted from HTML file: {file_path}")
# Store the extracted content
storage_config = get_storage_config()
data_root_directory = storage_config["data_root_directory"]