Make BS4 loader compatible with Tavily fetcher

2025-10-21 23:46:21 +01:00 · 2025-10-21 23:46:21 +01:00 · 10e4fd7681
commit 10e4fd7681
parent 20c9e5498b
1 changed files with 17 additions and 0 deletions
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -115,6 +115,23 @@ class BeautifulSoupLoader(LoaderInterface):
        full_content = " ".join(pieces).strip()
        # Fallback: If no content extracted, check if the file is plain text (not HTML)
        if not full_content:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            # If there are no HTML tags, treat as plain text
            if not soup.find():
                logger.warning(
                    f"No HTML tags found in {file_path}. Treating as plain text. "
                    "This may happen when content is pre-extracted (e.g., via Tavily with text format)."
                )
                full_content = html.decode("utf-8") if isinstance(html, bytes) else html
                full_content = full_content.strip()
        if not full_content:
            logger.warning(f"No content extracted from HTML file: {file_path}")
        # Store the extracted content
        storage_config = get_storage_config()
        data_root_directory = storage_config["data_root_directory"]