Make BS4 loader compatible with Tavily fetcher

2025-10-21 23:46:21 +01:00 · 2025-10-21 23:46:21 +01:00 · 10e4fd7681
commit 10e4fd7681
parent 20c9e5498b
1 changed file with 17 additions and 0 deletions
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -115,6 +115,23 @@ class BeautifulSoupLoader(LoaderInterface):

        full_content = " ".join(pieces).strip()

+        # Fallback: If no content extracted, check if the file is plain text (not HTML)
+        if not full_content:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(html, "html.parser")
+            # If there are no HTML tags, treat as plain text
+            if not soup.find():
+                logger.warning(
+                    f"No HTML tags found in {file_path}. Treating as plain text. "
+                    "This may happen when content is pre-extracted (e.g., via Tavily with text format)."
+                )
+                full_content = html.decode("utf-8") if isinstance(html, bytes) else html
+                full_content = full_content.strip()
+
+        if not full_content:
+            logger.warning(f"No content extracted from HTML file: {file_path}")
+
        # Store the extracted content
        storage_config = get_storage_config()
        data_root_directory = storage_config["data_root_directory"]