From ab6a0ef11c2de99e035a644a8b7045a1c905cd70 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Wed, 22 Oct 2025 12:39:50 +0100
Subject: [PATCH] beautiful soup loader: define default comprehensive
 extraction_rules

---
 .../loaders/external/beautiful_soup_loader.py | 94 ++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
index 04954a228..5ed0b16c0 100644
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -66,6 +66,96 @@ class BeautifulSoupLoader(LoaderInterface):
         can = extension in self.supported_extensions and mime_type in self.supported_mime_types
         return can
 
+    def _get_default_extraction_rules(self) -> dict:
+        """Return the default extraction rules used when the caller supplies none.
+
+        Maps a rule name to a dict holding a CSS ``selector`` and an ``all`` flag, plus
+        optional ``attr`` and ``join_with`` keys. Every selector must be valid CSS.
+        """
+        return {
+            # Meta information
+            "title": {"selector": "title", "all": False},
+            "meta_description": {
+                "selector": "meta[name='description']",
+                "attr": "content",
+                "all": False,
+            },
+            "meta_keywords": {
+                "selector": "meta[name='keywords']",
+                "attr": "content",
+                "all": False,
+            },
+            # Open Graph meta tags
+            "og_title": {
+                "selector": "meta[property='og:title']",
+                "attr": "content",
+                "all": False,
+            },
+            "og_description": {
+                "selector": "meta[property='og:description']",
+                "attr": "content",
+                "all": False,
+            },
+            # Main content areas (prioritized selectors)
+            "article": {"selector": "article", "all": True, "join_with": "\n\n"},
+            "main": {"selector": "main", "all": True, "join_with": "\n\n"},
+            # Semantic content sections
+            "headers_h1": {"selector": "h1", "all": True, "join_with": "\n"},
+            "headers_h2": {"selector": "h2", "all": True, "join_with": "\n"},
+            "headers_h3": {"selector": "h3", "all": True, "join_with": "\n"},
+            "headers_h4": {"selector": "h4", "all": True, "join_with": "\n"},
+            "headers_h5": {"selector": "h5", "all": True, "join_with": "\n"},
+            "headers_h6": {"selector": "h6", "all": True, "join_with": "\n"},
+            # Text content
+            "paragraphs": {"selector": "p", "all": True, "join_with": "\n\n"},
+            "blockquotes": {"selector": "blockquote", "all": True, "join_with": "\n\n"},
+            "preformatted": {"selector": "pre", "all": True, "join_with": "\n\n"},
+            # Lists
+            "ordered_lists": {"selector": "ol", "all": True, "join_with": "\n"},
+            "unordered_lists": {"selector": "ul", "all": True, "join_with": "\n"},
+            "list_items": {"selector": "li", "all": True, "join_with": "\n"},
+            "definition_lists": {"selector": "dl", "all": True, "join_with": "\n"},
+            # Tables
+            "tables": {"selector": "table", "all": True, "join_with": "\n\n"},
+            "table_captions": {
+                "selector": "caption",
+                "all": True,
+                "join_with": "\n",
+            },
+            # Code blocks
+            "code_blocks": {"selector": "code", "all": True, "join_with": "\n"},
+            # Figures and media descriptions
+            "figures": {"selector": "figure", "all": True, "join_with": "\n\n"},
+            "figcaptions": {"selector": "figcaption", "all": True, "join_with": "\n"},
+            "image_alts": {"selector": "img", "attr": "alt", "all": True, "join_with": " "},
+            # Links (text content, not URLs to avoid clutter)
+            "link_text": {"selector": "a", "all": True, "join_with": " "},
+            # Emphasized text
+            "strong": {"selector": "strong", "all": True, "join_with": " "},
+            "emphasis": {"selector": "em", "all": True, "join_with": " "},
+            "marked": {"selector": "mark", "all": True, "join_with": " "},
+            # Time and data elements
+            "time": {"selector": "time", "all": True, "join_with": " "},
+            "data": {"selector": "data", "all": True, "join_with": " "},
+            # Sections and semantic structure
+            "sections": {"selector": "section", "all": True, "join_with": "\n\n"},
+            "asides": {"selector": "aside", "all": True, "join_with": "\n\n"},
+            "details": {"selector": "details", "all": True, "join_with": "\n"},
+            "summary": {"selector": "summary", "all": True, "join_with": "\n"},
+            # Navigation (may contain important links/structure)
+            "nav": {"selector": "nav", "all": True, "join_with": "\n"},
+            # Footer information
+            "footer": {"selector": "footer", "all": True, "join_with": "\n"},
+            # Divs with specific content roles
+            "content_divs": {
+                "selector": "div[role='main'], div[role='article'], div.content, div#content",
+                "all": True,
+                "join_with": "\n\n",
+            },
+            # NOTE: no "span[data-*]" rule: CSS cannot wildcard attribute names, so that
+            # selector is malformed and soupsieve raises SelectorSyntaxError for it.
+        }
+
     async def load(
         self,
         file_path: str,
@@ -85,7 +175,8 @@ class BeautifulSoupLoader(LoaderInterface):
             Path to the stored extracted text file
         """
         if extraction_rules is None:
-            raise ValueError("extraction_rules required for BeautifulSoupLoader")
+            extraction_rules = self._get_default_extraction_rules()
+            logger.info("Using default comprehensive extraction rules for HTML content")
 
         logger.info(f"Processing HTML file: {file_path}")
 
@@ -115,6 +206,7 @@ class BeautifulSoupLoader(LoaderInterface):
 
         full_content = " ".join(pieces).strip()
 
+        # TODO: remove the fallback below once default extraction rules have landed
         # Fallback: If no content extracted, check if the file is plain text (not HTML)
         if not full_content:
             from bs4 import BeautifulSoup