From 344fbbdc293a758f7e6c5f769504074839b92351 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Wed, 22 Oct 2025 12:35:11 +0100
Subject: [PATCH 1/7] refactor: make `preferred_loaders` easier to define on
 user-facing API

---
 cognee/api/v1/add/add.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py
index 73a3081be..3f3331899 100644
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@@ -23,7 +23,7 @@ async def add(
     vector_db_config: dict = None,
     graph_db_config: dict = None,
     dataset_id: Optional[UUID] = None,
-    preferred_loaders: dict[str, dict[str, Any]] = None,
+    preferred_loaders: Optional[List[Union[str, dict[str, dict[str, Any]]]]] = None,
     incremental_loading: bool = True,
     data_per_batch: Optional[int] = 20,
 ):
@@ -164,6 +164,15 @@ async def add(
         - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
 
     """
+    if preferred_loaders:
+        transformed = {}
+        for item in preferred_loaders:
+            if isinstance(item, dict):
+                transformed.update(item)
+            else:
+                transformed[item] = {}
+        preferred_loaders = transformed
+
     tasks = [
         Task(resolve_data_directories, include_subdirectories=True),
         Task(

From ab6a0ef11c2de99e035a644a8b7045a1c905cd70 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Wed, 22 Oct 2025 12:39:50 +0100
Subject: [PATCH 2/7] beautiful soup loader: define default comprehensive
 extraction_rules

---
 .../loaders/external/beautiful_soup_loader.py | 94 ++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
index 04954a228..5ed0b16c0 100644
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -66,6 +66,96 @@ class BeautifulSoupLoader(LoaderInterface):
         can = extension in self.supported_extensions and mime_type in self.supported_mime_types
         return can
 
+    def _get_default_extraction_rules(self):
+        # Comprehensive default extraction rules for common HTML content
+        return {
+            # Meta information
+            "title": {"selector": "title", "all": False},
+            "meta_description": {
+                "selector": "meta[name='description']",
+                "attr": "content",
+                "all": False,
+            },
+            "meta_keywords": {
+                "selector": "meta[name='keywords']",
+                "attr": "content",
+                "all": False,
+            },
+            # Open Graph meta tags
+            "og_title": {
+                "selector": "meta[property='og:title']",
+                "attr": "content",
+                "all": False,
+            },
+            "og_description": {
+                "selector": "meta[property='og:description']",
+                "attr": "content",
+                "all": False,
+            },
+            # Main content areas (prioritized selectors)
+            "article": {"selector": "article", "all": True, "join_with": "\n\n"},
+            "main": {"selector": "main", "all": True, "join_with": "\n\n"},
+            # Semantic content sections
+            "headers_h1": {"selector": "h1", "all": True, "join_with": "\n"},
+            "headers_h2": {"selector": "h2", "all": True, "join_with": "\n"},
+            "headers_h3": {"selector": "h3", "all": True, "join_with": "\n"},
+            "headers_h4": {"selector": "h4", "all": True, "join_with": "\n"},
+            "headers_h5": {"selector": "h5", "all": True, "join_with": "\n"},
+            "headers_h6": {"selector": "h6", "all": True, "join_with": "\n"},
+            # Text content
+            "paragraphs": {"selector": "p", "all": True, "join_with": "\n\n"},
+            "blockquotes": {"selector": "blockquote", "all": True, "join_with": "\n\n"},
+            "preformatted": {"selector": "pre", "all": True, "join_with": "\n\n"},
+            # Lists
+            "ordered_lists": {"selector": "ol", "all": True, "join_with": "\n"},
+            "unordered_lists": {"selector": "ul", "all": True, "join_with": "\n"},
+            "list_items": {"selector": "li", "all": True, "join_with": "\n"},
+            "definition_lists": {"selector": "dl", "all": True, "join_with": "\n"},
+            # Tables
+            "tables": {"selector": "table", "all": True, "join_with": "\n\n"},
+            "table_captions": {
+                "selector": "caption",
+                "all": True,
+                "join_with": "\n",
+            },
+            # Code blocks
+            "code_blocks": {"selector": "code", "all": True, "join_with": "\n"},
+            # Figures and media descriptions
+            "figures": {"selector": "figure", "all": True, "join_with": "\n\n"},
+            "figcaptions": {"selector": "figcaption", "all": True, "join_with": "\n"},
+            "image_alts": {"selector": "img", "attr": "alt", "all": True, "join_with": " "},
+            # Links (text content, not URLs to avoid clutter)
+            "link_text": {"selector": "a", "all": True, "join_with": " "},
+            # Emphasized text
+            "strong": {"selector": "strong", "all": True, "join_with": " "},
+            "emphasis": {"selector": "em", "all": True, "join_with": " "},
+            "marked": {"selector": "mark", "all": True, "join_with": " "},
+            # Time and data elements
+            "time": {"selector": "time", "all": True, "join_with": " "},
+            "data": {"selector": "data", "all": True, "join_with": " "},
+            # Sections and semantic structure
+            "sections": {"selector": "section", "all": True, "join_with": "\n\n"},
+            "asides": {"selector": "aside", "all": True, "join_with": "\n\n"},
+            "details": {"selector": "details", "all": True, "join_with": "\n"},
+            "summary": {"selector": "summary", "all": True, "join_with": "\n"},
+            # Navigation (may contain important links/structure)
+            "nav": {"selector": "nav", "all": True, "join_with": "\n"},
+            # Footer information
+            "footer": {"selector": "footer", "all": True, "join_with": "\n"},
+            # Divs with specific content roles
+            "content_divs": {
+                "selector": "div[role='main'], div[role='article'], div.content, div#content",
+                "all": True,
+                "join_with": "\n\n",
+            },
+            # Spans with data (fallback for inline content)
+            "data_spans": {
+                "selector": "span[data-*]",
+                "all": True,
+                "join_with": " ",
+            },
+        }
+
     async def load(
         self,
         file_path: str,
@@ -85,7 +175,8 @@ class BeautifulSoupLoader(LoaderInterface):
             Path to the stored extracted text file
         """
         if extraction_rules is None:
-            raise ValueError("extraction_rules required for BeautifulSoupLoader")
+            extraction_rules = self._get_default_extraction_rules()
+            logger.info("Using default comprehensive extraction rules for HTML content")
 
         logger.info(f"Processing HTML file: {file_path}")
 
@@ -115,6 +206,7 @@ class BeautifulSoupLoader(LoaderInterface):
 
         full_content = " ".join(pieces).strip()
 
+        # TODO: remove this fallback once default extraction rules are in place
         # Fallback: If no content extracted, check if the file is plain text (not HTML)
         if not full_content:
             from bs4 import BeautifulSoup

From 5288ab4ab49333843813c53c39be4f9a883c2f89 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Wed, 22 Oct 2025 13:01:06 +0100
Subject: [PATCH 3/7] tests: fix failing tests

---
 .../loaders/external/beautiful_soup_loader.py         |  6 ------
 .../web_url_crawler/test_url_adding_e2e.py            | 11 +++++------
 2 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
index 5ed0b16c0..8bea8fb6e 100644
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -148,12 +148,6 @@ class BeautifulSoupLoader(LoaderInterface):
                 "all": True,
                 "join_with": "\n\n",
             },
-            # Spans with data (fallback for inline content)
-            "data_spans": {
-                "selector": "span[data-*]",
-                "all": True,
-                "join_with": " ",
-            },
         }
 
     async def load(
diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
index afe2dce7f..c098f5928 100644
--- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
+++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
@@ -185,7 +185,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov
 
 
 @pytest.mark.asyncio
-async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
+async def test_beautiful_soup_loader_works_with_and_without_arguments():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
@@ -203,11 +203,10 @@ async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
         bs_loader = BeautifulSoupLoader()
         loader_engine.register_loader(bs_loader)
         preferred_loaders = {"beautiful_soup_loader": {}}
-        with pytest.raises(ValueError):
-            await loader_engine.load_file(
-                file_path,
-                preferred_loaders=preferred_loaders,
-            )
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
         extraction_rules = {
             "title": {"selector": "title"},
             "headings": {"selector": "h1, h2, h3", "all": True},

From 925323fb35e7d9b56b30087f4b7b5e4bb99af61b Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Wed, 22 Oct 2025 13:08:08 +0100
Subject: [PATCH 4/7] add test for `cognee.add()` when tavily is used

---
 .../web_url_crawler/test_url_adding_e2e.py      | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
index c098f5928..7be0df341 100644
--- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
+++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
@@ -1,3 +1,4 @@
+import os
 import pytest
 import cognee
 from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
@@ -67,6 +68,22 @@ async def test_add_url():
     await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
 
 
+skip_in_ci = pytest.mark.skipif(
+    os.getenv("GITHUB_ACTIONS") == "true",
+    reason="Skipping in Github for now - before we get TAVILY_API_KEY",
+)
+
+
+@skip_in_ci
+@pytest.mark.asyncio
+async def test_add_url_with_tavily():
+    assert os.getenv("TAVILY_API_KEY") is not None
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
 @pytest.mark.asyncio
 async def test_add_url_without_incremental_loading():
     await cognee.prune.prune_data()

From b9afc54233cbe3c1d567cb18b4c60107b413fd29 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Wed, 22 Oct 2025 13:15:15 +0100
Subject: [PATCH 5/7] add test cases for tavily

---
 .../web_url_crawler/test_url_adding_e2e.py    | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
index 7be0df341..d91b075aa 100644
--- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
+++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
@@ -26,6 +26,13 @@ async def test_url_saves_as_html_file():
         pytest.fail(f"Failed to save data item to storage: {e}")
 
 
+skip_for_tavily = pytest.mark.skipif(
+    os.getenv("TAVILY_API_KEY") is not None,
+    reason="Skipping as Tavily already handles parsing and outputs text",
+)
+
+
+@skip_for_tavily
 @pytest.mark.asyncio
 async def test_saved_html_is_valid():
     try:
@@ -113,7 +120,18 @@ async def test_add_url_with_incremental_loading():
 
 
 @pytest.mark.asyncio
-async def test_add_url_with_extraction_rules():  # TODO: this'll fail due to not implemented `load()` yet
+async def test_add_url_can_define_preferred_loader_as_list_of_str():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add(
+        "https://en.wikipedia.org/wiki/Large_language_model",
+        preferred_loaders=["beautiful_soup_loader"],
+    )
+
+
+@pytest.mark.asyncio
+async def test_add_url_with_extraction_rules():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 

From a499bd4d3f98468defc1706e65452ff89a50033d Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Wed, 22 Oct 2025 17:10:39 +0100
Subject: [PATCH 6/7] ruff format

---
 .../infrastructure/databases/vector/lancedb/LanceDBAdapter.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
index 31243306d..30631ac4c 100644
--- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
+++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
@@ -181,7 +181,7 @@ class LanceDBAdapter(VectorDBInterface):
         def create_lance_data_point(data_point: DataPoint, vector: list[float]) -> LanceDataPoint:
             properties = get_own_properties(data_point)
             properties["id"] = str(properties["id"])
-            
+
             return LanceDataPoint[str, self.get_data_point_schema(type(data_point))](
                 id=str(data_point.id),
                 vector=vector,

From 90118562d8d3e40ef8f7626b65affa7ce772c62c Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov <damirkhanov01@gmail.com>
Date: Wed, 22 Oct 2025 17:19:36 +0100
Subject: [PATCH 7/7] `cognee.add()` - add more explicit check for empty
 `preferred_loaders` param

---
 cognee/api/v1/add/add.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py
index 3f3331899..a521b316b 100644
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@@ -164,7 +164,7 @@ async def add(
         - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
 
     """
-    if preferred_loaders:
+    if preferred_loaders is not None:
         transformed = {}
         for item in preferred_loaders:
             if isinstance(item, dict):