From 344fbbdc293a758f7e6c5f769504074839b92351 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 22 Oct 2025 12:35:11 +0100 Subject: [PATCH 1/7] refactor: make `preferred_loaders` easier to define on user facing api --- cognee/api/v1/add/add.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 73a3081be..3f3331899 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -23,7 +23,7 @@ async def add( vector_db_config: dict = None, graph_db_config: dict = None, dataset_id: Optional[UUID] = None, - preferred_loaders: dict[str, dict[str, Any]] = None, + preferred_loaders: Optional[List[Union[str, dict[str, dict[str, Any]]]]] = None, incremental_loading: bool = True, data_per_batch: Optional[int] = 20, ): @@ -164,6 +164,15 @@ async def add( - TAVILY_API_KEY: YOUR_TAVILY_API_KEY """ + if preferred_loaders: + transformed = {} + for item in preferred_loaders: + if isinstance(item, dict): + transformed.update(item) + else: + transformed[item] = {} + preferred_loaders = transformed + tasks = [ Task(resolve_data_directories, include_subdirectories=True), Task( From ab6a0ef11c2de99e035a644a8b7045a1c905cd70 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 22 Oct 2025 12:39:50 +0100 Subject: [PATCH 2/7] beautiful soup loader: define default comprehensive extraction_rules --- .../loaders/external/beautiful_soup_loader.py | 94 ++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py index 04954a228..5ed0b16c0 100644 --- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py +++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py @@ -66,6 +66,96 @@ class BeautifulSoupLoader(LoaderInterface): can = extension in self.supported_extensions and mime_type in self.supported_mime_types return 
can + def _get_default_extraction_rules(self): + # Comprehensive default extraction rules for common HTML content + return { + # Meta information + "title": {"selector": "title", "all": False}, + "meta_description": { + "selector": "meta[name='description']", + "attr": "content", + "all": False, + }, + "meta_keywords": { + "selector": "meta[name='keywords']", + "attr": "content", + "all": False, + }, + # Open Graph meta tags + "og_title": { + "selector": "meta[property='og:title']", + "attr": "content", + "all": False, + }, + "og_description": { + "selector": "meta[property='og:description']", + "attr": "content", + "all": False, + }, + # Main content areas (prioritized selectors) + "article": {"selector": "article", "all": True, "join_with": "\n\n"}, + "main": {"selector": "main", "all": True, "join_with": "\n\n"}, + # Semantic content sections + "headers_h1": {"selector": "h1", "all": True, "join_with": "\n"}, + "headers_h2": {"selector": "h2", "all": True, "join_with": "\n"}, + "headers_h3": {"selector": "h3", "all": True, "join_with": "\n"}, + "headers_h4": {"selector": "h4", "all": True, "join_with": "\n"}, + "headers_h5": {"selector": "h5", "all": True, "join_with": "\n"}, + "headers_h6": {"selector": "h6", "all": True, "join_with": "\n"}, + # Text content + "paragraphs": {"selector": "p", "all": True, "join_with": "\n\n"}, + "blockquotes": {"selector": "blockquote", "all": True, "join_with": "\n\n"}, + "preformatted": {"selector": "pre", "all": True, "join_with": "\n\n"}, + # Lists + "ordered_lists": {"selector": "ol", "all": True, "join_with": "\n"}, + "unordered_lists": {"selector": "ul", "all": True, "join_with": "\n"}, + "list_items": {"selector": "li", "all": True, "join_with": "\n"}, + "definition_lists": {"selector": "dl", "all": True, "join_with": "\n"}, + # Tables + "tables": {"selector": "table", "all": True, "join_with": "\n\n"}, + "table_captions": { + "selector": "caption", + "all": True, + "join_with": "\n", + }, + # Code blocks + 
"code_blocks": {"selector": "code", "all": True, "join_with": "\n"}, + # Figures and media descriptions + "figures": {"selector": "figure", "all": True, "join_with": "\n\n"}, + "figcaptions": {"selector": "figcaption", "all": True, "join_with": "\n"}, + "image_alts": {"selector": "img", "attr": "alt", "all": True, "join_with": " "}, + # Links (text content, not URLs to avoid clutter) + "link_text": {"selector": "a", "all": True, "join_with": " "}, + # Emphasized text + "strong": {"selector": "strong", "all": True, "join_with": " "}, + "emphasis": {"selector": "em", "all": True, "join_with": " "}, + "marked": {"selector": "mark", "all": True, "join_with": " "}, + # Time and data elements + "time": {"selector": "time", "all": True, "join_with": " "}, + "data": {"selector": "data", "all": True, "join_with": " "}, + # Sections and semantic structure + "sections": {"selector": "section", "all": True, "join_with": "\n\n"}, + "asides": {"selector": "aside", "all": True, "join_with": "\n\n"}, + "details": {"selector": "details", "all": True, "join_with": "\n"}, + "summary": {"selector": "summary", "all": True, "join_with": "\n"}, + # Navigation (may contain important links/structure) + "nav": {"selector": "nav", "all": True, "join_with": "\n"}, + # Footer information + "footer": {"selector": "footer", "all": True, "join_with": "\n"}, + # Divs with specific content roles + "content_divs": { + "selector": "div[role='main'], div[role='article'], div.content, div#content", + "all": True, + "join_with": "\n\n", + }, + # Spans with data (fallback for inline content) + "data_spans": { + "selector": "span[data-*]", + "all": True, + "join_with": " ", + }, + } + async def load( self, file_path: str, @@ -85,7 +175,8 @@ class BeautifulSoupLoader(LoaderInterface): Path to the stored extracted text file """ if extraction_rules is None: - raise ValueError("extraction_rules required for BeautifulSoupLoader") + extraction_rules = self._get_default_extraction_rules() + logger.info("Using 
default comprehensive extraction rules for HTML content") logger.info(f"Processing HTML file: {file_path}") @@ -115,6 +206,7 @@ class BeautifulSoupLoader(LoaderInterface): full_content = " ".join(pieces).strip() + # remove after defaults for extraction rules # Fallback: If no content extracted, check if the file is plain text (not HTML) if not full_content: from bs4 import BeautifulSoup From 5288ab4ab49333843813c53c39be4f9a883c2f89 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 22 Oct 2025 13:01:06 +0100 Subject: [PATCH 3/7] tests: fix failing tests --- .../loaders/external/beautiful_soup_loader.py | 6 ------ .../web_url_crawler/test_url_adding_e2e.py | 11 +++++------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py index 5ed0b16c0..8bea8fb6e 100644 --- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py +++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py @@ -148,12 +148,6 @@ class BeautifulSoupLoader(LoaderInterface): "all": True, "join_with": "\n\n", }, - # Spans with data (fallback for inline content) - "data_spans": { - "selector": "span[data-*]", - "all": True, - "join_with": " ", - }, } async def load( diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index afe2dce7f..c098f5928 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -185,7 +185,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov @pytest.mark.asyncio -async def test_beautiful_soup_loader_raises_if_required_args_are_missing(): +async def test_beautiful_soup_loader_works_with_and_without_arguments(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -203,11 
+203,10 @@ async def test_beautiful_soup_loader_raises_if_required_args_are_missing(): bs_loader = BeautifulSoupLoader() loader_engine.register_loader(bs_loader) preferred_loaders = {"beautiful_soup_loader": {}} - with pytest.raises(ValueError): - await loader_engine.load_file( - file_path, - preferred_loaders=preferred_loaders, - ) + await loader_engine.load_file( + file_path, + preferred_loaders=preferred_loaders, + ) extraction_rules = { "title": {"selector": "title"}, "headings": {"selector": "h1, h2, h3", "all": True}, From 925323fb35e7d9b56b30087f4b7b5e4bb99af61b Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 22 Oct 2025 13:08:08 +0100 Subject: [PATCH 4/7] add test for `cognee.add()` when tavily is used --- .../web_url_crawler/test_url_adding_e2e.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index c098f5928..7be0df341 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -1,3 +1,4 @@ +import os import pytest import cognee from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path @@ -67,6 +68,22 @@ async def test_add_url(): await cognee.add("https://en.wikipedia.org/wiki/Large_language_model") +skip_in_ci = pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS") == "true", + reason="Skipping in Github for now - before we get TAVILY_API_KEY", +) + + +@skip_in_ci +@pytest.mark.asyncio +async def test_add_url_with_tavily(): + assert os.getenv("TAVILY_API_KEY") is not None + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + await cognee.add("https://en.wikipedia.org/wiki/Large_language_model") + + @pytest.mark.asyncio async def test_add_url_without_incremental_loading(): await cognee.prune.prune_data() From b9afc54233cbe3c1d567cb18b4c60107b413fd29 
Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 22 Oct 2025 13:15:15 +0100 Subject: [PATCH 5/7] add test cases for tavily --- .../web_url_crawler/test_url_adding_e2e.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index 7be0df341..d91b075aa 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -26,6 +26,13 @@ async def test_url_saves_as_html_file(): pytest.fail(f"Failed to save data item to storage: {e}") +skip_for_tavily = pytest.mark.skipif( + os.getenv("TAVILY_API_KEY") is not None, + reason="Skipping as Tavily already handles parsing and outputs text", +) + + +@skip_for_tavily @pytest.mark.asyncio async def test_saved_html_is_valid(): try: @@ -113,7 +120,18 @@ async def test_add_url_with_incremental_loading(): @pytest.mark.asyncio -async def test_add_url_with_extraction_rules(): # TODO: this'll fail due to not implemented `load()` yet +async def test_add_url_can_define_preferred_loader_as_list_of_str(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["beautiful_soup_loader"], + ) + + +@pytest.mark.asyncio +async def test_add_url_with_extraction_rules(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) From a499bd4d3f98468defc1706e65452ff89a50033d Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 22 Oct 2025 17:10:39 +0100 Subject: [PATCH 6/7] ruff format --- .../infrastructure/databases/vector/lancedb/LanceDBAdapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py 
index 31243306d..30631ac4c 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -181,7 +181,7 @@ class LanceDBAdapter(VectorDBInterface): def create_lance_data_point(data_point: DataPoint, vector: list[float]) -> LanceDataPoint: properties = get_own_properties(data_point) properties["id"] = str(properties["id"]) - + return LanceDataPoint[str, self.get_data_point_schema(type(data_point))]( id=str(data_point.id), vector=vector, From 90118562d8d3e40ef8f7626b65affa7ce772c62c Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 22 Oct 2025 17:19:36 +0100 Subject: [PATCH 7/7] `cognee.add()` - add more explicit check for empty `preferred_loaders` param --- cognee/api/v1/add/add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 3f3331899..a521b316b 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -164,7 +164,7 @@ async def add( - TAVILY_API_KEY: YOUR_TAVILY_API_KEY """ - if preferred_loaders: + if preferred_loaders is not None: transformed = {} for item in preferred_loaders: if isinstance(item, dict):