fix: Refactor web parsing (#1575)

<!-- .github/pull_request_template.md -->

## Description
<!--
Please provide a clear, human-generated description of the changes in
this PR.
DO NOT use AI-generated descriptions. We want to understand your thought
process and reasoning.
-->

This PR is an iteration on #1552:

1. Refactors `preferred_loaders` from a dict to a list whose items can be strings (the loader name) or dicts (`{loader_name: {arg1: val1}}`), e.g. `[{"loader_name_one": {"arg1": "val1"}}, "loader_name_two"]` (see the usage sketch below)
2. Adds default extraction rules for HTML parsing
3. Adds unit tests covering the changes, plus a unit test for Tavily
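
A minimal usage sketch of the new format (the `extraction_rules` value passed in the dict form is illustrative, not part of this PR):

```python
import asyncio

import cognee


async def main():
    # String entry: select a loader by name, using its default arguments.
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        preferred_loaders=["beautiful_soup_loader"],
    )

    # Dict entry: map the loader name to keyword arguments passed to it.
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        preferred_loaders=[
            {"beautiful_soup_loader": {"extraction_rules": {"title": {"selector": "title"}}}},
        ],
    )


asyncio.run(main())
```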

## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [x] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the
issue/feature**
- [ ] My code follows the project's coding standards and style
guidelines
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been
submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Commit c7d0f64cb1 by Vasilije, 2025-10-22 19:09:19 +02:00, committed via GitHub
3 changed files with 138 additions and 9 deletions

**File 1 of 3: the `add()` entrypoint**

```diff
@@ -23,7 +23,7 @@ async def add(
     vector_db_config: dict = None,
     graph_db_config: dict = None,
     dataset_id: Optional[UUID] = None,
-    preferred_loaders: dict[str, dict[str, Any]] = None,
+    preferred_loaders: Optional[List[Union[str, dict[str, dict[str, Any]]]]] = None,
     incremental_loading: bool = True,
     data_per_batch: Optional[int] = 20,
 ):
@@ -164,6 +164,15 @@ async def add(
        - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
     """
+    if preferred_loaders is not None:
+        transformed = {}
+        for item in preferred_loaders:
+            if isinstance(item, dict):
+                transformed.update(item)
+            else:
+                transformed[item] = {}
+        preferred_loaders = transformed
+
     tasks = [
         Task(resolve_data_directories, include_subdirectories=True),
         Task(
```
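
Standalone, the normalization above behaves like this (the loader names here are hypothetical, for illustration only):

```python
# Mixed input: one plain loader name, one name with loader arguments.
preferred_loaders = ["beautiful_soup_loader", {"some_pdf_loader": {"password": "secret"}}]

transformed = {}
for item in preferred_loaders:
    if isinstance(item, dict):
        transformed.update(item)  # dict entry contributes {loader_name: kwargs}
    else:
        transformed[item] = {}  # string entry contributes {loader_name: {}}

# The internal dict shape the rest of the pipeline keeps consuming.
assert transformed == {"beautiful_soup_loader": {}, "some_pdf_loader": {"password": "secret"}}
```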

**File 2 of 3: `BeautifulSoupLoader`**

```diff
@@ -66,6 +66,90 @@ class BeautifulSoupLoader(LoaderInterface):
         can = extension in self.supported_extensions and mime_type in self.supported_mime_types
         return can
 
+    def _get_default_extraction_rules(self):
+        # Comprehensive default extraction rules for common HTML content
+        return {
+            # Meta information
+            "title": {"selector": "title", "all": False},
+            "meta_description": {
+                "selector": "meta[name='description']",
+                "attr": "content",
+                "all": False,
+            },
+            "meta_keywords": {
+                "selector": "meta[name='keywords']",
+                "attr": "content",
+                "all": False,
+            },
+            # Open Graph meta tags
+            "og_title": {
+                "selector": "meta[property='og:title']",
+                "attr": "content",
+                "all": False,
+            },
+            "og_description": {
+                "selector": "meta[property='og:description']",
+                "attr": "content",
+                "all": False,
+            },
+            # Main content areas (prioritized selectors)
+            "article": {"selector": "article", "all": True, "join_with": "\n\n"},
+            "main": {"selector": "main", "all": True, "join_with": "\n\n"},
+            # Semantic content sections
+            "headers_h1": {"selector": "h1", "all": True, "join_with": "\n"},
+            "headers_h2": {"selector": "h2", "all": True, "join_with": "\n"},
+            "headers_h3": {"selector": "h3", "all": True, "join_with": "\n"},
+            "headers_h4": {"selector": "h4", "all": True, "join_with": "\n"},
+            "headers_h5": {"selector": "h5", "all": True, "join_with": "\n"},
+            "headers_h6": {"selector": "h6", "all": True, "join_with": "\n"},
+            # Text content
+            "paragraphs": {"selector": "p", "all": True, "join_with": "\n\n"},
+            "blockquotes": {"selector": "blockquote", "all": True, "join_with": "\n\n"},
+            "preformatted": {"selector": "pre", "all": True, "join_with": "\n\n"},
+            # Lists
+            "ordered_lists": {"selector": "ol", "all": True, "join_with": "\n"},
+            "unordered_lists": {"selector": "ul", "all": True, "join_with": "\n"},
+            "list_items": {"selector": "li", "all": True, "join_with": "\n"},
+            "definition_lists": {"selector": "dl", "all": True, "join_with": "\n"},
+            # Tables
+            "tables": {"selector": "table", "all": True, "join_with": "\n\n"},
+            "table_captions": {
+                "selector": "caption",
+                "all": True,
+                "join_with": "\n",
+            },
+            # Code blocks
+            "code_blocks": {"selector": "code", "all": True, "join_with": "\n"},
+            # Figures and media descriptions
+            "figures": {"selector": "figure", "all": True, "join_with": "\n\n"},
+            "figcaptions": {"selector": "figcaption", "all": True, "join_with": "\n"},
+            "image_alts": {"selector": "img", "attr": "alt", "all": True, "join_with": " "},
+            # Links (text content, not URLs to avoid clutter)
+            "link_text": {"selector": "a", "all": True, "join_with": " "},
+            # Emphasized text
+            "strong": {"selector": "strong", "all": True, "join_with": " "},
+            "emphasis": {"selector": "em", "all": True, "join_with": " "},
+            "marked": {"selector": "mark", "all": True, "join_with": " "},
+            # Time and data elements
+            "time": {"selector": "time", "all": True, "join_with": " "},
+            "data": {"selector": "data", "all": True, "join_with": " "},
+            # Sections and semantic structure
+            "sections": {"selector": "section", "all": True, "join_with": "\n\n"},
+            "asides": {"selector": "aside", "all": True, "join_with": "\n\n"},
+            "details": {"selector": "details", "all": True, "join_with": "\n"},
+            "summary": {"selector": "summary", "all": True, "join_with": "\n"},
+            # Navigation (may contain important links/structure)
+            "nav": {"selector": "nav", "all": True, "join_with": "\n"},
+            # Footer information
+            "footer": {"selector": "footer", "all": True, "join_with": "\n"},
+            # Divs with specific content roles
+            "content_divs": {
+                "selector": "div[role='main'], div[role='article'], div.content, div#content",
+                "all": True,
+                "join_with": "\n\n",
+            },
+        }
+
     async def load(
         self,
         file_path: str,
@@ -85,7 +169,8 @@ class BeautifulSoupLoader(LoaderInterface):
             Path to the stored extracted text file
         """
         if extraction_rules is None:
-            raise ValueError("extraction_rules required for BeautifulSoupLoader")
+            extraction_rules = self._get_default_extraction_rules()
+            logger.info("Using default comprehensive extraction rules for HTML content")
 
         logger.info(f"Processing HTML file: {file_path}")
@@ -115,6 +200,7 @@ class BeautifulSoupLoader(LoaderInterface):
         full_content = " ".join(pieces).strip()
 
+        # remove after defaults for extraction rules
         # Fallback: If no content extracted, check if the file is plain text (not HTML)
         if not full_content:
             from bs4 import BeautifulSoup
```
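
For context, a rule of this shape is typically applied along these lines; this is a sketch assuming the loader resolves CSS selectors via `select`/`select_one` and joins results with `join_with`, not the exact loader code:

```python
from bs4 import BeautifulSoup


def apply_rule(soup: BeautifulSoup, rule: dict) -> str:
    """Extract text (or an attribute value) for one extraction rule."""
    if rule.get("all", False):
        elements = soup.select(rule["selector"])
    else:
        element = soup.select_one(rule["selector"])
        elements = [element] if element is not None else []

    attr = rule.get("attr")
    # Either read an attribute (e.g. a meta tag's content) or the visible text.
    pieces = [
        (el.get(attr) or "") if attr else el.get_text(strip=True) for el in elements
    ]
    return rule.get("join_with", " ").join(piece for piece in pieces if piece)


html = "<html><head><title>Hi</title></head><body><p>One</p><p>Two</p></body></html>"
soup = BeautifulSoup(html, "html.parser")
print(apply_rule(soup, {"selector": "p", "all": True, "join_with": "\n\n"}))
```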

**File 3 of 3: URL loading tests**

```diff
@@ -1,3 +1,4 @@
+import os
 import pytest
 import cognee
 from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
@@ -25,6 +26,13 @@ async def test_url_saves_as_html_file():
         pytest.fail(f"Failed to save data item to storage: {e}")
 
 
+skip_for_tavily = pytest.mark.skipif(
+    os.getenv("TAVILY_API_KEY") is not None,
+    reason="Skipping as Tavily already handles parsing and outputs text",
+)
+
+
+@skip_for_tavily
 @pytest.mark.asyncio
 async def test_saved_html_is_valid():
     try:
@@ -67,6 +75,22 @@ async def test_add_url():
     await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
 
 
+skip_in_ci = pytest.mark.skipif(
+    os.getenv("GITHUB_ACTIONS") == "true",
+    reason="Skipping in Github for now - before we get TAVILY_API_KEY",
+)
+
+
+@skip_in_ci
+@pytest.mark.asyncio
+async def test_add_url_with_tavily():
+    assert os.getenv("TAVILY_API_KEY") is not None
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
 @pytest.mark.asyncio
 async def test_add_url_without_incremental_loading():
     await cognee.prune.prune_data()
@@ -96,7 +120,18 @@ async def test_add_url_with_incremental_loading():
 @pytest.mark.asyncio
-async def test_add_url_with_extraction_rules():  # TODO: this'll fail due to not implemented `load()` yet
+async def test_add_url_can_define_preferred_loader_as_list_of_str():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add(
+        "https://en.wikipedia.org/wiki/Large_language_model",
+        preferred_loaders=["beautiful_soup_loader"],
+    )
+
+
+@pytest.mark.asyncio
+async def test_add_url_with_extraction_rules():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
@@ -185,7 +220,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov
 @pytest.mark.asyncio
-async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
+async def test_beautiful_soup_loader_works_with_and_without_arguments():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
@@ -203,11 +238,10 @@ async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
     bs_loader = BeautifulSoupLoader()
     loader_engine.register_loader(bs_loader)
 
     preferred_loaders = {"beautiful_soup_loader": {}}
-    with pytest.raises(ValueError):
-        await loader_engine.load_file(
-            file_path,
-            preferred_loaders=preferred_loaders,
-        )
+    await loader_engine.load_file(
+        file_path,
+        preferred_loaders=preferred_loaders,
+    )
 
     extraction_rules = {
         "title": {"selector": "title"},
         "headings": {"selector": "h1, h2, h3", "all": True},
```