(WIP) Fix/fix web parsing (#1552)

## Description  This PR (using TDD): 1. Separates web crawling implementation into separate fetching, and parsing (loader) steps 2. Fetching is used in `save_data_item_to_storage`. Default settings are used for fetching 3. Loader produces a txt file, scraping the fetched html and saves it in a txt file (`html_hash.html` -> `html_hash.txt`), similar to how we process pdf files ## Type of Change  - [x] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [x] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable)  ## Pre-submission Checklist  - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
2025-10-22 11:57:40 +02:00 · 2025-10-22 11:57:40 +02:00 · 738759bc5b
commit 738759bc5b
parent 62157a114d 10e4fd7681
24 changed files with 859 additions and 339 deletions
--- a/.github/workflows/basic_tests.yml
+++ b/.github/workflows/basic_tests.yml
@ -123,6 +123,7 @@ jobs:
        uses: ./.github/actions/cognee_setup
        with:
          python-version: ${{ inputs.python-version }}
          extra-dependencies: "scraping"
      - name: Run Integration Tests
        run: uv run pytest cognee/tests/integration/
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@ -1,8 +1,5 @@
 from uuid import UUID
-import os
+from typing import Union, BinaryIO, List, Optional, Any
 from typing import Union, BinaryIO, List, Optional, Dict, Any
 from pydantic import BaseModel
 from urllib.parse import urlparse
 from cognee.modules.users.models import User
 from cognee.modules.pipelines import Task, run_pipeline
 from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
@ -17,16 +14,6 @@ from cognee.shared.logging_utils import get_logger
 logger = get_logger()
 try:
    from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
    from cognee.context_global_variables import (
        tavily_config as tavily,
        soup_crawler_config as soup_crawler,
    )
 except ImportError:
    logger.debug(f"Unable to import {str(ImportError)}")
    pass
 async def add(
    data: Union[BinaryIO, list[BinaryIO], str, list[str]],
@ -36,11 +23,8 @@ async def add(
    vector_db_config: dict = None,
    graph_db_config: dict = None,
    dataset_id: Optional[UUID] = None,
-    preferred_loaders: List[str] = None,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
    incremental_loading: bool = True,
    extraction_rules: Optional[Dict[str, Any]] = None,
    tavily_config: Optional[BaseModel] = None,
    soup_crawler_config: Optional[BaseModel] = None,
    data_per_batch: Optional[int] = 20,
 ):
    """
@ -180,29 +164,6 @@ async def add(
        - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
    """
    try:
        if not soup_crawler_config and extraction_rules:
            soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
        if not tavily_config and os.getenv("TAVILY_API_KEY"):
            tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))
        soup_crawler.set(soup_crawler_config)
        tavily.set(tavily_config)
        http_schemes = {"http", "https"}
        def _is_http_url(item: Union[str, BinaryIO]) -> bool:
            return isinstance(item, str) and urlparse(item).scheme in http_schemes
        if _is_http_url(data):
            node_set = ["web_content"] if not node_set else node_set + ["web_content"]
        elif isinstance(data, list) and any(_is_http_url(item) for item in data):
            node_set = ["web_content"] if not node_set else node_set + ["web_content"]
    except NameError:
        logger.debug(f"Unable to import {str(ImportError)}")
        pass
    tasks = [
        Task(resolve_data_directories, include_subdirectories=True),
        Task(
--- a/cognee/api/v1/update/update.py
+++ b/cognee/api/v1/update/update.py
@ -1,5 +1,5 @@
 from uuid import UUID
-from typing import Union, BinaryIO, List, Optional
+from typing import Union, BinaryIO, List, Optional, Any
 from cognee.modules.users.models import User
 from cognee.api.v1.delete import delete
@ -15,7 +15,7 @@ async def update(
    node_set: Optional[List[str]] = None,
    vector_db_config: dict = None,
    graph_db_config: dict = None,
-    preferred_loaders: List[str] = None,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
    incremental_loading: bool = True,
 ):
    """
--- a/cognee/context_global_variables.py
+++ b/cognee/context_global_variables.py
@ -13,8 +13,6 @@ from cognee.modules.users.methods import get_user
 vector_db_config = ContextVar("vector_db_config", default=None)
 graph_db_config = ContextVar("graph_db_config", default=None)
 session_user = ContextVar("session_user", default=None)
 soup_crawler_config = ContextVar("soup_crawler_config", default=None)
 tavily_config = ContextVar("tavily_config", default=None)
 async def set_session_user_context_variable(user):
--- a/cognee/infrastructure/loaders/LoaderEngine.py
+++ b/cognee/infrastructure/loaders/LoaderEngine.py
@ -64,7 +64,9 @@ class LoaderEngine:
        return True
    def get_loader(
-        self, file_path: str, preferred_loaders: List[str] = None
+        self,
        file_path: str,
        preferred_loaders: dict[str, dict[str, Any]],
    ) -> Optional[LoaderInterface]:
        """
        Get appropriate loader for a file.
@ -76,14 +78,21 @@ class LoaderEngine:
        Returns:
            LoaderInterface that can handle the file, or None if not found
        """
        from pathlib import Path
        file_info = filetype.guess(file_path)
        path_extension = Path(file_path).suffix.lstrip(".")
        # Try preferred loaders first
        if preferred_loaders:
            for loader_name in preferred_loaders:
                if loader_name in self._loaders:
                    loader = self._loaders[loader_name]
                    # Try with path extension first (for text formats like html)
                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
                        return loader
                    # Fall back to content-detected extension
                    if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                        return loader
                else:
@ -93,6 +102,10 @@ class LoaderEngine:
        for loader_name in self.default_loader_priority:
            if loader_name in self._loaders:
                loader = self._loaders[loader_name]
                # Try with path extension first (for text formats like html)
                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
                    return loader
                # Fall back to content-detected extension
                if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                    return loader
            else:
@ -105,7 +118,7 @@ class LoaderEngine:
    async def load_file(
        self,
        file_path: str,
-        preferred_loaders: Optional[List[str]] = None,
+        preferred_loaders: dict[str, dict[str, Any]] = None,
        **kwargs,
    ):
        """
@ -113,7 +126,7 @@ class LoaderEngine:
        Args:
            file_path: Path to the file to be processed
-            preferred_loaders: List of preferred loader names to try first
+            preferred_loaders: Dict of loader names to their configurations
            **kwargs: Additional loader-specific configuration
        Raises:
@ -125,8 +138,16 @@ class LoaderEngine:
            raise ValueError(f"No loader found for file: {file_path}")
        logger.debug(f"Loading {file_path} with {loader.loader_name}")
-        # TODO: loading needs to be reworked to work with both file streams and file locations
+
-        return await loader.load(file_path, **kwargs)
+        # Extract loader-specific config from preferred_loaders
        loader_config = {}
        if preferred_loaders and loader.loader_name in preferred_loaders:
            loader_config = preferred_loaders[loader.loader_name]
        # Merge with any additional kwargs (kwargs take precedence)
        merged_kwargs = {**loader_config, **kwargs}
        return await loader.load(file_path, **merged_kwargs)
    def get_available_loaders(self) -> List[str]:
        """
--- a/cognee/infrastructure/loaders/external/init.py
+++ b/cognee/infrastructure/loaders/external/init.py
@ -27,3 +27,10 @@ try:
    __all__.append("AdvancedPdfLoader")
 except ImportError:
    pass
 try:
    from .beautiful_soup_loader import BeautifulSoupLoader
    __all__.append("BeautifulSoupLoader")
 except ImportError:
    pass
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@ -0,0 +1,224 @@
 """BeautifulSoup-based web crawler for extracting content from web pages.
 This module provides the BeautifulSoupCrawler class for fetching and extracting content
 from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
 supports robots.txt handling, rate limiting, and custom extraction rules.
 """
 from typing import Union, Dict, Any, Optional, List
 from dataclasses import dataclass
 from bs4 import BeautifulSoup
 from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
 from cognee.shared.logging_utils import get_logger
 logger = get_logger(__name__)
@dataclass
 class ExtractionRule:
    """Normalized extraction rule for web content.
    Attributes:
        selector: CSS selector for extraction (if any).
        xpath: XPath expression for extraction (if any).
        attr: HTML attribute to extract (if any).
        all: If True, extract all matching elements; otherwise, extract first.
        join_with: String to join multiple extracted elements.
    """
    selector: Optional[str] = None
    xpath: Optional[str] = None
    attr: Optional[str] = None
    all: bool = False
    join_with: str = " "
 class BeautifulSoupLoader(LoaderInterface):
    """Crawler for fetching and extracting web content using BeautifulSoup.
    Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
    compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
    Attributes:
        concurrency: Number of concurrent requests allowed.
        crawl_delay: Minimum seconds between requests to the same domain.
        max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
        timeout: Per-request timeout in seconds.
        max_retries: Number of retries for failed requests.
        retry_delay_factor: Multiplier for exponential backoff on retries.
        headers: HTTP headers for requests (e.g., User-Agent).
        robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
    """
    @property
    def supported_extensions(self) -> List[str]:
        return ["html"]
    @property
    def supported_mime_types(self) -> List[str]:
        return ["text/html", "text/plain"]
    @property
    def loader_name(self) -> str:
        return "beautiful_soup_loader"
    def can_handle(self, extension: str, mime_type: str) -> bool:
        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
        return can
    async def load(
        self,
        file_path: str,
        extraction_rules: dict[str, Any] = None,
        join_all_matches: bool = False,
        **kwargs,
    ):
        """Load an HTML file, extract content, and save to storage.
        Args:
            file_path: Path to the HTML file
            extraction_rules: Dict of CSS selector rules for content extraction
            join_all_matches: If True, extract all matching elements for each rule
            **kwargs: Additional arguments
        Returns:
            Path to the stored extracted text file
        """
        if extraction_rules is None:
            raise ValueError("extraction_rules required for BeautifulSoupLoader")
        logger.info(f"Processing HTML file: {file_path}")
        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
        with open(file_path, "rb") as f:
            file_metadata = await get_file_metadata(f)
            f.seek(0)
            html = f.read()
        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
        # Normalize extraction rules
        normalized_rules: List[ExtractionRule] = []
        for _, rule in extraction_rules.items():
            r = self._normalize_rule(rule)
            if join_all_matches:
                r.all = True
            normalized_rules.append(r)
        pieces = []
        for rule in normalized_rules:
            text = self._extract_from_html(html, rule)
            if text:
                pieces.append(text)
        full_content = " ".join(pieces).strip()
        # Fallback: If no content extracted, check if the file is plain text (not HTML)
        if not full_content:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            # If there are no HTML tags, treat as plain text
            if not soup.find():
                logger.warning(
                    f"No HTML tags found in {file_path}. Treating as plain text. "
                    "This may happen when content is pre-extracted (e.g., via Tavily with text format)."
                )
                full_content = html.decode("utf-8") if isinstance(html, bytes) else html
                full_content = full_content.strip()
        if not full_content:
            logger.warning(f"No content extracted from HTML file: {file_path}")
        # Store the extracted content
        storage_config = get_storage_config()
        data_root_directory = storage_config["data_root_directory"]
        storage = get_file_storage(data_root_directory)
        full_file_path = await storage.store(storage_file_name, full_content)
        logger.info(f"Extracted {len(full_content)} characters from HTML")
        return full_file_path
    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
        """Normalize an extraction rule to an ExtractionRule dataclass.
        Args:
            rule: A string (CSS selector) or dict with extraction parameters.
        Returns:
            ExtractionRule: Normalized extraction rule.
        Raises:
            ValueError: If the rule is invalid.
        """
        if isinstance(rule, str):
            return ExtractionRule(selector=rule)
        if isinstance(rule, dict):
            return ExtractionRule(
                selector=rule.get("selector"),
                xpath=rule.get("xpath"),
                attr=rule.get("attr"),
                all=bool(rule.get("all", False)),
                join_with=rule.get("join_with", " "),
            )
        raise ValueError(f"Invalid extraction rule: {rule}")
    def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
        """Extract content from HTML using BeautifulSoup or lxml XPath.
        Args:
            html: The HTML content to extract from.
            rule: The extraction rule to apply.
        Returns:
            str: The extracted content.
        Raises:
            RuntimeError: If XPath is used but lxml is not installed.
        """
        soup = BeautifulSoup(html, "html.parser")
        if rule.xpath:
            try:
                from lxml import html as lxml_html
            except ImportError:
                raise RuntimeError(
                    "XPath requested but lxml is not available. Install lxml or use CSS selectors."
                )
            doc = lxml_html.fromstring(html)
            nodes = doc.xpath(rule.xpath)
            texts = []
            for n in nodes:
                if hasattr(n, "text_content"):
                    texts.append(n.text_content().strip())
                else:
                    texts.append(str(n).strip())
            return rule.join_with.join(t for t in texts if t)
        if not rule.selector:
            return ""
        if rule.all:
            nodes = soup.select(rule.selector)
            pieces = []
            for el in nodes:
                if rule.attr:
                    val = el.get(rule.attr)
                    if val:
                        pieces.append(val.strip())
                else:
                    text = el.get_text(strip=True)
                    if text:
                        pieces.append(text)
            return rule.join_with.join(pieces).strip()
        else:
            el = soup.select_one(rule.selector)
            if el is None:
                return ""
            if rule.attr:
                val = el.get(rule.attr)
                return (val or "").strip()
            return el.get_text(strip=True)
--- a/cognee/infrastructure/loaders/supported_loaders.py
+++ b/cognee/infrastructure/loaders/supported_loaders.py
@ -23,3 +23,10 @@ try:
    supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
 except ImportError:
    pass
 try:
    from cognee.infrastructure.loaders.external import BeautifulSoupLoader
    supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader
 except ImportError:
    pass
--- a/cognee/modules/ingestion/save_data_to_file.py
+++ b/cognee/modules/ingestion/save_data_to_file.py
@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib
-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
 ):
    storage_config = get_storage_config()
    data_root_directory = storage_config["data_root_directory"]
@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
        file_name = file_metadata["name"]
        if file_extension is not None:
            extension = file_extension.lstrip(".")
            file_name_without_ext = file_name.rsplit(".", 1)[0]
            file_name = f"{file_name_without_ext}.{extension}"
        storage = get_file_storage(data_root_directory)
        full_file_path = await storage.store(file_name, data)
--- a/cognee/modules/pipelines/operations/pipeline.py
+++ b/cognee/modules/pipelines/operations/pipeline.py
@ -20,6 +20,7 @@ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
 from cognee.modules.pipelines.layers.check_pipeline_run_qualification import (
    check_pipeline_run_qualification,
 )
 from typing import Any
 logger = get_logger("cognee.pipeline")
@ -80,7 +81,14 @@ async def run_pipeline_per_dataset(
        return
    pipeline_run = run_tasks(
-        tasks, dataset.id, data, user, pipeline_name, context, incremental_loading, data_per_batch
+        tasks,
        dataset.id,
        data,
        user,
        pipeline_name,
        context,
        incremental_loading,
        data_per_batch,
    )
    async for pipeline_run_info in pipeline_run:
--- a/cognee/tasks/ingestion/data_item_to_text_file.py
+++ b/cognee/tasks/ingestion/data_item_to_text_file.py
@ -1,6 +1,6 @@
 import os
 from urllib.parse import urlparse
-from typing import List, Tuple
+from typing import Any, List, Tuple
 from pathlib import Path
 import tempfile
@ -34,7 +34,8 @@ async def pull_from_s3(file_path, destination_file) -> None:
 async def data_item_to_text_file(
-    data_item_path: str, preferred_loaders: List[str]
+    data_item_path: str,
    preferred_loaders: dict[str, dict[str, Any]] = None,
 ) -> Tuple[str, LoaderInterface]:
    if isinstance(data_item_path, str):
        parsed_url = urlparse(data_item_path)
@ -74,6 +75,5 @@ async def data_item_to_text_file(
                )
            else:
                raise IngestionError(message="Local files are not accepted.")
    # data is not a supported type
    raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
--- a/cognee/tasks/ingestion/ingest_data.py
+++ b/cognee/tasks/ingestion/ingest_data.py
@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
 from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
@ -27,7 +28,7 @@ async def ingest_data(
    user: User,
    node_set: Optional[List[str]] = None,
    dataset_id: UUID = None,
-    preferred_loaders: List[str] = None,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
 ):
    if not user:
        user = await get_default_user()
@ -44,7 +45,7 @@ async def ingest_data(
        user: User,
        node_set: Optional[List[str]] = None,
        dataset_id: UUID = None,
-        preferred_loaders: List[str] = None,
+        preferred_loaders: dict[str, dict[str, Any]] = None,
    ):
        new_datapoints = []
        existing_data_points = []
@ -77,22 +78,27 @@ async def ingest_data(
        dataset_data_map = {str(data.id): True for data in dataset_data}
        for data_item in data:
-            # Get file path of data item or create a file it doesn't exist
+            # Get file path of data item or create a file if it doesn't exist
            original_file_path = await save_data_item_to_storage(data_item)
            # Transform file path to be OS usable
            actual_file_path = get_data_file_path(original_file_path)
            # Store all input data as text files in Cognee data storage
            cognee_storage_file_path, loader_engine = await data_item_to_text_file(
-                actual_file_path, preferred_loaders
+                actual_file_path,
                preferred_loaders,
            )
            if loader_engine is None:
                raise IngestionError("Loader cannot be None")
            # Find metadata from original file
            # Standard flow: extract metadata from both original and stored files
            async with open_data_file(original_file_path) as file:
                classified_data = ingestion.classify(file)
                # data_id is the hash of original file contents + owner id to avoid duplicate data
                data_id = ingestion.identify(classified_data, user)
                original_file_metadata = classified_data.get_metadata()
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@ -8,6 +8,8 @@ from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from cognee.tasks.web_scraper.utils import fetch_page_content
 logger = get_logger()
@ -18,13 +20,6 @@ class SaveDataSettings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="allow")
 class HTMLContent(str):
    def __new__(cls, value: str):
        if not ("<" in value and ">" in value):
            raise ValueError("Not valid HTML-like content")
        return super().__new__(cls, value)
 settings = SaveDataSettings()
@ -63,40 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
        if parsed_url.scheme == "s3":
            return data_item
        elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
-            # Validate URL by sending a HEAD request
+            urls_to_page_contents = await fetch_page_content(data_item)
-            try:
+            return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
                from cognee.context_global_variables import tavily_config, soup_crawler_config
                from cognee.tasks.web_scraper import fetch_page_content
                tavily = tavily_config.get()
                soup_crawler = soup_crawler_config.get()
                preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
                if preferred_tool == "tavily" and tavily is None:
                    raise IngestionError(
                        message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
                    )
                if preferred_tool == "beautifulsoup" and soup_crawler is None:
                    raise IngestionError(
                        message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
                    )
                data = await fetch_page_content(
                    data_item,
                    preferred_tool=preferred_tool,
                    tavily_config=tavily,
                    soup_crawler_config=soup_crawler,
                )
                content = ""
                for key, value in data.items():
                    content += f"{key}:\n{value}\n\n"
                return await save_data_to_file(content)
            except IngestionError:
                raise
            except Exception as e:
                raise IngestionError(
                    message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
                )
        # data is local file path
        elif parsed_url.scheme == "file":
            if settings.accept_local_file_path:
--- a/cognee/tasks/web_scraper/init.py
+++ b/cognee/tasks/web_scraper/init.py
@ -5,9 +5,24 @@ data in a graph database. It includes classes and functions for crawling web pag
 BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
 """
 from .bs4_crawler import BeautifulSoupCrawler
 from .utils import fetch_page_content
-from .web_scraper_task import cron_web_scraper_task, web_scraper_task
+from .default_url_crawler import DefaultUrlCrawler
 # Lazy import for web_scraper_task to avoid requiring apscheduler
 # Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ...
 def __getattr__(name):
    """Lazy load web scraper task functions that require apscheduler."""
    if name == "cron_web_scraper_task":
        from .web_scraper_task import cron_web_scraper_task
        return cron_web_scraper_task
    elif name == "web_scraper_task":
        from .web_scraper_task import web_scraper_task
        return web_scraper_task
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 __all__ = [
@ -15,4 +30,5 @@ __all__ = [
    "fetch_page_content",
    "cron_web_scraper_task",
    "web_scraper_task",
    "DefaultUrlCrawler",
 ]
--- a/cognee/tasks/web_scraper/config.py
+++ b/cognee/tasks/web_scraper/config.py
@ -10,14 +10,16 @@ class TavilyConfig(BaseModel):
    timeout: Optional[int] = Field(default=10, ge=1, le=60)
-class SoupCrawlerConfig(BaseModel):
+class DefaultCrawlerConfig(BaseModel):
    concurrency: int = 5
    crawl_delay: float = 0.5
    max_crawl_delay: Optional[float] = (
        10.0  # Maximum crawl delay to respect from robots.txt (None = no limit)
    )
    timeout: float = 15.0
    max_retries: int = 2
    retry_delay_factor: float = 0.5
    headers: Optional[Dict[str, str]] = None
    extraction_rules: Dict[str, Any]
    use_playwright: bool = False
    playwright_js_wait: float = 0.8
    robots_cache_ttl: float = 3600.0
--- a/cognee/tasks/web_scraper/default_url_crawler.py
+++ b/cognee/tasks/web_scraper/default_url_crawler.py
@ -1,21 +1,21 @@
 """BeautifulSoup-based web crawler for extracting content from web pages.
 This module provides the BeautifulSoupCrawler class for fetching and extracting content
 from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
 supports robots.txt handling, rate limiting, and custom extraction rules.
 """
 import asyncio
 import time
 from typing import Union, List, Dict, Any, Optional
 from urllib.parse import urlparse
 from dataclasses import dataclass, field
 from functools import lru_cache
 import time
 from typing import Any, Union, List, Dict, Optional
 from urllib.parse import urlparse
 import httpx
 from bs4 import BeautifulSoup
 from cognee.shared.logging_utils import get_logger
-logger = get_logger(__name__)
+from cognee.shared.logging_utils import get_logger
 from cognee.tasks.web_scraper.types import UrlsToHtmls
 logger = get_logger()
 try:
    from protego import Protego
 except ImportError:
    logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
    Protego = None
 try:
    from playwright.async_api import async_playwright
@ -25,31 +25,6 @@ except ImportError:
    )
    async_playwright = None
 try:
    from protego import Protego
 except ImportError:
    logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
    Protego = None
@dataclass
 class ExtractionRule:
    """Normalized extraction rule for web content.
    Attributes:
        selector: CSS selector for extraction (if any).
        xpath: XPath expression for extraction (if any).
        attr: HTML attribute to extract (if any).
        all: If True, extract all matching elements; otherwise, extract first.
        join_with: String to join multiple extracted elements.
    """
    selector: Optional[str] = None
    xpath: Optional[str] = None
    attr: Optional[str] = None
    all: bool = False
    join_with: str = " "
@dataclass
 class RobotsTxtCache:
@ -66,27 +41,13 @@ class RobotsTxtCache:
    timestamp: float = field(default_factory=time.time)
-class BeautifulSoupCrawler:
+class DefaultUrlCrawler:
    """Crawler for fetching and extracting web content using BeautifulSoup.
    Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
    compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
    Attributes:
        concurrency: Number of concurrent requests allowed.
        crawl_delay: Minimum seconds between requests to the same domain.
        timeout: Per-request timeout in seconds.
        max_retries: Number of retries for failed requests.
        retry_delay_factor: Multiplier for exponential backoff on retries.
        headers: HTTP headers for requests (e.g., User-Agent).
        robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
    """
    def __init__(
        self,
        *,
        concurrency: int = 5,
        crawl_delay: float = 0.5,
        max_crawl_delay: Optional[float] = 10.0,
        timeout: float = 15.0,
        max_retries: int = 2,
        retry_delay_factor: float = 0.5,
@ -98,6 +59,7 @@ class BeautifulSoupCrawler:
        Args:
            concurrency: Number of concurrent requests allowed.
            crawl_delay: Minimum seconds between requests to the same domain.
            max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
            timeout: Per-request timeout in seconds.
            max_retries: Number of retries for failed requests.
            retry_delay_factor: Multiplier for exponential backoff on retries.
@ -107,6 +69,7 @@ class BeautifulSoupCrawler:
        self.concurrency = concurrency
        self._sem = asyncio.Semaphore(concurrency)
        self.crawl_delay = crawl_delay
        self.max_crawl_delay = max_crawl_delay
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay_factor = retry_delay_factor
@ -183,7 +146,11 @@ class BeautifulSoupCrawler:
        elapsed = time.time() - last
        wait_for = delay - elapsed
        if wait_for > 0:
            logger.info(
                f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
            )
            await asyncio.sleep(wait_for)
            logger.info(f"Rate limit wait completed for {url}")
        self._last_request_time_per_domain[domain] = time.time()
    async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
@ -236,7 +203,16 @@ class BeautifulSoupCrawler:
            crawl_delay = self.crawl_delay
            if protego:
                delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
-                crawl_delay = delay if delay else self.crawl_delay
+                if delay:
                    # Apply max_crawl_delay cap if configured
                    if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
                        logger.warning(
                            f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
                            f"capping to max_crawl_delay={self.max_crawl_delay}s"
                        )
                        crawl_delay = self.max_crawl_delay
                    else:
                        crawl_delay = delay
            cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
            self._robots_cache[domain_root] = cache_entry
@ -307,12 +283,16 @@ class BeautifulSoupCrawler:
        attempt = 0
        crawl_delay = await self._get_crawl_delay(url)
        logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
        while True:
            try:
                await self._respect_rate_limit(url, crawl_delay)
                resp = await self._client.get(url)
                resp.raise_for_status()
                logger.info(
                    f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
                )
                return resp.text
            except Exception as exc:
                attempt += 1
@ -347,22 +327,35 @@ class BeautifulSoupCrawler:
            raise RuntimeError(
                "Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
            )
        timeout_val = timeout or self.timeout
        logger.info(
            f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
        )
        attempt = 0
        while True:
            try:
                async with async_playwright() as p:
                    logger.info(f"Launching headless Chromium browser for {url}")
                    browser = await p.chromium.launch(headless=True)
                    try:
                        context = await browser.new_context()
                        page = await context.new_page()
                        logger.info(f"Navigating to {url} and waiting for network idle")
                        await page.goto(
                            url,
                            wait_until="networkidle",
-                            timeout=int((timeout or self.timeout) * 1000),
+                            timeout=int(timeout_val * 1000),
                        )
                        if js_wait:
                            logger.info(f"Waiting {js_wait}s for JavaScript to execute")
                            await asyncio.sleep(js_wait)
-                        return await page.content()
+                        content = await page.content()
                        logger.info(
                            f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
                        )
                        return content
                    finally:
                        await browser.close()
            except Exception as exc:
@ -376,96 +369,13 @@ class BeautifulSoupCrawler:
                )
                await asyncio.sleep(backoff)
-    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
+    async def fetch_urls(
        """Normalize an extraction rule to an ExtractionRule dataclass.
        Args:
            rule: A string (CSS selector) or dict with extraction parameters.
        Returns:
            ExtractionRule: Normalized extraction rule.
        Raises:
            ValueError: If the rule is invalid.
        """
        if isinstance(rule, str):
            return ExtractionRule(selector=rule)
        if isinstance(rule, dict):
            return ExtractionRule(
                selector=rule.get("selector"),
                xpath=rule.get("xpath"),
                attr=rule.get("attr"),
                all=bool(rule.get("all", False)),
                join_with=rule.get("join_with", " "),
            )
        raise ValueError(f"Invalid extraction rule: {rule}")
    def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str:
        """Extract content from HTML using BeautifulSoup or lxml XPath.
        Args:
            html: The HTML content to extract from.
            rule: The extraction rule to apply.
        Returns:
            str: The extracted content.
        Raises:
            RuntimeError: If XPath is used but lxml is not installed.
        """
        soup = BeautifulSoup(html, "html.parser")
        if rule.xpath:
            try:
                from lxml import html as lxml_html
            except ImportError:
                raise RuntimeError(
                    "XPath requested but lxml is not available. Install lxml or use CSS selectors."
                )
            doc = lxml_html.fromstring(html)
            nodes = doc.xpath(rule.xpath)
            texts = []
            for n in nodes:
                if hasattr(n, "text_content"):
                    texts.append(n.text_content().strip())
                else:
                    texts.append(str(n).strip())
            return rule.join_with.join(t for t in texts if t)
        if not rule.selector:
            return ""
        if rule.all:
            nodes = soup.select(rule.selector)
            pieces = []
            for el in nodes:
                if rule.attr:
                    val = el.get(rule.attr)
                    if val:
                        pieces.append(val.strip())
                else:
                    text = el.get_text(strip=True)
                    if text:
                        pieces.append(text)
            return rule.join_with.join(pieces).strip()
        else:
            el = soup.select_one(rule.selector)
            if el is None:
                return ""
            if rule.attr:
                val = el.get(rule.attr)
                return (val or "").strip()
            return el.get_text(strip=True)
    async def fetch_with_bs4(
        self,
-        urls: Union[str, List[str], Dict[str, Dict[str, Any]]],
+        urls: Union[str, List[str]],
        extraction_rules: Optional[Dict[str, Any]] = None,
        *,
        use_playwright: bool = False,
        playwright_js_wait: float = 0.8,
-        join_all_matches: bool = False,
+    ) -> UrlsToHtmls:
    ) -> Dict[str, str]:
        """Fetch and extract content from URLs using BeautifulSoup or Playwright.
        Args:
@ -482,65 +392,55 @@ class BeautifulSoupCrawler:
            ValueError: If extraction_rules are missing when required or if urls is invalid.
            Exception: If fetching or extraction fails.
        """
        url_rules_map: Dict[str, Dict[str, Any]] = {}
        if isinstance(urls, str):
-            if not extraction_rules:
+            urls = [urls]
                raise ValueError("extraction_rules required when urls is a string")
            url_rules_map[urls] = extraction_rules
        elif isinstance(urls, list):
            if not extraction_rules:
                raise ValueError("extraction_rules required when urls is a list")
            for url in urls:
                url_rules_map[url] = extraction_rules
        elif isinstance(urls, dict):
            url_rules_map = urls
        else:
            raise ValueError(f"Invalid urls type: {type(urls)}")
        normalized_url_rules: Dict[str, List[ExtractionRule]] = {}
        for url, rules in url_rules_map.items():
            normalized_rules = []
            for _, rule in rules.items():
                r = self._normalize_rule(rule)
                if join_all_matches:
                    r.all = True
                normalized_rules.append(r)
            normalized_url_rules[url] = normalized_rules
        async def _task(url: str):
            async with self._sem:
                try:
                    logger.info(f"Processing URL: {url}")
                    # Check robots.txt
                    allowed = await self._is_url_allowed(url)
                    if not allowed:
                        logger.warning(f"URL disallowed by robots.txt: {url}")
                        return url, ""
                    logger.info(f"Robots.txt check passed for {url}")
                    # Fetch HTML
                    if use_playwright:
                        logger.info(
                            f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
                        )
                        html = await self._render_with_playwright(
                            url, js_wait=playwright_js_wait, timeout=self.timeout
                        )
                    else:
                        logger.info(f"Fetching {url} with httpx")
                        html = await self._fetch_httpx(url)
-                    pieces = []
+                    logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
                    for rule in normalized_url_rules[url]:
                        text = self._extract_with_bs4(html, rule)
                        if text:
                            pieces.append(text)
-                    concatenated = " ".join(pieces).strip()
+                    return url, html
                    return url, concatenated
                except Exception as e:
                    logger.error(f"Error processing {url}: {e}")
                    return url, ""
-        tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()]
+        logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
        tasks = [asyncio.create_task(_task(u)) for u in urls]
        results = {}
        completed = 0
        total = len(tasks)
        for coro in asyncio.as_completed(tasks):
-            url, text = await coro
+            url, html = await coro
-            results[url] = text
+            results[url] = html
            completed += 1
            logger.info(f"Progress: {completed}/{total} URLs processed")
        logger.info(f"Completed fetching all {len(results)} URL(s)")
        return results
--- a/cognee/tasks/web_scraper/types.py
+++ b/cognee/tasks/web_scraper/types.py
@ -0,0 +1,4 @@
 from typing import TypeAlias
 UrlsToHtmls: TypeAlias = dict[str, str]
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@ -4,21 +4,17 @@ This module provides functions to fetch and extract content from web pages, supp
 both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
 """
-from typing import Dict, List, Union, Optional, Literal
+import os
 from typing import List, Union
 from cognee.shared.logging_utils import get_logger
-from .bs4_crawler import BeautifulSoupCrawler
+from cognee.tasks.web_scraper.types import UrlsToHtmls
-from .config import TavilyConfig, SoupCrawlerConfig
+from .default_url_crawler import DefaultUrlCrawler
 from .config import DefaultCrawlerConfig, TavilyConfig
 logger = get_logger(__name__)
-async def fetch_page_content(
+async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
    urls: Union[str, List[str]],
    *,
    preferred_tool: Optional[Literal["tavily", "beautifulsoup"]] = "beautifulsoup",
    tavily_config: Optional[TavilyConfig] = None,
    soup_crawler_config: Optional[SoupCrawlerConfig] = None,
 ) -> Dict[str, str]:
    """Fetch content from one or more URLs using the specified tool.
    This function retrieves web page content using either BeautifulSoup (with custom
@ -31,7 +27,7 @@ async def fetch_page_content(
            Defaults to "beautifulsoup".
        tavily_config: Configuration for Tavily API, including API key.
            Required if preferred_tool is "tavily".
-        soup_crawler_config: Configuration for BeautifulSoup crawler, including
+        default_crawler_config: Configuration for BeautifulSoup crawler, including
            extraction rules. Required if preferred_tool is "beautifulsoup" and
            extraction_rules are needed.
@ -45,50 +41,52 @@ async def fetch_page_content(
        ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not
            installed.
    """
-    if preferred_tool == "tavily":
+    url_list = [urls] if isinstance(urls, str) else urls
        if not tavily_config or tavily_config.api_key is None:
            raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily")
        return await fetch_with_tavily(urls, tavily_config)
-    if preferred_tool == "beautifulsoup":
+    if os.getenv("TAVILY_API_KEY"):
-        try:
+        logger.info("Using Tavily API for url fetching")
-            from bs4 import BeautifulSoup as _  # noqa: F401
+        return await fetch_with_tavily(urls)
-        except ImportError:
+    else:
-            logger.error(
+        logger.info("Using default crawler for content extraction")
-                "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
+
-            )
+        default_crawler_config = (
-            raise ImportError
+            DefaultCrawlerConfig()
-        if not soup_crawler_config or soup_crawler_config.extraction_rules is None:
+        )  # We've decided to use defaults, and configure through env vars as needed
-            raise ValueError("extraction_rules must be provided when not using Tavily")
+
-        extraction_rules = soup_crawler_config.extraction_rules
+        logger.info(
-        crawler = BeautifulSoupCrawler(
+            f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s"
-            concurrency=soup_crawler_config.concurrency,
+        )
-            crawl_delay=soup_crawler_config.crawl_delay,
+
-            timeout=soup_crawler_config.timeout,
+        crawler = DefaultUrlCrawler(
-            max_retries=soup_crawler_config.max_retries,
+            concurrency=default_crawler_config.concurrency,
-            retry_delay_factor=soup_crawler_config.retry_delay_factor,
+            crawl_delay=default_crawler_config.crawl_delay,
-            headers=soup_crawler_config.headers,
+            max_crawl_delay=default_crawler_config.max_crawl_delay,
-            robots_cache_ttl=soup_crawler_config.robots_cache_ttl,
+            timeout=default_crawler_config.timeout,
            max_retries=default_crawler_config.max_retries,
            retry_delay_factor=default_crawler_config.retry_delay_factor,
            headers=default_crawler_config.headers,
            robots_cache_ttl=default_crawler_config.robots_cache_ttl,
        )
        try:
-            results = await crawler.fetch_with_bs4(
+            logger.info(
-                urls,
+                f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})"
                extraction_rules,
                use_playwright=soup_crawler_config.use_playwright,
                playwright_js_wait=soup_crawler_config.playwright_js_wait,
                join_all_matches=soup_crawler_config.join_all_matches,
            )
            results = await crawler.fetch_urls(
                urls,
                use_playwright=default_crawler_config.use_playwright,
                playwright_js_wait=default_crawler_config.playwright_js_wait,
            )
            logger.info(f"Successfully fetched content from {len(results)} URL(s)")
            return results
        except Exception as e:
            logger.error(f"Error fetching page content: {str(e)}")
            raise
        finally:
            logger.info("Closing BeautifulSoup crawler")
            await crawler.close()
-async def fetch_with_tavily(
+async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
    urls: Union[str, List[str]], tavily_config: Optional[TavilyConfig] = None
 ) -> Dict[str, str]:
    """Fetch content from URLs using the Tavily API.
    Args:
@ -108,19 +106,37 @@ async def fetch_with_tavily(
            "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
        )
        raise
-    client = AsyncTavilyClient(
+
-        api_key=tavily_config.api_key if tavily_config else None,
+    tavily_config = TavilyConfig()
-        proxies=tavily_config.proxies if tavily_config else None,
+    url_list = [urls] if isinstance(urls, str) else urls
    extract_depth = tavily_config.extract_depth if tavily_config else "basic"
    timeout = tavily_config.timeout if tavily_config else 10
    logger.info(
        f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s"
    )
    client = AsyncTavilyClient(
        api_key=tavily_config.api_key,
        proxies=tavily_config.proxies,
    )
    logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)")
    results = await client.extract(
        urls,
        format="text",
-        extract_depth=tavily_config.extract_depth if tavily_config else "basic",
+        extract_depth=extract_depth,
-        timeout=tavily_config.timeout if tavily_config else 10,
+        timeout=timeout,
    )
-    for failed_result in results.get("failed_results", []):
+
-        logger.warning(f"Failed to fetch {failed_result}")
+    failed_count = len(results.get("failed_results", []))
    if failed_count > 0:
        logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)")
        for failed_result in results.get("failed_results", []):
            logger.warning(f"Failed to fetch {failed_result}")
    return_results = {}
    for result in results.get("results", []):
        return_results[result["url"]] = result["raw_content"]
    logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily")
    return return_results
--- a/cognee/tasks/web_scraper/web_scraper_task.py
+++ b/cognee/tasks/web_scraper/web_scraper_task.py
@ -19,7 +19,7 @@ from cognee.tasks.storage.index_graph_edges import index_graph_edges
 from cognee.modules.engine.operations.setup import setup
 from .models import WebPage, WebSite, ScrapingJob
-from .config import SoupCrawlerConfig, TavilyConfig
+from .config import DefaultCrawlerConfig, TavilyConfig
 from .utils import fetch_page_content
 try:
@ -47,7 +47,7 @@ async def cron_web_scraper_task(
    schedule: str = None,
    extraction_rules: dict = None,
    tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
-    soup_crawler_config: SoupCrawlerConfig = None,
+    soup_crawler_config: DefaultCrawlerConfig = None,
    tavily_config: TavilyConfig = None,
    job_name: str = "scraping",
 ):
@ -121,7 +121,7 @@ async def web_scraper_task(
    schedule: str = None,
    extraction_rules: dict = None,
    tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
-    soup_crawler_config: SoupCrawlerConfig = None,
+    soup_crawler_config: DefaultCrawlerConfig = None,
    tavily_config: TavilyConfig = None,
    job_name: str = None,
 ):
@ -341,7 +341,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
        soup_crawler_config: Configuration for BeautifulSoup crawler.
    Returns:
-        Tuple[SoupCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
+        Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
            tavily_config, and preferred_tool ("tavily" or "beautifulsoup").
    Raises:
@ -350,7 +350,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
    preferred_tool = "beautifulsoup"
    if extraction_rules and not soup_crawler_config:
-        soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
+        soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules)
    if tavily_api_key:
        if not tavily_config:
--- a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py
+++ b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py
@ -0,0 +1,13 @@
 import pytest
 from cognee.tasks.web_scraper import DefaultUrlCrawler
@pytest.mark.asyncio
 async def test_fetch():
    crawler = DefaultUrlCrawler()
    url = "https://en.wikipedia.org/wiki/Large_language_model"
    results = await crawler.fetch_urls(url)
    assert len(results) == 1
    assert isinstance(results, dict)
    html = results[url]
    assert isinstance(html, str)
--- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py
+++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py
@ -0,0 +1,19 @@
 import os
 import pytest
 from cognee.tasks.web_scraper.utils import fetch_with_tavily
 skip_in_ci = pytest.mark.skipif(
    os.getenv("GITHUB_ACTIONS") == "true",
    reason="Skipping in Github for now - before we get TAVILY_API_KEY",
 )
@skip_in_ci
@pytest.mark.asyncio
 async def test_fetch():
    url = "https://en.wikipedia.org/wiki/Large_language_model"
    results = await fetch_with_tavily(url)
    assert isinstance(results, dict)
    assert len(results) == 1
    html = results[url]
    assert isinstance(html, str)
--- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
+++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
@ -0,0 +1,310 @@
 import pytest
 import cognee
 from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
 from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
 from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
 from cognee.tasks.ingestion import save_data_item_to_storage
 from pathlib import Path
@pytest.mark.asyncio
 async def test_url_saves_as_html_file():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
@pytest.mark.asyncio
 async def test_saved_html_is_valid():
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        pytest.fail("Test case requires bs4 installed")
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        content = Path(file_path).read_text()
        soup = BeautifulSoup(content, "html.parser")
        assert soup.find() is not None, "File should contain parseable HTML"
        has_html_elements = any(
            [
                soup.find("html"),
                soup.find("head"),
                soup.find("body"),
                soup.find("div"),
                soup.find("p"),
            ]
        )
        assert has_html_elements, "File should contain common HTML elements"
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
@pytest.mark.asyncio
 async def test_add_url():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
@pytest.mark.asyncio
 async def test_add_url_without_incremental_loading():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    try:
        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
            incremental_loading=False,
        )
    except Exception as e:
        pytest.fail(f"Failed to add url: {e}")
@pytest.mark.asyncio
 async def test_add_url_with_incremental_loading():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    try:
        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
            incremental_loading=True,
        )
    except Exception as e:
        pytest.fail(f"Failed to add url: {e}")
@pytest.mark.asyncio
 async def test_add_url_with_extraction_rules():  # TODO: this'll fail due to not implemented `load()` yet
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    try:
        await cognee.add(
            "https://en.wikipedia.org/wiki/Large_language_model",
            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
        )
    except Exception as e:
        pytest.fail(f"Failed to add url: {e}")
@pytest.mark.asyncio
 async def test_loader_is_none_by_default():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0
        loader_engine = LoaderEngine()
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )
        assert loader is None
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
@pytest.mark.asyncio
 async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0
        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )
        assert loader == bs_loader
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
@pytest.mark.asyncio
 async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0
        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        preferred_loaders = {"beautiful_soup_loader": {}}
        with pytest.raises(ValueError):
            await loader_engine.load_file(
                file_path,
                preferred_loaders=preferred_loaders,
            )
        extraction_rules = {
            "title": {"selector": "title"},
            "headings": {"selector": "h1, h2, h3", "all": True},
            "links": {"selector": "a", "attr": "href", "all": True},
            "paragraphs": {"selector": "p", "all": True},
        }
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        await loader_engine.load_file(
            file_path,
            preferred_loaders=preferred_loaders,
        )
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
@pytest.mark.asyncio
 async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0
        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        extraction_rules = {
            "title": {"selector": "title"},
            "headings": {"selector": "h1, h2, h3", "all": True},
            "links": {"selector": "a", "attr": "href", "all": True},
            "paragraphs": {"selector": "p", "all": True},
        }
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        await loader_engine.load_file(
            file_path,
            preferred_loaders=preferred_loaders,
        )
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
@pytest.mark.asyncio
 async def test_beautiful_soup_loads_file_successfully():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }
    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        original_file = Path(file_path)
        assert original_file.exists()
        assert original_file.stat().st_size > 0
        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )
        assert loader == bs_loader
        cognee_loaded_txt_path = await loader_engine.load_file(
            file_path=file_path, preferred_loaders=preferred_loaders
        )
        cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
        assert cognee_loaded_txt_path.endswith(".txt")
        extracted_file = Path(cognee_loaded_txt_path)
        assert extracted_file.exists()
        assert extracted_file.stat().st_size > 0
        original_basename = original_file.stem
        extracted_basename = extracted_file.stem
        assert original_basename == extracted_basename, (
            f"Expected same base name: {original_basename} vs {extracted_basename}"
        )
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
--- a/cognee/tests/tasks/web_scraping/web_scraping_test.py
+++ b/cognee/tests/tasks/web_scraping/web_scraping_test.py
@ -1,6 +1,6 @@
 import asyncio
 import cognee
-from cognee.tasks.web_scraper.config import SoupCrawlerConfig
+from cognee.tasks.web_scraper.config import DefaultCrawlerConfig
 from cognee.tasks.web_scraper import cron_web_scraper_task
@ -14,7 +14,7 @@ async def test_web_scraping_using_bs4():
        "authors": {"selector": ".quote small", "all": True},
    }
-    soup_config = SoupCrawlerConfig(
+    soup_config = DefaultCrawlerConfig(
        concurrency=5,
        crawl_delay=0.5,
        timeout=15.0,
@ -47,7 +47,7 @@ async def test_web_scraping_using_bs4_and_incremental_loading():
    url = "https://books.toscrape.com/"
    rules = {"titles": "article.product_pod h3 a", "prices": "article.product_pod p.price_color"}
-    soup_config = SoupCrawlerConfig(
+    soup_config = DefaultCrawlerConfig(
        concurrency=1,
        crawl_delay=0.1,
        timeout=10.0,
--- a/examples/python/web_url_fetcher_example.py
+++ b/examples/python/web_url_fetcher_example.py
@ -0,0 +1,37 @@
 import asyncio
 import cognee
 async def main():
    await cognee.prune.prune_data()
    print("Data pruned.")
    await cognee.prune.prune_system(metadata=True)
    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {
            "selector": "a",
            "attr": "href",
            "all": True,
        },
        "paragraphs": {"selector": "p", "all": True},
    }
    await cognee.add(
        "https://en.wikipedia.org/wiki/Large_language_model",
        incremental_loading=False,
        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
    )
    await cognee.cognify()
    print("Knowledge graph created.")
    await cognee.visualize_graph()
    print("Data visualized")
 if __name__ == "__main__":
    asyncio.run(main())
		`@ -0,0 +1,4 @@`
							`from typing import TypeAlias`


							`UrlsToHtmls: TypeAlias = dict[str, str]`