feat: web_url_fetcher
This commit is contained in:
parent
8fe789ee96
commit
17b33ab443
3 changed files with 93 additions and 0 deletions
8
cognee/tasks/ingestion/data_fetchers/__init__.py
Normal file
8
cognee/tasks/ingestion/data_fetchers/__init__.py
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
# Public API of the data_fetchers package.
#
# WebUrlFetcher relies on optional web-scraping dependencies, so it is
# exported only when its import succeeds; otherwise the package exposes
# nothing.
try:
    from .web_url_fetcher import WebUrlFetcher
except ImportError:
    # Optional scraping dependencies are unavailable; export nothing.
    __all__ = []
else:
    __all__ = ["WebUrlFetcher"]
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class DataFetcherInterface(ABC):
    """Abstract interface for data fetchers.

    A fetcher resolves an external data item (e.g. a web URL) into local
    content and returns a string result (implementations such as
    ``WebUrlFetcher`` return the path of the file the content was saved to).
    """

    @abstractmethod
    def fetcher_name(self) -> str:
        """Return the unique name identifying this fetcher.

        The name is used as the key into the per-fetcher section of the
        ``fetchers_config`` mapping passed to ``fetch``.
        """
        pass

    @abstractmethod
    async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]) -> str:
        """
        Fetch the content behind the given data item and persist it locally.

        args: data_item_path - path to the data item (e.g. a URL)
              fetchers_config - mapping of fetcher name to that fetcher's
              configuration dictionary; implementations look up their own
              entry via ``fetcher_name()``

        Returns a string result — implementations return the path of the
        saved file.
        """
        pass
|
||||||
70
cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py
Normal file
70
cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
from cognee.modules.ingestion import save_data_to_file
|
||||||
|
from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface
|
||||||
|
from typing import Any
|
||||||
|
from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
|
||||||
|
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class WebUrlFetcher(DataFetcherInterface):
    """Fetcher that scrapes web page content for HTTP(S) URL data items.

    Scraping is delegated to ``fetch_page_content`` using either Tavily or
    BeautifulSoup, selected from this fetcher's entry in ``fetchers_config``;
    the fetched text is persisted via ``save_data_to_file``.
    """

    def fetcher_name(self) -> str:
        """Return the key under which this fetcher's config is looked up."""
        return "web_url_fetcher"

    async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]) -> str:
        """Scrape *data_item_path* and save the resulting text to a local file.

        args: data_item_path - URL of the web page to fetch
              fetchers_config - must contain a dict under the key returned by
              ``fetcher_name()``, optionally holding "tavily_config" and/or
              "soup_config" sub-dicts used to build the scraper configs

        Returns the path of the file the content was saved to.

        Raises IngestionError when this fetcher's configuration is missing or
        not a dict, or when neither scraper is configured.
        """
        # Imported lazily so module import stays cheap and avoids pulling in
        # optional scraping dependencies until a fetch actually happens.
        from cognee.context_global_variables import tavily_config, soup_crawler_config
        from cognee.tasks.web_scraper import fetch_page_content

        web_url_fetcher_config = fetchers_config.get(self.fetcher_name())
        if not isinstance(web_url_fetcher_config, dict):
            raise IngestionError(f"{self.fetcher_name()} configuration must be a valid dictionary")

        tavily_dict = web_url_fetcher_config.get("tavily_config")
        _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None

        soup_dict = web_url_fetcher_config.get("soup_config")
        _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None

        # Set global configs for downstream access
        tavily_config.set(_tavily_config)
        soup_crawler_config.set(_soup_config)

        # Prefer BeautifulSoup when its config is present; otherwise Tavily.
        preferred_tool = "beautifulsoup" if _soup_config else "tavily"
        if preferred_tool == "tavily" and _tavily_config is None:
            raise IngestionError(
                message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
            )
        # NOTE(review): defensive only — this branch appears unreachable,
        # since "beautifulsoup" is selected only when _soup_config is truthy.
        if preferred_tool == "beautifulsoup" and _soup_config is None:
            raise IngestionError(
                message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
            )

        logger.info(f"Starting web URL crawling for: {data_item_path}")
        logger.info(f"Using scraping tool: {preferred_tool}")

        data = await fetch_page_content(
            data_item_path,
            preferred_tool=preferred_tool,
            soup_crawler_config=_soup_config,
            tavily_config=_tavily_config,
        )

        logger.info(f"Successfully fetched content from URL {data_item_path}")

        # fetch_page_content returns a dict like {url: content}; flatten it
        # to one string before saving. str.join avoids the quadratic cost of
        # repeated += concatenation.
        if isinstance(data, dict):
            content = "".join(f"{url}:\n{text}\n\n" for url, text in data.items())
            logger.info(
                f"Extracted content from {len(data)} URL(s), total size: {len(content)} characters"
            )
        else:
            content = data

        return await save_data_to_file(content)
|
||||||
Loading…
Add table
Reference in a new issue