feat: web_url_fetcher

parent 8fe789ee96
commit 17b33ab443

3 changed files with 93 additions and 0 deletions

cognee/tasks/ingestion/data_fetchers/__init__.py (new file, +8)
@@ -0,0 +1,8 @@
__all__ = []

try:
    from .web_url_fetcher import WebUrlFetcher

    __all__.append("WebUrlFetcher")
except ImportError:
    pass
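
The try/except guard makes WebUrlFetcher an optional export: if its web-scraping dependencies are missing, the package still imports cleanly and __all__ stays empty. A minimal sketch of how calling code might probe for the fetcher; everything here except WebUrlFetcher is illustrative and not part of this commit:

# Hypothetical caller-side check for the optional export.
from cognee.tasks.ingestion import data_fetchers

if "WebUrlFetcher" in data_fetchers.__all__:
    fetcher = data_fetchers.WebUrlFetcher()
else:
    print("Web scraping extras are not installed; skipping URL ingestion.")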

cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py (new file, +15)
@@ -0,0 +1,15 @@
from abc import ABC, abstractmethod
from typing import Any


class DataFetcherInterface(ABC):
    @abstractmethod
    def fetcher_name(self) -> str:
        pass

    @abstractmethod
    async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]) -> str:
        """
        args: data_item_path - path to the data item
        """
        pass
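
To illustrate the contract, here is a minimal hypothetical implementation of the interface; LocalFileFetcher is illustrative only and does not appear in this commit:

from typing import Any

from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface


class LocalFileFetcher(DataFetcherInterface):
    def fetcher_name(self) -> str:
        return "local_file_fetcher"

    async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]) -> str:
        # A local file needs no fetching; the path already points at the content.
        return data_item_path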

cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py (new file, +70)
@@ -0,0 +1,70 @@
from cognee.modules.ingestion import save_data_to_file
from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface
from typing import Any
from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.shared.logging_utils import get_logger

logger = get_logger()


class WebUrlFetcher(DataFetcherInterface):
    def __init__(self): ...

    def fetcher_name(self):
        return "web_url_fetcher"

    async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]):
        from cognee.context_global_variables import tavily_config, soup_crawler_config
        from cognee.tasks.web_scraper import fetch_page_content

        web_url_fetcher_config = fetchers_config.get(self.fetcher_name())
        if not isinstance(web_url_fetcher_config, dict):
            raise IngestionError(f"{self.fetcher_name()} configuration must be a valid dictionary")

        tavily_dict = web_url_fetcher_config.get("tavily_config")
        _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None

        soup_dict = web_url_fetcher_config.get("soup_config")
        _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None

        # Set global configs for downstream access
        tavily_config.set(_tavily_config)
        soup_crawler_config.set(_soup_config)

        preferred_tool = "beautifulsoup" if _soup_config else "tavily"
        if preferred_tool == "tavily" and _tavily_config is None:
            raise IngestionError(
                message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
            )
        if preferred_tool == "beautifulsoup" and _soup_config is None:
            raise IngestionError(
                message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
            )

        logger.info(f"Starting web URL crawling for: {data_item_path}")
        logger.info(f"Using scraping tool: {preferred_tool}")

        data = await fetch_page_content(
            data_item_path,
            preferred_tool=preferred_tool,
            soup_crawler_config=_soup_config,
            tavily_config=_tavily_config,
        )

        logger.info(f"Successfully fetched content from URL {data_item_path}")

        # fetch_page_content returns a dict like {url: content}
        # Extract the content string before saving
        if isinstance(data, dict):
            # Concatenate all URL contents (usually just one URL)
            content = ""
            for url, text in data.items():
                content += f"{url}:\n{text}\n\n"
            logger.info(
                f"Extracted content from {len(data)} URL(s), total size: {len(content)} characters"
            )
        else:
            content = data

        return await save_data_to_file(content)
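
A sketch of how the fetcher might be driven end to end. The nesting (a tavily_config dict under the "web_url_fetcher" key) follows the parsing logic above; the api_key field is an assumption about TavilyConfig, and the inner dict must be non-empty or it is treated as absent:

import asyncio

from cognee.tasks.ingestion.data_fetchers import WebUrlFetcher


async def main():
    fetcher = WebUrlFetcher()
    fetchers_config = {
        # Keyed by fetcher_name(); "api_key" is an assumed TavilyConfig field.
        "web_url_fetcher": {"tavily_config": {"api_key": "tvly-..."}},
    }
    saved_path = await fetcher.fetch("https://example.com", fetchers_config)
    print(f"Fetched content saved to {saved_path}")


asyncio.run(main())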