diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py
index c67afe7de..731895201 100644
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@@ -12,11 +12,15 @@ from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
 )
 from cognee.modules.engine.operations.setup import setup
 from cognee.tasks.ingestion import ingest_data, resolve_data_directories
-from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
-from cognee.context_global_variables import (
-    tavily_config as tavily,
-    soup_crawler_config as soup_crawler,
-)
+try:
+    from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
+    from cognee.context_global_variables import (
+        tavily_config as tavily,
+        soup_crawler_config as soup_crawler,
+    )
+except ImportError:  # web_scraper extras not installed; scraper setup becomes a no-op
+    pass
+from pydantic import BaseModel
 from urllib.parse import urlparse
 
 
@@ -31,8 +35,8 @@ async def add(
     preferred_loaders: List[str] = None,
     incremental_loading: bool = True,
     extraction_rules: Optional[Dict[str, Any]] = None,
-    tavily_config: Optional[TavilyConfig] = None,
-    soup_crawler_config: Optional[SoupCrawlerConfig] = None,
+    tavily_config: Optional[BaseModel] = None,
+    soup_crawler_config: Optional[BaseModel] = None,
 ):
     """
     Add data to Cognee for knowledge graph processing.
@@ -172,21 +176,27 @@ async def add(
     """
 
-    if not soup_crawler_config and extraction_rules:
-        soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
-    if not tavily_config and os.getenv("TAVILY_API_KEY"):
-        tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))
+    try:
+        if not soup_crawler_config and extraction_rules:
+            soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
+        if not tavily_config and os.getenv("TAVILY_API_KEY"):
+            tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))
 
-    soup_crawler.set(soup_crawler_config)
-    tavily.set(tavily_config)
+        soup_crawler.set(soup_crawler_config)
+        tavily.set(tavily_config)
+    except NameError:
+        # The optional web_scraper import at module top failed, so the config
+        # classes/context vars never came into scope; skip scraper setup only.
+        pass
 
     http_schemes = {"http", "https"}
 
     def _is_http_url(item: Union[str, BinaryIO]) -> bool:
         return isinstance(item, str) and urlparse(item).scheme in http_schemes
+
     if _is_http_url(data):
         node_set = ["web_content"] if not node_set else node_set + ["web_content"]
     elif isinstance(data, list) and any(_is_http_url(item) for item in data):
         node_set = ["web_content"] if not node_set else node_set + ["web_content"]
 
     tasks = [
         Task(resolve_data_directories, include_subdirectories=True),
diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py
index 105f17c0d..9df5e6e57 100644
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -7,7 +7,6 @@ from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from cognee.context_global_variables import tavily_config, soup_crawler_config
 
 logger = get_logger()
 
@@ -59,6 +58,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
     elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
         # Validate URL by sending a HEAD request
         try:
+            from cognee.context_global_variables import tavily_config, soup_crawler_config
             from cognee.tasks.web_scraper import fetch_page_content
 
             tavily = tavily_config.get()