From 791e38b2c0b87095200e9cd4f6744d52fd37e46c Mon Sep 17 00:00:00 2001 From: Geoff-Robin Date: Mon, 6 Oct 2025 18:00:20 +0530 Subject: [PATCH] Solved more nitpick comments --- cognee/api/v1/add/add.py | 28 ++++++----- cognee/tasks/web_scraper/config.py | 4 +- cognee/tasks/web_scraper/utils.py | 11 ++++- .../tasks/web_scraping/web_scraping_test.py | 48 ++++++++----------- 4 files changed, 45 insertions(+), 46 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 65ebb8748..3942fcf5f 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -171,19 +171,7 @@ async def add( - TAVILY_API_KEY: YOUR_TAVILY_API_KEY """ - tasks = [ - Task(resolve_data_directories, include_subdirectories=True), - Task( - ingest_data, - dataset_name, - user, - node_set, - dataset_id, - preferred_loaders, - ), - ] - - await setup() + if not soup_crawler_config and extraction_rules: soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules) if not tavily_config and os.getenv("TAVILY_API_KEY"): @@ -202,6 +190,20 @@ async def add( elif isinstance(data, list) and any(_is_http_url(item) for item in data): node_set = ["web_content"] if not node_set else node_set + ["web_content"] + tasks = [ + Task(resolve_data_directories, include_subdirectories=True), + Task( + ingest_data, + dataset_name, + user, + node_set, + dataset_id, + preferred_loaders, + ), + ] + + await setup() + user, authorized_dataset = await resolve_authorized_user_dataset(dataset_id, dataset_name, user) await reset_dataset_pipeline_run_status( diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index e81db9b05..4b54c6470 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -1,12 +1,12 @@ from pydantic import BaseModel, Field -from typing import Any, Dict, Optional, Literal +from typing import Any, Dict, Optional import os class TavilyConfig(BaseModel): api_key: str = os.getenv("TAVILY_API_KEY") extract_depth: str = "basic" - timeout: int = Field(None, ge=1, le=60) + timeout: Optional[int] = Field(default=None, ge=1, le=60) class SoupCrawlerConfig(BaseModel): diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 53346fb1c..62a614e6e 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -58,8 +58,15 @@ async def fetch_page_content( "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1" ) raise - crawler = BeautifulSoupCrawler() - if not soup_crawler_config and soup_crawler_config.extraction_rules is None: + crawler = BeautifulSoupCrawler( + concurrency=soup_crawler_config.concurrency, + crawl_delay=soup_crawler_config.crawl_delay, + timeout=soup_crawler_config.timeout, + max_retries=soup_crawler_config.max_retries, + retry_delay_factor=soup_crawler_config.retry_delay_factor, + headers=soup_crawler_config.headers, + ) + if not soup_crawler_config or soup_crawler_config.extraction_rules is None: raise ValueError("extraction_rules must be provided when not using Tavily") extraction_rules = soup_crawler_config.extraction_rules try: diff --git a/cognee/tests/tasks/web_scraping/web_scraping_test.py b/cognee/tests/tasks/web_scraping/web_scraping_test.py index 3e1fa2f78..c598ef536 100644 --- a/cognee/tests/tasks/web_scraping/web_scraping_test.py +++ b/cognee/tests/tasks/web_scraping/web_scraping_test.py @@ -139,48 +139,38 @@ async def test_cron_web_scraper(): # Run cron_web_scraper_task await cron_web_scraper_task( url=urls, - schedule="*/3 * * * *", # every 3 minutes job_name="cron_scraping_job", extraction_rules=extraction_rules, ) + results = await cognee.search( + "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?", + query_type=cognee.SearchType.GRAPH_COMPLETION, + ) - scraping_job_done = await graph_db.get_node(uuid5(NAMESPACE_OID, name="cron_scraping_job")) - while True: - if scraping_job_done: - results = await cognee.search( - "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?", - query_type=cognee.SearchType.GRAPH_COMPLETION, - ) + assert "Albert Einstein" in results[0] - assert "Albert Einstein" in results[0] + results_books = await cognee.search( + "What is the price of 'A Light in the Attic' book?", + query_type=cognee.SearchType.GRAPH_COMPLETION, + ) - results_books = await cognee.search( - "What is the price of 'A Light in the Attic' book?", - query_type=cognee.SearchType.GRAPH_COMPLETION, - ) + assert "51.77" in results_books[0] - assert "51.77" in results_books[0] - - print("Cron job web_scraping test passed!") - break - else: - scraping_job_done = await graph_db.get_node( - uuid5(NAMESPACE_OID, name="cron_scraping_job") - ) + print("Cron job web_scraping test passed!") async def main(): - # print("Starting BS4 incremental loading test...") - # await test_web_scraping_using_bs4_and_incremental_loading() + print("Starting BS4 incremental loading test...") + await test_web_scraping_using_bs4_and_incremental_loading() - # print("Starting BS4 normal test...") - # await test_web_scraping_using_bs4() + print("Starting BS4 normal test...") + await test_web_scraping_using_bs4() - # print("Starting Tavily incremental loading test...") - # await test_web_scraping_using_tavily_and_incremental_loading() + print("Starting Tavily incremental loading test...") + await test_web_scraping_using_tavily_and_incremental_loading() - # print("Starting Tavily normal test...") - # await test_web_scraping_using_tavily() + print("Starting Tavily normal test...") + await test_web_scraping_using_tavily() print("Starting cron job test...") await test_cron_web_scraper()