Solved more nitpick comments

2025-10-06 18:00:20 +05:30 · 2025-10-06 18:00:20 +05:30 · 791e38b2c0
commit 791e38b2c0
parent 1b5c099f8b
4 changed files with 45 additions and 46 deletions
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@ -171,19 +171,7 @@ async def add(
        - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
    """
-    tasks = [
+    
        Task(resolve_data_directories, include_subdirectories=True),
        Task(
            ingest_data,
            dataset_name,
            user,
            node_set,
            dataset_id,
            preferred_loaders,
        ),
    ]
    await setup()
    if not soup_crawler_config and extraction_rules:
        soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
    if not tavily_config and os.getenv("TAVILY_API_KEY"):
@ -202,6 +190,20 @@ async def add(
    elif isinstance(data, list) and any(_is_http_url(item) for item in data):
        node_set = ["web_content"] if not node_set else node_set + ["web_content"]
    tasks = [
        Task(resolve_data_directories, include_subdirectories=True),
        Task(
            ingest_data,
            dataset_name,
            user,
            node_set,
            dataset_id,
            preferred_loaders,
        ),
    ]
    await setup()
    user, authorized_dataset = await resolve_authorized_user_dataset(dataset_id, dataset_name, user)
    await reset_dataset_pipeline_run_status(
--- a/cognee/tasks/web_scraper/config.py
+++ b/cognee/tasks/web_scraper/config.py
@ -1,12 +1,12 @@
 from pydantic import BaseModel, Field
-from typing import Any, Dict, Optional, Literal
+from typing import Any, Dict, Optional
 import os
 class TavilyConfig(BaseModel):
    api_key: str = os.getenv("TAVILY_API_KEY")
    extract_depth: str = "basic"
-    timeout: int = Field(None, ge=1, le=60)
+    timeout: Optional[int] = Field(default=None, ge=1, le=60)
 class SoupCrawlerConfig(BaseModel):
--- a/cognee/tasks/web_scraper/utils.py
+++ b/cognee/tasks/web_scraper/utils.py
@ -58,8 +58,15 @@ async def fetch_page_content(
                "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
            )
            raise
-        crawler = BeautifulSoupCrawler()
+        crawler = BeautifulSoupCrawler(
-        if not soup_crawler_config and soup_crawler_config.extraction_rules is None:
+            concurrency=soup_crawler_config.concurrency,
            crawl_delay=soup_crawler_config.crawl_delay,
            timeout=soup_crawler_config.timeout,
            max_retries=soup_crawler_config.max_retries,
            retry_delay_factor=soup_crawler_config.retry_delay_factor,
            headers=soup_crawler_config.headers,
        )
        if not soup_crawler_config or soup_crawler_config.extraction_rules is None:
            raise ValueError("extraction_rules must be provided when not using Tavily")
        extraction_rules = soup_crawler_config.extraction_rules
        try:
--- a/cognee/tests/tasks/web_scraping/web_scraping_test.py
+++ b/cognee/tests/tasks/web_scraping/web_scraping_test.py
@ -139,48 +139,38 @@ async def test_cron_web_scraper():
    # Run cron_web_scraper_task
    await cron_web_scraper_task(
        url=urls,
        schedule="*/3 * * * *",  # every 3 minutes
        job_name="cron_scraping_job",
        extraction_rules=extraction_rules,
    )
    results = await cognee.search(
        "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
        query_type=cognee.SearchType.GRAPH_COMPLETION,
    )
-    scraping_job_done = await graph_db.get_node(uuid5(NAMESPACE_OID, name="cron_scraping_job"))
+    assert "Albert Einstein" in results[0]
    while True:
        if scraping_job_done:
            results = await cognee.search(
                "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
                query_type=cognee.SearchType.GRAPH_COMPLETION,
            )
-            assert "Albert Einstein" in results[0]
+    results_books = await cognee.search(
        "What is the price of 'A Light in the Attic' book?",
        query_type=cognee.SearchType.GRAPH_COMPLETION,
    )
-            results_books = await cognee.search(
+    assert "51.77" in results_books[0]
                "What is the price of 'A Light in the Attic' book?",
                query_type=cognee.SearchType.GRAPH_COMPLETION,
            )
-            assert "51.77" in results_books[0]
+    print("Cron job web_scraping test passed!")
            print("Cron job web_scraping test passed!")
            break
        else:
            scraping_job_done = await graph_db.get_node(
                uuid5(NAMESPACE_OID, name="cron_scraping_job")
            )
 async def main():
-    # print("Starting BS4 incremental loading test...")
+    print("Starting BS4 incremental loading test...")
-    # await test_web_scraping_using_bs4_and_incremental_loading()
+    await test_web_scraping_using_bs4_and_incremental_loading()
-    # print("Starting BS4 normal test...")
+    print("Starting BS4 normal test...")
-    # await test_web_scraping_using_bs4()
+    await test_web_scraping_using_bs4()
-    # print("Starting Tavily incremental loading test...")
+    print("Starting Tavily incremental loading test...")
-    # await test_web_scraping_using_tavily_and_incremental_loading()
+    await test_web_scraping_using_tavily_and_incremental_loading()
-    # print("Starting Tavily normal test...")
+    print("Starting Tavily normal test...")
-    # await test_web_scraping_using_tavily()
+    await test_web_scraping_using_tavily()
    print("Starting cron job test...")
    await test_cron_web_scraper()