Resolved more nitpick comments

Geoff-Robin 2025-10-06 18:00:20 +05:30
parent 1b5c099f8b
commit 791e38b2c0
4 changed files with 45 additions and 46 deletions

View file

@@ -171,19 +171,7 @@ async def add(
             - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
     """
-    tasks = [
-        Task(resolve_data_directories, include_subdirectories=True),
-        Task(
-            ingest_data,
-            dataset_name,
-            user,
-            node_set,
-            dataset_id,
-            preferred_loaders,
-        ),
-    ]
-    await setup()
     if not soup_crawler_config and extraction_rules:
         soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
     if not tavily_config and os.getenv("TAVILY_API_KEY"):
@@ -202,6 +190,20 @@ async def add(
     elif isinstance(data, list) and any(_is_http_url(item) for item in data):
         node_set = ["web_content"] if not node_set else node_set + ["web_content"]
+    tasks = [
+        Task(resolve_data_directories, include_subdirectories=True),
+        Task(
+            ingest_data,
+            dataset_name,
+            user,
+            node_set,
+            dataset_id,
+            preferred_loaders,
+        ),
+    ]
+    await setup()
+    user, authorized_dataset = await resolve_authorized_user_dataset(dataset_id, dataset_name, user)
     await reset_dataset_pipeline_run_status(
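
Note on this hunk: the task list is now built only after the URL handling above it, which matters because Task captures its arguments when the list is constructed. A minimal sketch of the effect, with stand-ins for cognee's Task and ingest_data and placeholder dataset values:

class Task:
    """Stand-in for cognee's Task: it snapshots its arguments eagerly."""
    def __init__(self, fn, *args):
        self.fn, self.args = fn, args

def ingest_data(*args):
    pass  # placeholder for the real ingestion step

node_set = None
data = ["https://example.com/page"]

# URL detection now runs first, so the web tag is already present...
if any(item.startswith("http") for item in data):
    node_set = ["web_content"] if not node_set else node_set + ["web_content"]

# ...when the pipeline snapshots node_set; building tasks earlier, as the
# old placement did, would have frozen node_set without "web_content".
tasks = [Task(ingest_data, "my_dataset", None, node_set, None, None)]
print(tasks[0].args[2])  # ['web_content']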

View file

@@ -1,12 +1,12 @@
 from pydantic import BaseModel, Field
-from typing import Any, Dict, Optional, Literal
+from typing import Any, Dict, Optional
 import os


 class TavilyConfig(BaseModel):
     api_key: str = os.getenv("TAVILY_API_KEY")
     extract_depth: str = "basic"
-    timeout: int = Field(None, ge=1, le=60)
+    timeout: Optional[int] = Field(default=None, ge=1, le=60)


 class SoupCrawlerConfig(BaseModel):
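
For reference, a minimal sketch of the corrected field pattern, assuming pydantic v2 semantics (TavilyConfig trimmed here to the relevant field):

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class TavilyConfig(BaseModel):
    extract_depth: str = "basic"
    # Optional[int] makes the None default consistent with the annotation;
    # ge/le still constrain any value the caller actually provides.
    timeout: Optional[int] = Field(default=None, ge=1, le=60)

print(TavilyConfig().timeout)            # None: omitting timeout is allowed
print(TavilyConfig(timeout=30).timeout)  # 30: within [1, 60]
try:
    TavilyConfig(timeout=0)              # rejected: violates ge=1
except ValidationError as exc:
    print(exc)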

View file

@@ -58,8 +58,15 @@ async def fetch_page_content(
             "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
         )
         raise
-    crawler = BeautifulSoupCrawler()
-    if not soup_crawler_config and soup_crawler_config.extraction_rules is None:
+    crawler = BeautifulSoupCrawler(
+        concurrency=soup_crawler_config.concurrency,
+        crawl_delay=soup_crawler_config.crawl_delay,
+        timeout=soup_crawler_config.timeout,
+        max_retries=soup_crawler_config.max_retries,
+        retry_delay_factor=soup_crawler_config.retry_delay_factor,
+        headers=soup_crawler_config.headers,
+    )
+    if not soup_crawler_config or soup_crawler_config.extraction_rules is None:
         raise ValueError("extraction_rules must be provided when not using Tavily")
     extraction_rules = soup_crawler_config.extraction_rules
     try:
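
The `and` to `or` swap in the guard is the substantive fix here. A small sketch of the difference, with a bare-bones stand-in for SoupCrawlerConfig:

class Config:
    """Bare-bones stand-in for SoupCrawlerConfig."""
    def __init__(self, extraction_rules=None):
        self.extraction_rules = extraction_rules

def should_reject(cfg):
    # Fixed guard: short-circuits safely when cfg is None and also
    # catches a config that exists but has no extraction_rules.
    return not cfg or cfg.extraction_rules is None

print(should_reject(None))                     # True  -> ValueError path
print(should_reject(Config()))                 # True  -> ValueError path
print(should_reject(Config({"title": "h1"})))  # False -> proceed

# The old guard, `not cfg and cfg.extraction_rules is None`, raised
# AttributeError when cfg was None (the right operand dereferences None)
# and returned False for a config with missing rules, skipping the check.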

View file

@@ -139,48 +139,38 @@ async def test_cron_web_scraper():
     # Run cron_web_scraper_task
     await cron_web_scraper_task(
         url=urls,
         schedule="*/3 * * * *",  # every 3 minutes
         job_name="cron_scraping_job",
         extraction_rules=extraction_rules,
     )
-    results = await cognee.search(
-        "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
-        query_type=cognee.SearchType.GRAPH_COMPLETION,
-    )
+    scraping_job_done = await graph_db.get_node(uuid5(NAMESPACE_OID, name="cron_scraping_job"))
+    while True:
+        if scraping_job_done:
+            results = await cognee.search(
+                "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
+                query_type=cognee.SearchType.GRAPH_COMPLETION,
+            )
-    assert "Albert Einstein" in results[0]
+            assert "Albert Einstein" in results[0]
-    results_books = await cognee.search(
-        "What is the price of 'A Light in the Attic' book?",
-        query_type=cognee.SearchType.GRAPH_COMPLETION,
-    )
+            results_books = await cognee.search(
+                "What is the price of 'A Light in the Attic' book?",
+                query_type=cognee.SearchType.GRAPH_COMPLETION,
+            )
-    assert "51.77" in results_books[0]
+            assert "51.77" in results_books[0]
+            print("Cron job web_scraping test passed!")
+            break
+        else:
+            scraping_job_done = await graph_db.get_node(
+                uuid5(NAMESPACE_OID, name="cron_scraping_job")
+            )
-    print("Cron job web_scraping test passed!")
 async def main():
-    # print("Starting BS4 incremental loading test...")
-    # await test_web_scraping_using_bs4_and_incremental_loading()
+    print("Starting BS4 incremental loading test...")
+    await test_web_scraping_using_bs4_and_incremental_loading()
-    # print("Starting BS4 normal test...")
-    # await test_web_scraping_using_bs4()
+    print("Starting BS4 normal test...")
+    await test_web_scraping_using_bs4()
-    # print("Starting Tavily incremental loading test...")
-    # await test_web_scraping_using_tavily_and_incremental_loading()
+    print("Starting Tavily incremental loading test...")
+    await test_web_scraping_using_tavily_and_incremental_loading()
-    # print("Starting Tavily normal test...")
-    # await test_web_scraping_using_tavily()
+    print("Starting Tavily normal test...")
+    await test_web_scraping_using_tavily()
     print("Starting cron job test...")
     await test_cron_web_scraper()
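
The new test polls in a bare `while True` loop; a sleep between checks and an upper bound on attempts are common additions to that pattern. A sketch under those assumptions, reusing graph_db, uuid5, and NAMESPACE_OID as they appear in the test (the helper name and poll parameters are hypothetical):

import asyncio
from uuid import uuid5, NAMESPACE_OID

async def wait_for_cron_job(graph_db, job_name, poll_seconds=10, max_polls=30):
    """Poll for the node the cron job writes on completion, with a timeout."""
    for _ in range(max_polls):
        node = await graph_db.get_node(uuid5(NAMESPACE_OID, job_name))
        if node:
            return node
        await asyncio.sleep(poll_seconds)  # avoid a busy-wait between checks
    raise TimeoutError(f"{job_name} did not complete after {max_polls} polls")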