Resolved more nitpick comments

Geoff-Robin 2025-10-06 18:00:20 +05:30
parent 1b5c099f8b
commit 791e38b2c0
4 changed files with 45 additions and 46 deletions

View file

@@ -171,19 +171,7 @@ async def add(
             - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
     """
-    tasks = [
-        Task(resolve_data_directories, include_subdirectories=True),
-        Task(
-            ingest_data,
-            dataset_name,
-            user,
-            node_set,
-            dataset_id,
-            preferred_loaders,
-        ),
-    ]
-    await setup()
     if not soup_crawler_config and extraction_rules:
         soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
     if not tavily_config and os.getenv("TAVILY_API_KEY"):
@@ -202,6 +190,20 @@ async def add(
     elif isinstance(data, list) and any(_is_http_url(item) for item in data):
         node_set = ["web_content"] if not node_set else node_set + ["web_content"]
+    tasks = [
+        Task(resolve_data_directories, include_subdirectories=True),
+        Task(
+            ingest_data,
+            dataset_name,
+            user,
+            node_set,
+            dataset_id,
+            preferred_loaders,
+        ),
+    ]
+    await setup()
+    user, authorized_dataset = await resolve_authorized_user_dataset(dataset_id, dataset_name, user)
     await reset_dataset_pipeline_run_status(
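
Note on this hunk: the task list is now built only after the URL handling above it, which matters because Task captures its arguments when the list is constructed. A minimal sketch of the effect, with stand-ins for cognee's Task and ingest_data and placeholder dataset values:

class Task:
    """Stand-in for cognee's Task: it snapshots its arguments eagerly."""
    def __init__(self, fn, *args):
        self.fn, self.args = fn, args

def ingest_data(*args):
    pass  # placeholder for the real ingestion step

node_set = None
data = ["https://example.com/page"]

# URL detection now runs first, so the web tag is already present...
if any(item.startswith("http") for item in data):
    node_set = ["web_content"] if not node_set else node_set + ["web_content"]

# ...when the pipeline snapshots node_set; building tasks earlier, as the
# old placement did, would have frozen node_set without "web_content".
tasks = [Task(ingest_data, "my_dataset", None, node_set, None, None)]
print(tasks[0].args[2])  # ['web_content']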

View file

@@ -1,12 +1,12 @@
 from pydantic import BaseModel, Field
-from typing import Any, Dict, Optional, Literal
+from typing import Any, Dict, Optional
 import os


 class TavilyConfig(BaseModel):
     api_key: str = os.getenv("TAVILY_API_KEY")
     extract_depth: str = "basic"
-    timeout: int = Field(None, ge=1, le=60)
+    timeout: Optional[int] = Field(default=None, ge=1, le=60)


 class SoupCrawlerConfig(BaseModel):
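
For reference, a minimal sketch of the corrected field pattern, assuming pydantic v2 semantics (TavilyConfig trimmed here to the relevant field):

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class TavilyConfig(BaseModel):
    extract_depth: str = "basic"
    # Optional[int] makes the None default consistent with the annotation;
    # ge/le still constrain any value the caller actually provides.
    timeout: Optional[int] = Field(default=None, ge=1, le=60)

print(TavilyConfig().timeout)            # None: omitting timeout is allowed
print(TavilyConfig(timeout=30).timeout)  # 30: within [1, 60]
try:
    TavilyConfig(timeout=0)              # rejected: violates ge=1
except ValidationError as exc:
    print(exc)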

View file

@@ -58,8 +58,15 @@ async def fetch_page_content(
             "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
         )
         raise
-    crawler = BeautifulSoupCrawler()
-    if not soup_crawler_config and soup_crawler_config.extraction_rules is None:
+    crawler = BeautifulSoupCrawler(
+        concurrency=soup_crawler_config.concurrency,
+        crawl_delay=soup_crawler_config.crawl_delay,
+        timeout=soup_crawler_config.timeout,
+        max_retries=soup_crawler_config.max_retries,
+        retry_delay_factor=soup_crawler_config.retry_delay_factor,
+        headers=soup_crawler_config.headers,
+    )
+    if not soup_crawler_config or soup_crawler_config.extraction_rules is None:
         raise ValueError("extraction_rules must be provided when not using Tavily")
     extraction_rules = soup_crawler_config.extraction_rules
     try:
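
The `and` to `or` swap in the guard is the substantive fix here. A small sketch of the difference, with a bare-bones stand-in for SoupCrawlerConfig:

class Config:
    """Bare-bones stand-in for SoupCrawlerConfig."""
    def __init__(self, extraction_rules=None):
        self.extraction_rules = extraction_rules

def should_reject(cfg):
    # Fixed guard: short-circuits safely when cfg is None and also
    # catches a config that exists but has no extraction_rules.
    return not cfg or cfg.extraction_rules is None

print(should_reject(None))                     # True  -> ValueError path
print(should_reject(Config()))                 # True  -> ValueError path
print(should_reject(Config({"title": "h1"})))  # False -> proceed

# The old guard, `not cfg and cfg.extraction_rules is None`, raised
# AttributeError when cfg was None (the right operand dereferences None)
# and returned False for a config with missing rules, skipping the check.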

View file

@@ -139,48 +139,38 @@ async def test_cron_web_scraper():
     # Run cron_web_scraper_task
     await cron_web_scraper_task(
         url=urls,
         schedule="*/3 * * * *",  # every 3 minutes
         job_name="cron_scraping_job",
         extraction_rules=extraction_rules,
     )
-    results = await cognee.search(
-        "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
-        query_type=cognee.SearchType.GRAPH_COMPLETION,
-    )
+    scraping_job_done = await graph_db.get_node(uuid5(NAMESPACE_OID, name="cron_scraping_job"))
+    while True:
+        if scraping_job_done:
+            results = await cognee.search(
+                "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
+                query_type=cognee.SearchType.GRAPH_COMPLETION,
+            )
-    assert "Albert Einstein" in results[0]
+            assert "Albert Einstein" in results[0]
-    results_books = await cognee.search(
-        "What is the price of 'A Light in the Attic' book?",
-        query_type=cognee.SearchType.GRAPH_COMPLETION,
-    )
+            results_books = await cognee.search(
+                "What is the price of 'A Light in the Attic' book?",
+                query_type=cognee.SearchType.GRAPH_COMPLETION,
+            )
-    assert "51.77" in results_books[0]
+            assert "51.77" in results_books[0]
+            print("Cron job web_scraping test passed!")
+            break
+        else:
+            scraping_job_done = await graph_db.get_node(
+                uuid5(NAMESPACE_OID, name="cron_scraping_job")
+            )
-    print("Cron job web_scraping test passed!")
 async def main():
-    # print("Starting BS4 incremental loading test...")
-    # await test_web_scraping_using_bs4_and_incremental_loading()
+    print("Starting BS4 incremental loading test...")
+    await test_web_scraping_using_bs4_and_incremental_loading()
-    # print("Starting BS4 normal test...")
-    # await test_web_scraping_using_bs4()
+    print("Starting BS4 normal test...")
+    await test_web_scraping_using_bs4()
-    # print("Starting Tavily incremental loading test...")
-    # await test_web_scraping_using_tavily_and_incremental_loading()
+    print("Starting Tavily incremental loading test...")
+    await test_web_scraping_using_tavily_and_incremental_loading()
-    # print("Starting Tavily normal test...")
-    # await test_web_scraping_using_tavily()
+    print("Starting Tavily normal test...")
+    await test_web_scraping_using_tavily()
     print("Starting cron job test...")
     await test_cron_web_scraper()
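
The new test polls in a bare `while True` loop; a sleep between checks and an upper bound on attempts are common additions to that pattern. A sketch under those assumptions, reusing graph_db, uuid5, and NAMESPACE_OID as they appear in the test (the helper name and poll parameters are hypothetical):

import asyncio
from uuid import uuid5, NAMESPACE_OID

async def wait_for_cron_job(graph_db, job_name, poll_seconds=10, max_polls=30):
    """Poll for the node the cron job writes on completion, with a timeout."""
    for _ in range(max_polls):
        node = await graph_db.get_node(uuid5(NAMESPACE_OID, job_name))
        if node:
            return node
        await asyncio.sleep(poll_seconds)  # avoid a busy-wait between checks
    raise TimeoutError(f"{job_name} did not complete after {max_polls} polls")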