Solved more nitpick comments
parent 1b5c099f8b
commit 791e38b2c0
4 changed files with 45 additions and 46 deletions
@@ -171,19 +171,7 @@ async def add(
         - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
     """
-    tasks = [
-        Task(resolve_data_directories, include_subdirectories=True),
-        Task(
-            ingest_data,
-            dataset_name,
-            user,
-            node_set,
-            dataset_id,
-            preferred_loaders,
-        ),
-    ]
 
-    await setup()
 
     if not soup_crawler_config and extraction_rules:
         soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
     if not tavily_config and os.getenv("TAVILY_API_KEY"):
@@ -202,6 +190,20 @@ async def add(
     elif isinstance(data, list) and any(_is_http_url(item) for item in data):
         node_set = ["web_content"] if not node_set else node_set + ["web_content"]
 
+    tasks = [
+        Task(resolve_data_directories, include_subdirectories=True),
+        Task(
+            ingest_data,
+            dataset_name,
+            user,
+            node_set,
+            dataset_id,
+            preferred_loaders,
+        ),
+    ]
+
+    await setup()
+
     user, authorized_dataset = await resolve_authorized_user_dataset(dataset_id, dataset_name, user)
 
     await reset_dataset_pipeline_run_status(
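Why the move matters: Task captures its arguments when the list is built, and `node_set = node_set + ["web_content"]` rebinds the name to a new list, so a task list built before URL detection (the old placement in the first hunk) would hold the stale value. A minimal sketch with a toy Task stand-in, not cognee's actual class:

class Task:
    """Toy stand-in for cognee's Task: captures fn and args at construction time."""

    def __init__(self, fn, *args):
        self.fn, self.args = fn, args

    def run(self):
        return self.fn(*self.args)


def ingest_data(node_set):
    return node_set


node_set = ["documents"]
early = Task(ingest_data, node_set)    # old order: built before URL detection
node_set = node_set + ["web_content"]  # rebinding leaves `early` holding the stale list
late = Task(ingest_data, node_set)     # new order, as in this commit

assert early.run() == ["documents"]
assert late.run() == ["documents", "web_content"]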
@@ -1,12 +1,12 @@
 from pydantic import BaseModel, Field
-from typing import Any, Dict, Optional, Literal
+from typing import Any, Dict, Optional
 import os
 
 
 class TavilyConfig(BaseModel):
     api_key: str = os.getenv("TAVILY_API_KEY")
     extract_depth: str = "basic"
-    timeout: int = Field(None, ge=1, le=60)
+    timeout: Optional[int] = Field(default=None, ge=1, le=60)
 
 
 class SoupCrawlerConfig(BaseModel):
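The timeout fix is more than style: `timeout: int = Field(None, ge=1, le=60)` declares a non-optional int with a None default, which type checkers flag and which pydantic only tolerates because defaults are not validated. `Optional[int]` with an explicit `default=None` makes the field honestly nullable while keeping the range constraints. A standalone sketch of the fixed field:

from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class TavilyConfig(BaseModel):
    # Nullable field: None means "unset"; ge/le still validate real values.
    timeout: Optional[int] = Field(default=None, ge=1, le=60)


print(TavilyConfig().timeout)            # None is accepted
print(TavilyConfig(timeout=30).timeout)  # in-range value passes

try:
    TavilyConfig(timeout=0)              # below ge=1
except ValidationError as err:
    print("rejected:", err)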
@@ -58,8 +58,15 @@ async def fetch_page_content(
             "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
         )
         raise
-    crawler = BeautifulSoupCrawler()
-    if not soup_crawler_config and soup_crawler_config.extraction_rules is None:
+    crawler = BeautifulSoupCrawler(
+        concurrency=soup_crawler_config.concurrency,
+        crawl_delay=soup_crawler_config.crawl_delay,
+        timeout=soup_crawler_config.timeout,
+        max_retries=soup_crawler_config.max_retries,
+        retry_delay_factor=soup_crawler_config.retry_delay_factor,
+        headers=soup_crawler_config.headers,
+    )
+    if not soup_crawler_config or soup_crawler_config.extraction_rules is None:
         raise ValueError("extraction_rules must be provided when not using Tavily")
     extraction_rules = soup_crawler_config.extraction_rules
     try:
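The `and` to `or` change in the guard is a genuine bug fix. With `and`, a present config whose extraction_rules is None short-circuits to False and the ValueError never fires, while a missing config goes on to evaluate `None.extraction_rules` and raises AttributeError instead of the intended error. A minimal sketch with a stand-in config object:

class FakeSoupConfig:
    """Stand-in for SoupCrawlerConfig with no extraction rules set."""

    extraction_rules = None


def needs_rules(cfg):
    # New form: missing config OR missing rules both trigger the error path.
    return not cfg or cfg.extraction_rules is None


assert needs_rules(None) is True              # no config at all
assert needs_rules(FakeSoupConfig()) is True  # config present, rules missing

# Old form for comparison: `not cfg and cfg.extraction_rules is None`
# returns False for FakeSoupConfig() (the rules check is short-circuited away)
# and raises AttributeError for None instead of a clean ValueError.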
@@ -139,48 +139,38 @@ async def test_cron_web_scraper():
     # Run cron_web_scraper_task
     await cron_web_scraper_task(
         url=urls,
         schedule="*/3 * * * *",  # every 3 minutes
         job_name="cron_scraping_job",
         extraction_rules=extraction_rules,
     )
-    results = await cognee.search(
-        "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
-        query_type=cognee.SearchType.GRAPH_COMPLETION,
-    )
-
-    assert "Albert Einstein" in results[0]
-
-    results_books = await cognee.search(
-        "What is the price of 'A Light in the Attic' book?",
-        query_type=cognee.SearchType.GRAPH_COMPLETION,
-    )
-
-    assert "51.77" in results_books[0]
-
-    print("Cron job web_scraping test passed!")
+    scraping_job_done = await graph_db.get_node(uuid5(NAMESPACE_OID, name="cron_scraping_job"))
+    while True:
+        if scraping_job_done:
+            results = await cognee.search(
+                "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
+                query_type=cognee.SearchType.GRAPH_COMPLETION,
+            )
+            assert "Albert Einstein" in results[0]
+
+            results_books = await cognee.search(
+                "What is the price of 'A Light in the Attic' book?",
+                query_type=cognee.SearchType.GRAPH_COMPLETION,
+            )
+            assert "51.77" in results_books[0]
+
+            print("Cron job web_scraping test passed!")
+            break
+        else:
+            scraping_job_done = await graph_db.get_node(
+                uuid5(NAMESPACE_OID, name="cron_scraping_job")
+            )
 
 
 async def main():
-    # print("Starting BS4 incremental loading test...")
-    # await test_web_scraping_using_bs4_and_incremental_loading()
+    print("Starting BS4 incremental loading test...")
+    await test_web_scraping_using_bs4_and_incremental_loading()
 
-    # print("Starting BS4 normal test...")
-    # await test_web_scraping_using_bs4()
+    print("Starting BS4 normal test...")
+    await test_web_scraping_using_bs4()
 
-    # print("Starting Tavily incremental loading test...")
-    # await test_web_scraping_using_tavily_and_incremental_loading()
+    print("Starting Tavily incremental loading test...")
+    await test_web_scraping_using_tavily_and_incremental_loading()
 
-    # print("Starting Tavily normal test...")
-    # await test_web_scraping_using_tavily()
+    print("Starting Tavily normal test...")
+    await test_web_scraping_using_tavily()
 
     print("Starting cron job test...")
     await test_cron_web_scraper()
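The rewritten test polls the graph store until the cron job's node exists before running the searches, instead of asserting immediately after scheduling. The general pattern, sketched with a hypothetical get_node coroutine (the real test re-queries graph_db.get_node with the uuid5 of the job name; the sleep interval here is an assumption, the committed loop re-queries without pausing):

import asyncio


async def wait_for_node(get_node, key, interval=1.0):
    """Poll until get_node(key) returns a truthy node, then hand it back."""
    while True:
        node = await get_node(key)
        if node:
            return node
        await asyncio.sleep(interval)  # assumption: the committed loop spins without sleeping


# Usage with a dummy store that "finds" the node on the third query:
async def demo():
    calls = {"n": 0}

    async def fake_get_node(key):
        calls["n"] += 1
        return {"id": key} if calls["n"] >= 3 else None

    node = await wait_for_node(fake_get_node, "cron_scraping_job", interval=0.01)
    print("found after", calls["n"], "queries:", node)


asyncio.run(demo())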