Solved more nitpick comments
This commit is contained in:
parent
1b5c099f8b
commit
791e38b2c0
4 changed files with 45 additions and 46 deletions
|
|
@ -171,19 +171,7 @@ async def add(
|
||||||
- TAVILY_API_KEY: YOUR_TAVILY_API_KEY
|
- TAVILY_API_KEY: YOUR_TAVILY_API_KEY
|
||||||
|
|
||||||
"""
|
"""
|
||||||
tasks = [
|
|
||||||
Task(resolve_data_directories, include_subdirectories=True),
|
|
||||||
Task(
|
|
||||||
ingest_data,
|
|
||||||
dataset_name,
|
|
||||||
user,
|
|
||||||
node_set,
|
|
||||||
dataset_id,
|
|
||||||
preferred_loaders,
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
await setup()
|
|
||||||
if not soup_crawler_config and extraction_rules:
|
if not soup_crawler_config and extraction_rules:
|
||||||
soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
|
soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
|
||||||
if not tavily_config and os.getenv("TAVILY_API_KEY"):
|
if not tavily_config and os.getenv("TAVILY_API_KEY"):
|
||||||
|
|
@ -202,6 +190,20 @@ async def add(
|
||||||
elif isinstance(data, list) and any(_is_http_url(item) for item in data):
|
elif isinstance(data, list) and any(_is_http_url(item) for item in data):
|
||||||
node_set = ["web_content"] if not node_set else node_set + ["web_content"]
|
node_set = ["web_content"] if not node_set else node_set + ["web_content"]
|
||||||
|
|
||||||
|
tasks = [
|
||||||
|
Task(resolve_data_directories, include_subdirectories=True),
|
||||||
|
Task(
|
||||||
|
ingest_data,
|
||||||
|
dataset_name,
|
||||||
|
user,
|
||||||
|
node_set,
|
||||||
|
dataset_id,
|
||||||
|
preferred_loaders,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
await setup()
|
||||||
|
|
||||||
user, authorized_dataset = await resolve_authorized_user_dataset(dataset_id, dataset_name, user)
|
user, authorized_dataset = await resolve_authorized_user_dataset(dataset_id, dataset_name, user)
|
||||||
|
|
||||||
await reset_dataset_pipeline_run_status(
|
await reset_dataset_pipeline_run_status(
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,12 @@
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import Any, Dict, Optional, Literal
|
from typing import Any, Dict, Optional
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
class TavilyConfig(BaseModel):
|
class TavilyConfig(BaseModel):
|
||||||
api_key: str = os.getenv("TAVILY_API_KEY")
|
api_key: str = os.getenv("TAVILY_API_KEY")
|
||||||
extract_depth: str = "basic"
|
extract_depth: str = "basic"
|
||||||
timeout: int = Field(None, ge=1, le=60)
|
timeout: Optional[int] = Field(default=None, ge=1, le=60)
|
||||||
|
|
||||||
|
|
||||||
class SoupCrawlerConfig(BaseModel):
|
class SoupCrawlerConfig(BaseModel):
|
||||||
|
|
|
||||||
|
|
@ -58,8 +58,15 @@ async def fetch_page_content(
|
||||||
"Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
|
"Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
crawler = BeautifulSoupCrawler()
|
crawler = BeautifulSoupCrawler(
|
||||||
if not soup_crawler_config and soup_crawler_config.extraction_rules is None:
|
concurrency=soup_crawler_config.concurrency,
|
||||||
|
crawl_delay=soup_crawler_config.crawl_delay,
|
||||||
|
timeout=soup_crawler_config.timeout,
|
||||||
|
max_retries=soup_crawler_config.max_retries,
|
||||||
|
retry_delay_factor=soup_crawler_config.retry_delay_factor,
|
||||||
|
headers=soup_crawler_config.headers,
|
||||||
|
)
|
||||||
|
if not soup_crawler_config or soup_crawler_config.extraction_rules is None:
|
||||||
raise ValueError("extraction_rules must be provided when not using Tavily")
|
raise ValueError("extraction_rules must be provided when not using Tavily")
|
||||||
extraction_rules = soup_crawler_config.extraction_rules
|
extraction_rules = soup_crawler_config.extraction_rules
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -139,48 +139,38 @@ async def test_cron_web_scraper():
|
||||||
# Run cron_web_scraper_task
|
# Run cron_web_scraper_task
|
||||||
await cron_web_scraper_task(
|
await cron_web_scraper_task(
|
||||||
url=urls,
|
url=urls,
|
||||||
schedule="*/3 * * * *", # every 3 minutes
|
|
||||||
job_name="cron_scraping_job",
|
job_name="cron_scraping_job",
|
||||||
extraction_rules=extraction_rules,
|
extraction_rules=extraction_rules,
|
||||||
)
|
)
|
||||||
|
results = await cognee.search(
|
||||||
|
"Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
|
||||||
|
query_type=cognee.SearchType.GRAPH_COMPLETION,
|
||||||
|
)
|
||||||
|
|
||||||
scraping_job_done = await graph_db.get_node(uuid5(NAMESPACE_OID, name="cron_scraping_job"))
|
assert "Albert Einstein" in results[0]
|
||||||
while True:
|
|
||||||
if scraping_job_done:
|
|
||||||
results = await cognee.search(
|
|
||||||
"Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
|
|
||||||
query_type=cognee.SearchType.GRAPH_COMPLETION,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert "Albert Einstein" in results[0]
|
results_books = await cognee.search(
|
||||||
|
"What is the price of 'A Light in the Attic' book?",
|
||||||
|
query_type=cognee.SearchType.GRAPH_COMPLETION,
|
||||||
|
)
|
||||||
|
|
||||||
results_books = await cognee.search(
|
assert "51.77" in results_books[0]
|
||||||
"What is the price of 'A Light in the Attic' book?",
|
|
||||||
query_type=cognee.SearchType.GRAPH_COMPLETION,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert "51.77" in results_books[0]
|
print("Cron job web_scraping test passed!")
|
||||||
|
|
||||||
print("Cron job web_scraping test passed!")
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
scraping_job_done = await graph_db.get_node(
|
|
||||||
uuid5(NAMESPACE_OID, name="cron_scraping_job")
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# print("Starting BS4 incremental loading test...")
|
print("Starting BS4 incremental loading test...")
|
||||||
# await test_web_scraping_using_bs4_and_incremental_loading()
|
await test_web_scraping_using_bs4_and_incremental_loading()
|
||||||
|
|
||||||
# print("Starting BS4 normal test...")
|
print("Starting BS4 normal test...")
|
||||||
# await test_web_scraping_using_bs4()
|
await test_web_scraping_using_bs4()
|
||||||
|
|
||||||
# print("Starting Tavily incremental loading test...")
|
print("Starting Tavily incremental loading test...")
|
||||||
# await test_web_scraping_using_tavily_and_incremental_loading()
|
await test_web_scraping_using_tavily_and_incremental_loading()
|
||||||
|
|
||||||
# print("Starting Tavily normal test...")
|
print("Starting Tavily normal test...")
|
||||||
# await test_web_scraping_using_tavily()
|
await test_web_scraping_using_tavily()
|
||||||
|
|
||||||
print("Starting cron job test...")
|
print("Starting cron job test...")
|
||||||
await test_cron_web_scraper()
|
await test_cron_web_scraper()
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue