# Origin: cognee/tasks/web_scraper/models.py, introduced by commit
# 6348c9d8de04bd9e7f2a8db82bdfc313662d26de ("Created models.py",
# Geoff-Robin, Tue, 30 Sep 2025 20:46:26 +0530).
"""Data models for the web scraper task: pages, sites, and scraping jobs."""

from datetime import datetime
from typing import Any, Dict, List, Optional

from cognee.infrastructure.engine.models import DataPoint

# NOTE(review): nullable fields below default to None so callers may omit
# them. Presumably DataPoint is a pydantic model, where an Optional[...]
# annotation without a default is still a *required* field - confirm.


class WebPage(DataPoint):
    """Represents a scraped web page with metadata.

    Only ``title`` and ``last_modified`` may be omitted (they default to
    ``None``); every other field must be supplied by the caller.
    """

    url: str
    title: Optional[str] = None
    content: str
    content_hash: str
    scraped_at: datetime
    last_modified: Optional[datetime] = None
    status_code: int
    content_type: str
    page_size: int
    extraction_rules: Dict[str, Any]  # CSS selectors, XPath rules used
    # Attributes named for indexing; how DataPoint consumes this mapping is
    # defined outside this file.
    # NOTE(review): scraped_at is a datetime - confirm the indexer accepts
    # non-text fields.
    metadata: dict = {"index_fields": ["url", "title", "scraped_at"]}


class WebSite(DataPoint):
    """Represents a website or domain being scraped.

    ``robots_txt`` may be omitted (defaults to ``None``); every other field
    must be supplied by the caller.
    """

    domain: str
    base_url: str
    robots_txt: Optional[str] = None
    crawl_delay: float
    last_crawled: datetime
    page_count: int
    scraping_config: Dict[str, Any]
    # Attributes named for indexing (consumed by DataPoint machinery that is
    # not visible here).
    metadata: dict = {"index_fields": ["domain", "base_url"]}


class ScrapingJob(DataPoint):
    """Represents a scraping job configuration.

    ``schedule``, ``last_run`` and ``next_run`` may be omitted (they default
    to ``None``), e.g. for a job that has not been scheduled or run yet.
    """

    job_name: str
    urls: List[str]
    scraping_rules: Dict[str, Any]
    schedule: Optional[str] = None  # Cron-like schedule for recurring scrapes
    status: str  # "active", "paused", "completed", "failed"
    last_run: Optional[datetime] = None
    next_run: Optional[datetime] = None
    # Attributes named for indexing (consumed by DataPoint machinery that is
    # not visible here).
    metadata: dict = {"index_fields": ["job_name", "status"]}