diff --git a/cognee/tasks/web_scraper/models.py b/cognee/tasks/web_scraper/models.py index 297aebb4f..aed2db0ed 100644 --- a/cognee/tasks/web_scraper/models.py +++ b/cognee/tasks/web_scraper/models.py @@ -16,7 +16,7 @@ class WebPage(DataPoint): page_size: int extraction_rules: Dict[str, Any] # CSS selectors, XPath rules used description: str - metadata: dict = {"index_fields": ["name", "description"]} + metadata: dict = {"index_fields": ["name", "description", "content"]} class WebSite(DataPoint): diff --git a/cognee/tasks/web_scraper/web_scraper_task.py b/cognee/tasks/web_scraper/web_scraper_task.py index 52b73b5b5..52154c6ef 100644 --- a/cognee/tasks/web_scraper/web_scraper_task.py +++ b/cognee/tasks/web_scraper/web_scraper_task.py @@ -250,13 +250,13 @@ async def web_scraper_task( # Create WebPage content_str = content if isinstance(content, str) else str(content) content_hash = hashlib.sha256(content_str.encode("utf-8")).hexdigest() - + content_preview = content_str[:500] + ("..." if len(content_str) > 500 else "") # Create description for WebPage webpage_description = ( f"Webpage: {parsed_url.path.lstrip('/') or 'Home'}\n" f"URL: {page_url}\n" f"Scraped at: {now.strftime('%Y-%m-%d %H:%M:%S')}\n" - f"Content: {content_str}\n" + f"Content: {content_preview}\n" f"Content type: text\n" f"Page size: {len(content_str)} bytes\n" f"Status code: 200"