(WIP) Fix/fix web parsing (#1552)
<!-- .github/pull_request_template.md --> ## Description <!-- Please provide a clear, human-generated description of the changes in this PR. DO NOT use AI-generated descriptions. We want to understand your thought process and reasoning. --> This PR (using TDD): 1. Separates web crawling implementation into separate fetching, and parsing (loader) steps 2. Fetching is used in `save_data_item_to_storage`. Default settings are used for fetching 3. Loader produces a txt file, scraping the fetched html and saves it in a txt file (`html_hash.html` -> `html_hash.txt`), similar to how we process pdf files ## Type of Change <!-- Please check the relevant option --> - [x] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [x] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) <!-- Add screenshots or videos to help explain your changes --> ## Pre-submission Checklist <!-- Please check all boxes that apply before submitting your PR --> - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
commit
738759bc5b
24 changed files with 859 additions and 339 deletions
1
.github/workflows/basic_tests.yml
vendored
1
.github/workflows/basic_tests.yml
vendored
|
|
@ -123,6 +123,7 @@ jobs:
|
||||||
uses: ./.github/actions/cognee_setup
|
uses: ./.github/actions/cognee_setup
|
||||||
with:
|
with:
|
||||||
python-version: ${{ inputs.python-version }}
|
python-version: ${{ inputs.python-version }}
|
||||||
|
extra-dependencies: "scraping"
|
||||||
|
|
||||||
- name: Run Integration Tests
|
- name: Run Integration Tests
|
||||||
run: uv run pytest cognee/tests/integration/
|
run: uv run pytest cognee/tests/integration/
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,5 @@
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
import os
|
from typing import Union, BinaryIO, List, Optional, Any
|
||||||
from typing import Union, BinaryIO, List, Optional, Dict, Any
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
from cognee.modules.users.models import User
|
from cognee.modules.users.models import User
|
||||||
from cognee.modules.pipelines import Task, run_pipeline
|
from cognee.modules.pipelines import Task, run_pipeline
|
||||||
from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
|
from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
|
||||||
|
|
@ -17,16 +14,6 @@ from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
try:
|
|
||||||
from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
|
|
||||||
from cognee.context_global_variables import (
|
|
||||||
tavily_config as tavily,
|
|
||||||
soup_crawler_config as soup_crawler,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
logger.debug(f"Unable to import {str(ImportError)}")
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
async def add(
|
async def add(
|
||||||
data: Union[BinaryIO, list[BinaryIO], str, list[str]],
|
data: Union[BinaryIO, list[BinaryIO], str, list[str]],
|
||||||
|
|
@ -36,11 +23,8 @@ async def add(
|
||||||
vector_db_config: dict = None,
|
vector_db_config: dict = None,
|
||||||
graph_db_config: dict = None,
|
graph_db_config: dict = None,
|
||||||
dataset_id: Optional[UUID] = None,
|
dataset_id: Optional[UUID] = None,
|
||||||
preferred_loaders: List[str] = None,
|
preferred_loaders: dict[str, dict[str, Any]] = None,
|
||||||
incremental_loading: bool = True,
|
incremental_loading: bool = True,
|
||||||
extraction_rules: Optional[Dict[str, Any]] = None,
|
|
||||||
tavily_config: Optional[BaseModel] = None,
|
|
||||||
soup_crawler_config: Optional[BaseModel] = None,
|
|
||||||
data_per_batch: Optional[int] = 20,
|
data_per_batch: Optional[int] = 20,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|
@ -180,29 +164,6 @@ async def add(
|
||||||
- TAVILY_API_KEY: YOUR_TAVILY_API_KEY
|
- TAVILY_API_KEY: YOUR_TAVILY_API_KEY
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
|
||||||
if not soup_crawler_config and extraction_rules:
|
|
||||||
soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
|
|
||||||
if not tavily_config and os.getenv("TAVILY_API_KEY"):
|
|
||||||
tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))
|
|
||||||
|
|
||||||
soup_crawler.set(soup_crawler_config)
|
|
||||||
tavily.set(tavily_config)
|
|
||||||
|
|
||||||
http_schemes = {"http", "https"}
|
|
||||||
|
|
||||||
def _is_http_url(item: Union[str, BinaryIO]) -> bool:
|
|
||||||
return isinstance(item, str) and urlparse(item).scheme in http_schemes
|
|
||||||
|
|
||||||
if _is_http_url(data):
|
|
||||||
node_set = ["web_content"] if not node_set else node_set + ["web_content"]
|
|
||||||
elif isinstance(data, list) and any(_is_http_url(item) for item in data):
|
|
||||||
node_set = ["web_content"] if not node_set else node_set + ["web_content"]
|
|
||||||
except NameError:
|
|
||||||
logger.debug(f"Unable to import {str(ImportError)}")
|
|
||||||
pass
|
|
||||||
|
|
||||||
tasks = [
|
tasks = [
|
||||||
Task(resolve_data_directories, include_subdirectories=True),
|
Task(resolve_data_directories, include_subdirectories=True),
|
||||||
Task(
|
Task(
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
from typing import Union, BinaryIO, List, Optional
|
from typing import Union, BinaryIO, List, Optional, Any
|
||||||
|
|
||||||
from cognee.modules.users.models import User
|
from cognee.modules.users.models import User
|
||||||
from cognee.api.v1.delete import delete
|
from cognee.api.v1.delete import delete
|
||||||
|
|
@ -15,7 +15,7 @@ async def update(
|
||||||
node_set: Optional[List[str]] = None,
|
node_set: Optional[List[str]] = None,
|
||||||
vector_db_config: dict = None,
|
vector_db_config: dict = None,
|
||||||
graph_db_config: dict = None,
|
graph_db_config: dict = None,
|
||||||
preferred_loaders: List[str] = None,
|
preferred_loaders: dict[str, dict[str, Any]] = None,
|
||||||
incremental_loading: bool = True,
|
incremental_loading: bool = True,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -13,8 +13,6 @@ from cognee.modules.users.methods import get_user
|
||||||
vector_db_config = ContextVar("vector_db_config", default=None)
|
vector_db_config = ContextVar("vector_db_config", default=None)
|
||||||
graph_db_config = ContextVar("graph_db_config", default=None)
|
graph_db_config = ContextVar("graph_db_config", default=None)
|
||||||
session_user = ContextVar("session_user", default=None)
|
session_user = ContextVar("session_user", default=None)
|
||||||
soup_crawler_config = ContextVar("soup_crawler_config", default=None)
|
|
||||||
tavily_config = ContextVar("tavily_config", default=None)
|
|
||||||
|
|
||||||
|
|
||||||
async def set_session_user_context_variable(user):
|
async def set_session_user_context_variable(user):
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,9 @@ class LoaderEngine:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def get_loader(
|
def get_loader(
|
||||||
self, file_path: str, preferred_loaders: List[str] = None
|
self,
|
||||||
|
file_path: str,
|
||||||
|
preferred_loaders: dict[str, dict[str, Any]],
|
||||||
) -> Optional[LoaderInterface]:
|
) -> Optional[LoaderInterface]:
|
||||||
"""
|
"""
|
||||||
Get appropriate loader for a file.
|
Get appropriate loader for a file.
|
||||||
|
|
@ -76,14 +78,21 @@ class LoaderEngine:
|
||||||
Returns:
|
Returns:
|
||||||
LoaderInterface that can handle the file, or None if not found
|
LoaderInterface that can handle the file, or None if not found
|
||||||
"""
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
file_info = filetype.guess(file_path)
|
file_info = filetype.guess(file_path)
|
||||||
|
|
||||||
|
path_extension = Path(file_path).suffix.lstrip(".")
|
||||||
|
|
||||||
# Try preferred loaders first
|
# Try preferred loaders first
|
||||||
if preferred_loaders:
|
if preferred_loaders:
|
||||||
for loader_name in preferred_loaders:
|
for loader_name in preferred_loaders:
|
||||||
if loader_name in self._loaders:
|
if loader_name in self._loaders:
|
||||||
loader = self._loaders[loader_name]
|
loader = self._loaders[loader_name]
|
||||||
|
# Try with path extension first (for text formats like html)
|
||||||
|
if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
|
||||||
|
return loader
|
||||||
|
# Fall back to content-detected extension
|
||||||
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
||||||
return loader
|
return loader
|
||||||
else:
|
else:
|
||||||
|
|
@ -93,6 +102,10 @@ class LoaderEngine:
|
||||||
for loader_name in self.default_loader_priority:
|
for loader_name in self.default_loader_priority:
|
||||||
if loader_name in self._loaders:
|
if loader_name in self._loaders:
|
||||||
loader = self._loaders[loader_name]
|
loader = self._loaders[loader_name]
|
||||||
|
# Try with path extension first (for text formats like html)
|
||||||
|
if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
|
||||||
|
return loader
|
||||||
|
# Fall back to content-detected extension
|
||||||
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
||||||
return loader
|
return loader
|
||||||
else:
|
else:
|
||||||
|
|
@ -105,7 +118,7 @@ class LoaderEngine:
|
||||||
async def load_file(
|
async def load_file(
|
||||||
self,
|
self,
|
||||||
file_path: str,
|
file_path: str,
|
||||||
preferred_loaders: Optional[List[str]] = None,
|
preferred_loaders: dict[str, dict[str, Any]] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|
@ -113,7 +126,7 @@ class LoaderEngine:
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the file to be processed
|
file_path: Path to the file to be processed
|
||||||
preferred_loaders: List of preferred loader names to try first
|
preferred_loaders: Dict of loader names to their configurations
|
||||||
**kwargs: Additional loader-specific configuration
|
**kwargs: Additional loader-specific configuration
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
|
|
@ -125,8 +138,16 @@ class LoaderEngine:
|
||||||
raise ValueError(f"No loader found for file: {file_path}")
|
raise ValueError(f"No loader found for file: {file_path}")
|
||||||
|
|
||||||
logger.debug(f"Loading {file_path} with {loader.loader_name}")
|
logger.debug(f"Loading {file_path} with {loader.loader_name}")
|
||||||
# TODO: loading needs to be reworked to work with both file streams and file locations
|
|
||||||
return await loader.load(file_path, **kwargs)
|
# Extract loader-specific config from preferred_loaders
|
||||||
|
loader_config = {}
|
||||||
|
if preferred_loaders and loader.loader_name in preferred_loaders:
|
||||||
|
loader_config = preferred_loaders[loader.loader_name]
|
||||||
|
|
||||||
|
# Merge with any additional kwargs (kwargs take precedence)
|
||||||
|
merged_kwargs = {**loader_config, **kwargs}
|
||||||
|
|
||||||
|
return await loader.load(file_path, **merged_kwargs)
|
||||||
|
|
||||||
def get_available_loaders(self) -> List[str]:
|
def get_available_loaders(self) -> List[str]:
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -27,3 +27,10 @@ try:
|
||||||
__all__.append("AdvancedPdfLoader")
|
__all__.append("AdvancedPdfLoader")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .beautiful_soup_loader import BeautifulSoupLoader
|
||||||
|
|
||||||
|
__all__.append("BeautifulSoupLoader")
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
|
||||||
224
cognee/infrastructure/loaders/external/beautiful_soup_loader.py
vendored
Normal file
224
cognee/infrastructure/loaders/external/beautiful_soup_loader.py
vendored
Normal file
|
|
@ -0,0 +1,224 @@
|
||||||
|
"""BeautifulSoup-based web crawler for extracting content from web pages.
|
||||||
|
|
||||||
|
This module provides the BeautifulSoupCrawler class for fetching and extracting content
|
||||||
|
from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
|
||||||
|
supports robots.txt handling, rate limiting, and custom extraction rules.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Union, Dict, Any, Optional, List
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ExtractionRule:
|
||||||
|
"""Normalized extraction rule for web content.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
selector: CSS selector for extraction (if any).
|
||||||
|
xpath: XPath expression for extraction (if any).
|
||||||
|
attr: HTML attribute to extract (if any).
|
||||||
|
all: If True, extract all matching elements; otherwise, extract first.
|
||||||
|
join_with: String to join multiple extracted elements.
|
||||||
|
"""
|
||||||
|
|
||||||
|
selector: Optional[str] = None
|
||||||
|
xpath: Optional[str] = None
|
||||||
|
attr: Optional[str] = None
|
||||||
|
all: bool = False
|
||||||
|
join_with: str = " "
|
||||||
|
|
||||||
|
|
||||||
|
class BeautifulSoupLoader(LoaderInterface):
|
||||||
|
"""Crawler for fetching and extracting web content using BeautifulSoup.
|
||||||
|
|
||||||
|
Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
|
||||||
|
compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
concurrency: Number of concurrent requests allowed.
|
||||||
|
crawl_delay: Minimum seconds between requests to the same domain.
|
||||||
|
max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
|
||||||
|
timeout: Per-request timeout in seconds.
|
||||||
|
max_retries: Number of retries for failed requests.
|
||||||
|
retry_delay_factor: Multiplier for exponential backoff on retries.
|
||||||
|
headers: HTTP headers for requests (e.g., User-Agent).
|
||||||
|
robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supported_extensions(self) -> List[str]:
|
||||||
|
return ["html"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supported_mime_types(self) -> List[str]:
|
||||||
|
return ["text/html", "text/plain"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def loader_name(self) -> str:
|
||||||
|
return "beautiful_soup_loader"
|
||||||
|
|
||||||
|
def can_handle(self, extension: str, mime_type: str) -> bool:
|
||||||
|
can = extension in self.supported_extensions and mime_type in self.supported_mime_types
|
||||||
|
return can
|
||||||
|
|
||||||
|
async def load(
|
||||||
|
self,
|
||||||
|
file_path: str,
|
||||||
|
extraction_rules: dict[str, Any] = None,
|
||||||
|
join_all_matches: bool = False,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
"""Load an HTML file, extract content, and save to storage.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the HTML file
|
||||||
|
extraction_rules: Dict of CSS selector rules for content extraction
|
||||||
|
join_all_matches: If True, extract all matching elements for each rule
|
||||||
|
**kwargs: Additional arguments
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to the stored extracted text file
|
||||||
|
"""
|
||||||
|
if extraction_rules is None:
|
||||||
|
raise ValueError("extraction_rules required for BeautifulSoupLoader")
|
||||||
|
|
||||||
|
logger.info(f"Processing HTML file: {file_path}")
|
||||||
|
|
||||||
|
from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
|
||||||
|
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
|
||||||
|
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
file_metadata = await get_file_metadata(f)
|
||||||
|
f.seek(0)
|
||||||
|
html = f.read()
|
||||||
|
|
||||||
|
storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
|
||||||
|
|
||||||
|
# Normalize extraction rules
|
||||||
|
normalized_rules: List[ExtractionRule] = []
|
||||||
|
for _, rule in extraction_rules.items():
|
||||||
|
r = self._normalize_rule(rule)
|
||||||
|
if join_all_matches:
|
||||||
|
r.all = True
|
||||||
|
normalized_rules.append(r)
|
||||||
|
|
||||||
|
pieces = []
|
||||||
|
for rule in normalized_rules:
|
||||||
|
text = self._extract_from_html(html, rule)
|
||||||
|
if text:
|
||||||
|
pieces.append(text)
|
||||||
|
|
||||||
|
full_content = " ".join(pieces).strip()
|
||||||
|
|
||||||
|
# Fallback: If no content extracted, check if the file is plain text (not HTML)
|
||||||
|
if not full_content:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
# If there are no HTML tags, treat as plain text
|
||||||
|
if not soup.find():
|
||||||
|
logger.warning(
|
||||||
|
f"No HTML tags found in {file_path}. Treating as plain text. "
|
||||||
|
"This may happen when content is pre-extracted (e.g., via Tavily with text format)."
|
||||||
|
)
|
||||||
|
full_content = html.decode("utf-8") if isinstance(html, bytes) else html
|
||||||
|
full_content = full_content.strip()
|
||||||
|
|
||||||
|
if not full_content:
|
||||||
|
logger.warning(f"No content extracted from HTML file: {file_path}")
|
||||||
|
|
||||||
|
# Store the extracted content
|
||||||
|
storage_config = get_storage_config()
|
||||||
|
data_root_directory = storage_config["data_root_directory"]
|
||||||
|
storage = get_file_storage(data_root_directory)
|
||||||
|
|
||||||
|
full_file_path = await storage.store(storage_file_name, full_content)
|
||||||
|
|
||||||
|
logger.info(f"Extracted {len(full_content)} characters from HTML")
|
||||||
|
return full_file_path
|
||||||
|
|
||||||
|
def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
|
||||||
|
"""Normalize an extraction rule to an ExtractionRule dataclass.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rule: A string (CSS selector) or dict with extraction parameters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ExtractionRule: Normalized extraction rule.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the rule is invalid.
|
||||||
|
"""
|
||||||
|
if isinstance(rule, str):
|
||||||
|
return ExtractionRule(selector=rule)
|
||||||
|
if isinstance(rule, dict):
|
||||||
|
return ExtractionRule(
|
||||||
|
selector=rule.get("selector"),
|
||||||
|
xpath=rule.get("xpath"),
|
||||||
|
attr=rule.get("attr"),
|
||||||
|
all=bool(rule.get("all", False)),
|
||||||
|
join_with=rule.get("join_with", " "),
|
||||||
|
)
|
||||||
|
raise ValueError(f"Invalid extraction rule: {rule}")
|
||||||
|
|
||||||
|
def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
|
||||||
|
"""Extract content from HTML using BeautifulSoup or lxml XPath.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
html: The HTML content to extract from.
|
||||||
|
rule: The extraction rule to apply.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The extracted content.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If XPath is used but lxml is not installed.
|
||||||
|
"""
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
|
if rule.xpath:
|
||||||
|
try:
|
||||||
|
from lxml import html as lxml_html
|
||||||
|
except ImportError:
|
||||||
|
raise RuntimeError(
|
||||||
|
"XPath requested but lxml is not available. Install lxml or use CSS selectors."
|
||||||
|
)
|
||||||
|
doc = lxml_html.fromstring(html)
|
||||||
|
nodes = doc.xpath(rule.xpath)
|
||||||
|
texts = []
|
||||||
|
for n in nodes:
|
||||||
|
if hasattr(n, "text_content"):
|
||||||
|
texts.append(n.text_content().strip())
|
||||||
|
else:
|
||||||
|
texts.append(str(n).strip())
|
||||||
|
return rule.join_with.join(t for t in texts if t)
|
||||||
|
|
||||||
|
if not rule.selector:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
if rule.all:
|
||||||
|
nodes = soup.select(rule.selector)
|
||||||
|
pieces = []
|
||||||
|
for el in nodes:
|
||||||
|
if rule.attr:
|
||||||
|
val = el.get(rule.attr)
|
||||||
|
if val:
|
||||||
|
pieces.append(val.strip())
|
||||||
|
else:
|
||||||
|
text = el.get_text(strip=True)
|
||||||
|
if text:
|
||||||
|
pieces.append(text)
|
||||||
|
return rule.join_with.join(pieces).strip()
|
||||||
|
else:
|
||||||
|
el = soup.select_one(rule.selector)
|
||||||
|
if el is None:
|
||||||
|
return ""
|
||||||
|
if rule.attr:
|
||||||
|
val = el.get(rule.attr)
|
||||||
|
return (val or "").strip()
|
||||||
|
return el.get_text(strip=True)
|
||||||
|
|
@ -23,3 +23,10 @@ try:
|
||||||
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
|
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from cognee.infrastructure.loaders.external import BeautifulSoupLoader
|
||||||
|
|
||||||
|
supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,12 @@
|
||||||
from typing import BinaryIO, Union
|
from typing import BinaryIO, Union, Optional
|
||||||
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
|
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
|
||||||
from .classify import classify
|
from .classify import classify
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
|
|
||||||
async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
|
async def save_data_to_file(
|
||||||
|
data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
|
||||||
|
):
|
||||||
storage_config = get_storage_config()
|
storage_config = get_storage_config()
|
||||||
|
|
||||||
data_root_directory = storage_config["data_root_directory"]
|
data_root_directory = storage_config["data_root_directory"]
|
||||||
|
|
@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
|
||||||
|
|
||||||
file_name = file_metadata["name"]
|
file_name = file_metadata["name"]
|
||||||
|
|
||||||
|
if file_extension is not None:
|
||||||
|
extension = file_extension.lstrip(".")
|
||||||
|
file_name_without_ext = file_name.rsplit(".", 1)[0]
|
||||||
|
file_name = f"{file_name_without_ext}.{extension}"
|
||||||
|
|
||||||
storage = get_file_storage(data_root_directory)
|
storage = get_file_storage(data_root_directory)
|
||||||
|
|
||||||
full_file_path = await storage.store(file_name, data)
|
full_file_path = await storage.store(file_name, data)
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
|
||||||
from cognee.modules.pipelines.layers.check_pipeline_run_qualification import (
|
from cognee.modules.pipelines.layers.check_pipeline_run_qualification import (
|
||||||
check_pipeline_run_qualification,
|
check_pipeline_run_qualification,
|
||||||
)
|
)
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
logger = get_logger("cognee.pipeline")
|
logger = get_logger("cognee.pipeline")
|
||||||
|
|
||||||
|
|
@ -80,7 +81,14 @@ async def run_pipeline_per_dataset(
|
||||||
return
|
return
|
||||||
|
|
||||||
pipeline_run = run_tasks(
|
pipeline_run = run_tasks(
|
||||||
tasks, dataset.id, data, user, pipeline_name, context, incremental_loading, data_per_batch
|
tasks,
|
||||||
|
dataset.id,
|
||||||
|
data,
|
||||||
|
user,
|
||||||
|
pipeline_name,
|
||||||
|
context,
|
||||||
|
incremental_loading,
|
||||||
|
data_per_batch,
|
||||||
)
|
)
|
||||||
|
|
||||||
async for pipeline_run_info in pipeline_run:
|
async for pipeline_run_info in pipeline_run:
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import os
|
import os
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from typing import List, Tuple
|
from typing import Any, List, Tuple
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
|
@ -34,7 +34,8 @@ async def pull_from_s3(file_path, destination_file) -> None:
|
||||||
|
|
||||||
|
|
||||||
async def data_item_to_text_file(
|
async def data_item_to_text_file(
|
||||||
data_item_path: str, preferred_loaders: List[str]
|
data_item_path: str,
|
||||||
|
preferred_loaders: dict[str, dict[str, Any]] = None,
|
||||||
) -> Tuple[str, LoaderInterface]:
|
) -> Tuple[str, LoaderInterface]:
|
||||||
if isinstance(data_item_path, str):
|
if isinstance(data_item_path, str):
|
||||||
parsed_url = urlparse(data_item_path)
|
parsed_url = urlparse(data_item_path)
|
||||||
|
|
@ -74,6 +75,5 @@ async def data_item_to_text_file(
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise IngestionError(message="Local files are not accepted.")
|
raise IngestionError(message="Local files are not accepted.")
|
||||||
|
|
||||||
# data is not a supported type
|
# data is not a supported type
|
||||||
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
|
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
|
||||||
import cognee.modules.ingestion as ingestion
|
import cognee.modules.ingestion as ingestion
|
||||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||||
from cognee.modules.data.models import Data
|
from cognee.modules.data.models import Data
|
||||||
|
from cognee.modules.ingestion.exceptions import IngestionError
|
||||||
from cognee.modules.users.models import User
|
from cognee.modules.users.models import User
|
||||||
from cognee.modules.users.methods import get_default_user
|
from cognee.modules.users.methods import get_default_user
|
||||||
from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
|
from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
|
||||||
|
|
@ -27,7 +28,7 @@ async def ingest_data(
|
||||||
user: User,
|
user: User,
|
||||||
node_set: Optional[List[str]] = None,
|
node_set: Optional[List[str]] = None,
|
||||||
dataset_id: UUID = None,
|
dataset_id: UUID = None,
|
||||||
preferred_loaders: List[str] = None,
|
preferred_loaders: dict[str, dict[str, Any]] = None,
|
||||||
):
|
):
|
||||||
if not user:
|
if not user:
|
||||||
user = await get_default_user()
|
user = await get_default_user()
|
||||||
|
|
@ -44,7 +45,7 @@ async def ingest_data(
|
||||||
user: User,
|
user: User,
|
||||||
node_set: Optional[List[str]] = None,
|
node_set: Optional[List[str]] = None,
|
||||||
dataset_id: UUID = None,
|
dataset_id: UUID = None,
|
||||||
preferred_loaders: List[str] = None,
|
preferred_loaders: dict[str, dict[str, Any]] = None,
|
||||||
):
|
):
|
||||||
new_datapoints = []
|
new_datapoints = []
|
||||||
existing_data_points = []
|
existing_data_points = []
|
||||||
|
|
@ -77,22 +78,27 @@ async def ingest_data(
|
||||||
dataset_data_map = {str(data.id): True for data in dataset_data}
|
dataset_data_map = {str(data.id): True for data in dataset_data}
|
||||||
|
|
||||||
for data_item in data:
|
for data_item in data:
|
||||||
# Get file path of data item or create a file it doesn't exist
|
# Get file path of data item or create a file if it doesn't exist
|
||||||
original_file_path = await save_data_item_to_storage(data_item)
|
original_file_path = await save_data_item_to_storage(data_item)
|
||||||
|
|
||||||
# Transform file path to be OS usable
|
# Transform file path to be OS usable
|
||||||
actual_file_path = get_data_file_path(original_file_path)
|
actual_file_path = get_data_file_path(original_file_path)
|
||||||
|
|
||||||
# Store all input data as text files in Cognee data storage
|
# Store all input data as text files in Cognee data storage
|
||||||
cognee_storage_file_path, loader_engine = await data_item_to_text_file(
|
cognee_storage_file_path, loader_engine = await data_item_to_text_file(
|
||||||
actual_file_path, preferred_loaders
|
actual_file_path,
|
||||||
|
preferred_loaders,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if loader_engine is None:
|
||||||
|
raise IngestionError("Loader cannot be None")
|
||||||
|
|
||||||
# Find metadata from original file
|
# Find metadata from original file
|
||||||
|
# Standard flow: extract metadata from both original and stored files
|
||||||
async with open_data_file(original_file_path) as file:
|
async with open_data_file(original_file_path) as file:
|
||||||
classified_data = ingestion.classify(file)
|
classified_data = ingestion.classify(file)
|
||||||
|
|
||||||
# data_id is the hash of original file contents + owner id to avoid duplicate data
|
# data_id is the hash of original file contents + owner id to avoid duplicate data
|
||||||
|
|
||||||
data_id = ingestion.identify(classified_data, user)
|
data_id = ingestion.identify(classified_data, user)
|
||||||
original_file_metadata = classified_data.get_metadata()
|
original_file_metadata = classified_data.get_metadata()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,8 @@ from cognee.modules.ingestion import save_data_to_file
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
from cognee.tasks.web_scraper.utils import fetch_page_content
|
||||||
|
|
||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
@ -18,13 +20,6 @@ class SaveDataSettings(BaseSettings):
|
||||||
model_config = SettingsConfigDict(env_file=".env", extra="allow")
|
model_config = SettingsConfigDict(env_file=".env", extra="allow")
|
||||||
|
|
||||||
|
|
||||||
class HTMLContent(str):
|
|
||||||
def __new__(cls, value: str):
|
|
||||||
if not ("<" in value and ">" in value):
|
|
||||||
raise ValueError("Not valid HTML-like content")
|
|
||||||
return super().__new__(cls, value)
|
|
||||||
|
|
||||||
|
|
||||||
settings = SaveDataSettings()
|
settings = SaveDataSettings()
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -63,40 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
|
||||||
if parsed_url.scheme == "s3":
|
if parsed_url.scheme == "s3":
|
||||||
return data_item
|
return data_item
|
||||||
elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
|
elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
|
||||||
# Validate URL by sending a HEAD request
|
urls_to_page_contents = await fetch_page_content(data_item)
|
||||||
try:
|
return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
|
||||||
from cognee.context_global_variables import tavily_config, soup_crawler_config
|
|
||||||
from cognee.tasks.web_scraper import fetch_page_content
|
|
||||||
|
|
||||||
tavily = tavily_config.get()
|
|
||||||
soup_crawler = soup_crawler_config.get()
|
|
||||||
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
|
|
||||||
if preferred_tool == "tavily" and tavily is None:
|
|
||||||
raise IngestionError(
|
|
||||||
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
|
|
||||||
)
|
|
||||||
if preferred_tool == "beautifulsoup" and soup_crawler is None:
|
|
||||||
raise IngestionError(
|
|
||||||
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
|
|
||||||
)
|
|
||||||
|
|
||||||
data = await fetch_page_content(
|
|
||||||
data_item,
|
|
||||||
preferred_tool=preferred_tool,
|
|
||||||
tavily_config=tavily,
|
|
||||||
soup_crawler_config=soup_crawler,
|
|
||||||
)
|
|
||||||
content = ""
|
|
||||||
for key, value in data.items():
|
|
||||||
content += f"{key}:\n{value}\n\n"
|
|
||||||
return await save_data_to_file(content)
|
|
||||||
except IngestionError:
|
|
||||||
raise
|
|
||||||
except Exception as e:
|
|
||||||
raise IngestionError(
|
|
||||||
message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# data is local file path
|
# data is local file path
|
||||||
elif parsed_url.scheme == "file":
|
elif parsed_url.scheme == "file":
|
||||||
if settings.accept_local_file_path:
|
if settings.accept_local_file_path:
|
||||||
|
|
|
||||||
|
|
@ -5,9 +5,24 @@ data in a graph database. It includes classes and functions for crawling web pag
|
||||||
BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
|
BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from .bs4_crawler import BeautifulSoupCrawler
|
|
||||||
from .utils import fetch_page_content
|
from .utils import fetch_page_content
|
||||||
from .web_scraper_task import cron_web_scraper_task, web_scraper_task
|
from .default_url_crawler import DefaultUrlCrawler
|
||||||
|
|
||||||
|
# Lazy import for web_scraper_task to avoid requiring apscheduler
|
||||||
|
# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ...
|
||||||
|
|
||||||
|
|
||||||
|
def __getattr__(name):
|
||||||
|
"""Lazy load web scraper task functions that require apscheduler."""
|
||||||
|
if name == "cron_web_scraper_task":
|
||||||
|
from .web_scraper_task import cron_web_scraper_task
|
||||||
|
|
||||||
|
return cron_web_scraper_task
|
||||||
|
elif name == "web_scraper_task":
|
||||||
|
from .web_scraper_task import web_scraper_task
|
||||||
|
|
||||||
|
return web_scraper_task
|
||||||
|
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
|
@ -15,4 +30,5 @@ __all__ = [
|
||||||
"fetch_page_content",
|
"fetch_page_content",
|
||||||
"cron_web_scraper_task",
|
"cron_web_scraper_task",
|
||||||
"web_scraper_task",
|
"web_scraper_task",
|
||||||
|
"DefaultUrlCrawler",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -10,14 +10,16 @@ class TavilyConfig(BaseModel):
|
||||||
timeout: Optional[int] = Field(default=10, ge=1, le=60)
|
timeout: Optional[int] = Field(default=10, ge=1, le=60)
|
||||||
|
|
||||||
|
|
||||||
class SoupCrawlerConfig(BaseModel):
|
class DefaultCrawlerConfig(BaseModel):
|
||||||
concurrency: int = 5
|
concurrency: int = 5
|
||||||
crawl_delay: float = 0.5
|
crawl_delay: float = 0.5
|
||||||
|
max_crawl_delay: Optional[float] = (
|
||||||
|
10.0 # Maximum crawl delay to respect from robots.txt (None = no limit)
|
||||||
|
)
|
||||||
timeout: float = 15.0
|
timeout: float = 15.0
|
||||||
max_retries: int = 2
|
max_retries: int = 2
|
||||||
retry_delay_factor: float = 0.5
|
retry_delay_factor: float = 0.5
|
||||||
headers: Optional[Dict[str, str]] = None
|
headers: Optional[Dict[str, str]] = None
|
||||||
extraction_rules: Dict[str, Any]
|
|
||||||
use_playwright: bool = False
|
use_playwright: bool = False
|
||||||
playwright_js_wait: float = 0.8
|
playwright_js_wait: float = 0.8
|
||||||
robots_cache_ttl: float = 3600.0
|
robots_cache_ttl: float = 3600.0
|
||||||
|
|
|
||||||
|
|
@ -1,21 +1,21 @@
|
||||||
"""BeautifulSoup-based web crawler for extracting content from web pages.
|
|
||||||
|
|
||||||
This module provides the BeautifulSoupCrawler class for fetching and extracting content
|
|
||||||
from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
|
|
||||||
supports robots.txt handling, rate limiting, and custom extraction rules.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import time
|
|
||||||
from typing import Union, List, Dict, Any, Optional
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
import time
|
||||||
|
from typing import Any, Union, List, Dict, Optional
|
||||||
|
from urllib.parse import urlparse
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from cognee.shared.logging_utils import get_logger
|
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
from cognee.tasks.web_scraper.types import UrlsToHtmls
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
try:
|
||||||
|
from protego import Protego
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
|
||||||
|
Protego = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
|
|
@ -25,31 +25,6 @@ except ImportError:
|
||||||
)
|
)
|
||||||
async_playwright = None
|
async_playwright = None
|
||||||
|
|
||||||
try:
|
|
||||||
from protego import Protego
|
|
||||||
except ImportError:
|
|
||||||
logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
|
|
||||||
Protego = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ExtractionRule:
|
|
||||||
"""Normalized extraction rule for web content.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
selector: CSS selector for extraction (if any).
|
|
||||||
xpath: XPath expression for extraction (if any).
|
|
||||||
attr: HTML attribute to extract (if any).
|
|
||||||
all: If True, extract all matching elements; otherwise, extract first.
|
|
||||||
join_with: String to join multiple extracted elements.
|
|
||||||
"""
|
|
||||||
|
|
||||||
selector: Optional[str] = None
|
|
||||||
xpath: Optional[str] = None
|
|
||||||
attr: Optional[str] = None
|
|
||||||
all: bool = False
|
|
||||||
join_with: str = " "
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RobotsTxtCache:
|
class RobotsTxtCache:
|
||||||
|
|
@ -66,27 +41,13 @@ class RobotsTxtCache:
|
||||||
timestamp: float = field(default_factory=time.time)
|
timestamp: float = field(default_factory=time.time)
|
||||||
|
|
||||||
|
|
||||||
class BeautifulSoupCrawler:
|
class DefaultUrlCrawler:
|
||||||
"""Crawler for fetching and extracting web content using BeautifulSoup.
|
|
||||||
|
|
||||||
Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
|
|
||||||
compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
concurrency: Number of concurrent requests allowed.
|
|
||||||
crawl_delay: Minimum seconds between requests to the same domain.
|
|
||||||
timeout: Per-request timeout in seconds.
|
|
||||||
max_retries: Number of retries for failed requests.
|
|
||||||
retry_delay_factor: Multiplier for exponential backoff on retries.
|
|
||||||
headers: HTTP headers for requests (e.g., User-Agent).
|
|
||||||
robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
concurrency: int = 5,
|
concurrency: int = 5,
|
||||||
crawl_delay: float = 0.5,
|
crawl_delay: float = 0.5,
|
||||||
|
max_crawl_delay: Optional[float] = 10.0,
|
||||||
timeout: float = 15.0,
|
timeout: float = 15.0,
|
||||||
max_retries: int = 2,
|
max_retries: int = 2,
|
||||||
retry_delay_factor: float = 0.5,
|
retry_delay_factor: float = 0.5,
|
||||||
|
|
@ -98,6 +59,7 @@ class BeautifulSoupCrawler:
|
||||||
Args:
|
Args:
|
||||||
concurrency: Number of concurrent requests allowed.
|
concurrency: Number of concurrent requests allowed.
|
||||||
crawl_delay: Minimum seconds between requests to the same domain.
|
crawl_delay: Minimum seconds between requests to the same domain.
|
||||||
|
max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
|
||||||
timeout: Per-request timeout in seconds.
|
timeout: Per-request timeout in seconds.
|
||||||
max_retries: Number of retries for failed requests.
|
max_retries: Number of retries for failed requests.
|
||||||
retry_delay_factor: Multiplier for exponential backoff on retries.
|
retry_delay_factor: Multiplier for exponential backoff on retries.
|
||||||
|
|
@ -107,6 +69,7 @@ class BeautifulSoupCrawler:
|
||||||
self.concurrency = concurrency
|
self.concurrency = concurrency
|
||||||
self._sem = asyncio.Semaphore(concurrency)
|
self._sem = asyncio.Semaphore(concurrency)
|
||||||
self.crawl_delay = crawl_delay
|
self.crawl_delay = crawl_delay
|
||||||
|
self.max_crawl_delay = max_crawl_delay
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
self.max_retries = max_retries
|
self.max_retries = max_retries
|
||||||
self.retry_delay_factor = retry_delay_factor
|
self.retry_delay_factor = retry_delay_factor
|
||||||
|
|
@ -183,7 +146,11 @@ class BeautifulSoupCrawler:
|
||||||
elapsed = time.time() - last
|
elapsed = time.time() - last
|
||||||
wait_for = delay - elapsed
|
wait_for = delay - elapsed
|
||||||
if wait_for > 0:
|
if wait_for > 0:
|
||||||
|
logger.info(
|
||||||
|
f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
|
||||||
|
)
|
||||||
await asyncio.sleep(wait_for)
|
await asyncio.sleep(wait_for)
|
||||||
|
logger.info(f"Rate limit wait completed for {url}")
|
||||||
self._last_request_time_per_domain[domain] = time.time()
|
self._last_request_time_per_domain[domain] = time.time()
|
||||||
|
|
||||||
async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
|
async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
|
||||||
|
|
@ -236,7 +203,16 @@ class BeautifulSoupCrawler:
|
||||||
crawl_delay = self.crawl_delay
|
crawl_delay = self.crawl_delay
|
||||||
if protego:
|
if protego:
|
||||||
delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
|
delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
|
||||||
crawl_delay = delay if delay else self.crawl_delay
|
if delay:
|
||||||
|
# Apply max_crawl_delay cap if configured
|
||||||
|
if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
|
||||||
|
logger.warning(
|
||||||
|
f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
|
||||||
|
f"capping to max_crawl_delay={self.max_crawl_delay}s"
|
||||||
|
)
|
||||||
|
crawl_delay = self.max_crawl_delay
|
||||||
|
else:
|
||||||
|
crawl_delay = delay
|
||||||
|
|
||||||
cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
|
cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
|
||||||
self._robots_cache[domain_root] = cache_entry
|
self._robots_cache[domain_root] = cache_entry
|
||||||
|
|
@ -307,12 +283,16 @@ class BeautifulSoupCrawler:
|
||||||
|
|
||||||
attempt = 0
|
attempt = 0
|
||||||
crawl_delay = await self._get_crawl_delay(url)
|
crawl_delay = await self._get_crawl_delay(url)
|
||||||
|
logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
await self._respect_rate_limit(url, crawl_delay)
|
await self._respect_rate_limit(url, crawl_delay)
|
||||||
resp = await self._client.get(url)
|
resp = await self._client.get(url)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
logger.info(
|
||||||
|
f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
|
||||||
|
)
|
||||||
return resp.text
|
return resp.text
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
attempt += 1
|
attempt += 1
|
||||||
|
|
@ -347,22 +327,35 @@ class BeautifulSoupCrawler:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
|
"Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
timeout_val = timeout or self.timeout
|
||||||
|
logger.info(
|
||||||
|
f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
|
||||||
|
)
|
||||||
|
|
||||||
attempt = 0
|
attempt = 0
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
|
logger.info(f"Launching headless Chromium browser for {url}")
|
||||||
browser = await p.chromium.launch(headless=True)
|
browser = await p.chromium.launch(headless=True)
|
||||||
try:
|
try:
|
||||||
context = await browser.new_context()
|
context = await browser.new_context()
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
logger.info(f"Navigating to {url} and waiting for network idle")
|
||||||
await page.goto(
|
await page.goto(
|
||||||
url,
|
url,
|
||||||
wait_until="networkidle",
|
wait_until="networkidle",
|
||||||
timeout=int((timeout or self.timeout) * 1000),
|
timeout=int(timeout_val * 1000),
|
||||||
)
|
)
|
||||||
if js_wait:
|
if js_wait:
|
||||||
|
logger.info(f"Waiting {js_wait}s for JavaScript to execute")
|
||||||
await asyncio.sleep(js_wait)
|
await asyncio.sleep(js_wait)
|
||||||
return await page.content()
|
content = await page.content()
|
||||||
|
logger.info(
|
||||||
|
f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
|
||||||
|
)
|
||||||
|
return content
|
||||||
finally:
|
finally:
|
||||||
await browser.close()
|
await browser.close()
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
|
@ -376,96 +369,13 @@ class BeautifulSoupCrawler:
|
||||||
)
|
)
|
||||||
await asyncio.sleep(backoff)
|
await asyncio.sleep(backoff)
|
||||||
|
|
||||||
def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
|
async def fetch_urls(
|
||||||
"""Normalize an extraction rule to an ExtractionRule dataclass.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
rule: A string (CSS selector) or dict with extraction parameters.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
ExtractionRule: Normalized extraction rule.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If the rule is invalid.
|
|
||||||
"""
|
|
||||||
if isinstance(rule, str):
|
|
||||||
return ExtractionRule(selector=rule)
|
|
||||||
if isinstance(rule, dict):
|
|
||||||
return ExtractionRule(
|
|
||||||
selector=rule.get("selector"),
|
|
||||||
xpath=rule.get("xpath"),
|
|
||||||
attr=rule.get("attr"),
|
|
||||||
all=bool(rule.get("all", False)),
|
|
||||||
join_with=rule.get("join_with", " "),
|
|
||||||
)
|
|
||||||
raise ValueError(f"Invalid extraction rule: {rule}")
|
|
||||||
|
|
||||||
def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str:
|
|
||||||
"""Extract content from HTML using BeautifulSoup or lxml XPath.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
html: The HTML content to extract from.
|
|
||||||
rule: The extraction rule to apply.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: The extracted content.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
RuntimeError: If XPath is used but lxml is not installed.
|
|
||||||
"""
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
|
||||||
|
|
||||||
if rule.xpath:
|
|
||||||
try:
|
|
||||||
from lxml import html as lxml_html
|
|
||||||
except ImportError:
|
|
||||||
raise RuntimeError(
|
|
||||||
"XPath requested but lxml is not available. Install lxml or use CSS selectors."
|
|
||||||
)
|
|
||||||
doc = lxml_html.fromstring(html)
|
|
||||||
nodes = doc.xpath(rule.xpath)
|
|
||||||
texts = []
|
|
||||||
for n in nodes:
|
|
||||||
if hasattr(n, "text_content"):
|
|
||||||
texts.append(n.text_content().strip())
|
|
||||||
else:
|
|
||||||
texts.append(str(n).strip())
|
|
||||||
return rule.join_with.join(t for t in texts if t)
|
|
||||||
|
|
||||||
if not rule.selector:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
if rule.all:
|
|
||||||
nodes = soup.select(rule.selector)
|
|
||||||
pieces = []
|
|
||||||
for el in nodes:
|
|
||||||
if rule.attr:
|
|
||||||
val = el.get(rule.attr)
|
|
||||||
if val:
|
|
||||||
pieces.append(val.strip())
|
|
||||||
else:
|
|
||||||
text = el.get_text(strip=True)
|
|
||||||
if text:
|
|
||||||
pieces.append(text)
|
|
||||||
return rule.join_with.join(pieces).strip()
|
|
||||||
else:
|
|
||||||
el = soup.select_one(rule.selector)
|
|
||||||
if el is None:
|
|
||||||
return ""
|
|
||||||
if rule.attr:
|
|
||||||
val = el.get(rule.attr)
|
|
||||||
return (val or "").strip()
|
|
||||||
return el.get_text(strip=True)
|
|
||||||
|
|
||||||
async def fetch_with_bs4(
|
|
||||||
self,
|
self,
|
||||||
urls: Union[str, List[str], Dict[str, Dict[str, Any]]],
|
urls: Union[str, List[str]],
|
||||||
extraction_rules: Optional[Dict[str, Any]] = None,
|
|
||||||
*,
|
*,
|
||||||
use_playwright: bool = False,
|
use_playwright: bool = False,
|
||||||
playwright_js_wait: float = 0.8,
|
playwright_js_wait: float = 0.8,
|
||||||
join_all_matches: bool = False,
|
) -> UrlsToHtmls:
|
||||||
) -> Dict[str, str]:
|
|
||||||
"""Fetch and extract content from URLs using BeautifulSoup or Playwright.
|
"""Fetch and extract content from URLs using BeautifulSoup or Playwright.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -482,65 +392,55 @@ class BeautifulSoupCrawler:
|
||||||
ValueError: If extraction_rules are missing when required or if urls is invalid.
|
ValueError: If extraction_rules are missing when required or if urls is invalid.
|
||||||
Exception: If fetching or extraction fails.
|
Exception: If fetching or extraction fails.
|
||||||
"""
|
"""
|
||||||
url_rules_map: Dict[str, Dict[str, Any]] = {}
|
|
||||||
|
|
||||||
if isinstance(urls, str):
|
if isinstance(urls, str):
|
||||||
if not extraction_rules:
|
urls = [urls]
|
||||||
raise ValueError("extraction_rules required when urls is a string")
|
|
||||||
url_rules_map[urls] = extraction_rules
|
|
||||||
elif isinstance(urls, list):
|
|
||||||
if not extraction_rules:
|
|
||||||
raise ValueError("extraction_rules required when urls is a list")
|
|
||||||
for url in urls:
|
|
||||||
url_rules_map[url] = extraction_rules
|
|
||||||
elif isinstance(urls, dict):
|
|
||||||
url_rules_map = urls
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid urls type: {type(urls)}")
|
raise ValueError(f"Invalid urls type: {type(urls)}")
|
||||||
|
|
||||||
normalized_url_rules: Dict[str, List[ExtractionRule]] = {}
|
|
||||||
for url, rules in url_rules_map.items():
|
|
||||||
normalized_rules = []
|
|
||||||
for _, rule in rules.items():
|
|
||||||
r = self._normalize_rule(rule)
|
|
||||||
if join_all_matches:
|
|
||||||
r.all = True
|
|
||||||
normalized_rules.append(r)
|
|
||||||
normalized_url_rules[url] = normalized_rules
|
|
||||||
|
|
||||||
async def _task(url: str):
|
async def _task(url: str):
|
||||||
async with self._sem:
|
async with self._sem:
|
||||||
try:
|
try:
|
||||||
|
logger.info(f"Processing URL: {url}")
|
||||||
|
|
||||||
|
# Check robots.txt
|
||||||
allowed = await self._is_url_allowed(url)
|
allowed = await self._is_url_allowed(url)
|
||||||
if not allowed:
|
if not allowed:
|
||||||
logger.warning(f"URL disallowed by robots.txt: {url}")
|
logger.warning(f"URL disallowed by robots.txt: {url}")
|
||||||
return url, ""
|
return url, ""
|
||||||
|
|
||||||
|
logger.info(f"Robots.txt check passed for {url}")
|
||||||
|
|
||||||
|
# Fetch HTML
|
||||||
if use_playwright:
|
if use_playwright:
|
||||||
|
logger.info(
|
||||||
|
f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
|
||||||
|
)
|
||||||
html = await self._render_with_playwright(
|
html = await self._render_with_playwright(
|
||||||
url, js_wait=playwright_js_wait, timeout=self.timeout
|
url, js_wait=playwright_js_wait, timeout=self.timeout
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
logger.info(f"Fetching {url} with httpx")
|
||||||
html = await self._fetch_httpx(url)
|
html = await self._fetch_httpx(url)
|
||||||
|
|
||||||
pieces = []
|
logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
|
||||||
for rule in normalized_url_rules[url]:
|
|
||||||
text = self._extract_with_bs4(html, rule)
|
|
||||||
if text:
|
|
||||||
pieces.append(text)
|
|
||||||
|
|
||||||
concatenated = " ".join(pieces).strip()
|
return url, html
|
||||||
return url, concatenated
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing {url}: {e}")
|
logger.error(f"Error processing {url}: {e}")
|
||||||
return url, ""
|
return url, ""
|
||||||
|
|
||||||
tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()]
|
logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
|
||||||
|
tasks = [asyncio.create_task(_task(u)) for u in urls]
|
||||||
results = {}
|
results = {}
|
||||||
|
completed = 0
|
||||||
|
total = len(tasks)
|
||||||
|
|
||||||
for coro in asyncio.as_completed(tasks):
|
for coro in asyncio.as_completed(tasks):
|
||||||
url, text = await coro
|
url, html = await coro
|
||||||
results[url] = text
|
results[url] = html
|
||||||
|
completed += 1
|
||||||
|
logger.info(f"Progress: {completed}/{total} URLs processed")
|
||||||
|
|
||||||
|
logger.info(f"Completed fetching all {len(results)} URL(s)")
|
||||||
return results
|
return results
|
||||||
4
cognee/tasks/web_scraper/types.py
Normal file
4
cognee/tasks/web_scraper/types.py
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
from typing import TypeAlias
|
||||||
|
|
||||||
|
|
||||||
|
UrlsToHtmls: TypeAlias = dict[str, str]
|
||||||
|
|
@ -4,21 +4,17 @@ This module provides functions to fetch and extract content from web pages, supp
|
||||||
both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
|
both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Dict, List, Union, Optional, Literal
|
import os
|
||||||
|
from typing import List, Union
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
from .bs4_crawler import BeautifulSoupCrawler
|
from cognee.tasks.web_scraper.types import UrlsToHtmls
|
||||||
from .config import TavilyConfig, SoupCrawlerConfig
|
from .default_url_crawler import DefaultUrlCrawler
|
||||||
|
from .config import DefaultCrawlerConfig, TavilyConfig
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
async def fetch_page_content(
|
async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
|
||||||
urls: Union[str, List[str]],
|
|
||||||
*,
|
|
||||||
preferred_tool: Optional[Literal["tavily", "beautifulsoup"]] = "beautifulsoup",
|
|
||||||
tavily_config: Optional[TavilyConfig] = None,
|
|
||||||
soup_crawler_config: Optional[SoupCrawlerConfig] = None,
|
|
||||||
) -> Dict[str, str]:
|
|
||||||
"""Fetch content from one or more URLs using the specified tool.
|
"""Fetch content from one or more URLs using the specified tool.
|
||||||
|
|
||||||
This function retrieves web page content using either BeautifulSoup (with custom
|
This function retrieves web page content using either BeautifulSoup (with custom
|
||||||
|
|
@ -31,7 +27,7 @@ async def fetch_page_content(
|
||||||
Defaults to "beautifulsoup".
|
Defaults to "beautifulsoup".
|
||||||
tavily_config: Configuration for Tavily API, including API key.
|
tavily_config: Configuration for Tavily API, including API key.
|
||||||
Required if preferred_tool is "tavily".
|
Required if preferred_tool is "tavily".
|
||||||
soup_crawler_config: Configuration for BeautifulSoup crawler, including
|
default_crawler_config: Configuration for BeautifulSoup crawler, including
|
||||||
extraction rules. Required if preferred_tool is "beautifulsoup" and
|
extraction rules. Required if preferred_tool is "beautifulsoup" and
|
||||||
extraction_rules are needed.
|
extraction_rules are needed.
|
||||||
|
|
||||||
|
|
@ -45,50 +41,52 @@ async def fetch_page_content(
|
||||||
ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not
|
ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not
|
||||||
installed.
|
installed.
|
||||||
"""
|
"""
|
||||||
if preferred_tool == "tavily":
|
url_list = [urls] if isinstance(urls, str) else urls
|
||||||
if not tavily_config or tavily_config.api_key is None:
|
|
||||||
raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily")
|
|
||||||
return await fetch_with_tavily(urls, tavily_config)
|
|
||||||
|
|
||||||
if preferred_tool == "beautifulsoup":
|
if os.getenv("TAVILY_API_KEY"):
|
||||||
try:
|
logger.info("Using Tavily API for url fetching")
|
||||||
from bs4 import BeautifulSoup as _ # noqa: F401
|
return await fetch_with_tavily(urls)
|
||||||
except ImportError:
|
else:
|
||||||
logger.error(
|
logger.info("Using default crawler for content extraction")
|
||||||
"Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1"
|
|
||||||
)
|
default_crawler_config = (
|
||||||
raise ImportError
|
DefaultCrawlerConfig()
|
||||||
if not soup_crawler_config or soup_crawler_config.extraction_rules is None:
|
) # We've decided to use defaults, and configure through env vars as needed
|
||||||
raise ValueError("extraction_rules must be provided when not using Tavily")
|
|
||||||
extraction_rules = soup_crawler_config.extraction_rules
|
logger.info(
|
||||||
crawler = BeautifulSoupCrawler(
|
f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s"
|
||||||
concurrency=soup_crawler_config.concurrency,
|
)
|
||||||
crawl_delay=soup_crawler_config.crawl_delay,
|
|
||||||
timeout=soup_crawler_config.timeout,
|
crawler = DefaultUrlCrawler(
|
||||||
max_retries=soup_crawler_config.max_retries,
|
concurrency=default_crawler_config.concurrency,
|
||||||
retry_delay_factor=soup_crawler_config.retry_delay_factor,
|
crawl_delay=default_crawler_config.crawl_delay,
|
||||||
headers=soup_crawler_config.headers,
|
max_crawl_delay=default_crawler_config.max_crawl_delay,
|
||||||
robots_cache_ttl=soup_crawler_config.robots_cache_ttl,
|
timeout=default_crawler_config.timeout,
|
||||||
|
max_retries=default_crawler_config.max_retries,
|
||||||
|
retry_delay_factor=default_crawler_config.retry_delay_factor,
|
||||||
|
headers=default_crawler_config.headers,
|
||||||
|
robots_cache_ttl=default_crawler_config.robots_cache_ttl,
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
results = await crawler.fetch_with_bs4(
|
logger.info(
|
||||||
urls,
|
f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})"
|
||||||
extraction_rules,
|
|
||||||
use_playwright=soup_crawler_config.use_playwright,
|
|
||||||
playwright_js_wait=soup_crawler_config.playwright_js_wait,
|
|
||||||
join_all_matches=soup_crawler_config.join_all_matches,
|
|
||||||
)
|
)
|
||||||
|
results = await crawler.fetch_urls(
|
||||||
|
urls,
|
||||||
|
use_playwright=default_crawler_config.use_playwright,
|
||||||
|
playwright_js_wait=default_crawler_config.playwright_js_wait,
|
||||||
|
)
|
||||||
|
logger.info(f"Successfully fetched content from {len(results)} URL(s)")
|
||||||
return results
|
return results
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error fetching page content: {str(e)}")
|
logger.error(f"Error fetching page content: {str(e)}")
|
||||||
raise
|
raise
|
||||||
finally:
|
finally:
|
||||||
|
logger.info("Closing BeautifulSoup crawler")
|
||||||
await crawler.close()
|
await crawler.close()
|
||||||
|
|
||||||
|
|
||||||
async def fetch_with_tavily(
|
async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
|
||||||
urls: Union[str, List[str]], tavily_config: Optional[TavilyConfig] = None
|
|
||||||
) -> Dict[str, str]:
|
|
||||||
"""Fetch content from URLs using the Tavily API.
|
"""Fetch content from URLs using the Tavily API.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -108,19 +106,37 @@ async def fetch_with_tavily(
|
||||||
"Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
|
"Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
client = AsyncTavilyClient(
|
|
||||||
api_key=tavily_config.api_key if tavily_config else None,
|
tavily_config = TavilyConfig()
|
||||||
proxies=tavily_config.proxies if tavily_config else None,
|
url_list = [urls] if isinstance(urls, str) else urls
|
||||||
|
extract_depth = tavily_config.extract_depth if tavily_config else "basic"
|
||||||
|
timeout = tavily_config.timeout if tavily_config else 10
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s"
|
||||||
)
|
)
|
||||||
|
client = AsyncTavilyClient(
|
||||||
|
api_key=tavily_config.api_key,
|
||||||
|
proxies=tavily_config.proxies,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)")
|
||||||
results = await client.extract(
|
results = await client.extract(
|
||||||
urls,
|
urls,
|
||||||
format="text",
|
format="text",
|
||||||
extract_depth=tavily_config.extract_depth if tavily_config else "basic",
|
extract_depth=extract_depth,
|
||||||
timeout=tavily_config.timeout if tavily_config else 10,
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
for failed_result in results.get("failed_results", []):
|
|
||||||
logger.warning(f"Failed to fetch {failed_result}")
|
failed_count = len(results.get("failed_results", []))
|
||||||
|
if failed_count > 0:
|
||||||
|
logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)")
|
||||||
|
for failed_result in results.get("failed_results", []):
|
||||||
|
logger.warning(f"Failed to fetch {failed_result}")
|
||||||
|
|
||||||
return_results = {}
|
return_results = {}
|
||||||
for result in results.get("results", []):
|
for result in results.get("results", []):
|
||||||
return_results[result["url"]] = result["raw_content"]
|
return_results[result["url"]] = result["raw_content"]
|
||||||
|
|
||||||
|
logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily")
|
||||||
return return_results
|
return return_results
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ from cognee.tasks.storage.index_graph_edges import index_graph_edges
|
||||||
from cognee.modules.engine.operations.setup import setup
|
from cognee.modules.engine.operations.setup import setup
|
||||||
|
|
||||||
from .models import WebPage, WebSite, ScrapingJob
|
from .models import WebPage, WebSite, ScrapingJob
|
||||||
from .config import SoupCrawlerConfig, TavilyConfig
|
from .config import DefaultCrawlerConfig, TavilyConfig
|
||||||
from .utils import fetch_page_content
|
from .utils import fetch_page_content
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -47,7 +47,7 @@ async def cron_web_scraper_task(
|
||||||
schedule: str = None,
|
schedule: str = None,
|
||||||
extraction_rules: dict = None,
|
extraction_rules: dict = None,
|
||||||
tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
|
tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
|
||||||
soup_crawler_config: SoupCrawlerConfig = None,
|
soup_crawler_config: DefaultCrawlerConfig = None,
|
||||||
tavily_config: TavilyConfig = None,
|
tavily_config: TavilyConfig = None,
|
||||||
job_name: str = "scraping",
|
job_name: str = "scraping",
|
||||||
):
|
):
|
||||||
|
|
@ -121,7 +121,7 @@ async def web_scraper_task(
|
||||||
schedule: str = None,
|
schedule: str = None,
|
||||||
extraction_rules: dict = None,
|
extraction_rules: dict = None,
|
||||||
tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
|
tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
|
||||||
soup_crawler_config: SoupCrawlerConfig = None,
|
soup_crawler_config: DefaultCrawlerConfig = None,
|
||||||
tavily_config: TavilyConfig = None,
|
tavily_config: TavilyConfig = None,
|
||||||
job_name: str = None,
|
job_name: str = None,
|
||||||
):
|
):
|
||||||
|
|
@ -341,7 +341,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
|
||||||
soup_crawler_config: Configuration for BeautifulSoup crawler.
|
soup_crawler_config: Configuration for BeautifulSoup crawler.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[SoupCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
|
Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
|
||||||
tavily_config, and preferred_tool ("tavily" or "beautifulsoup").
|
tavily_config, and preferred_tool ("tavily" or "beautifulsoup").
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
|
|
@ -350,7 +350,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle
|
||||||
preferred_tool = "beautifulsoup"
|
preferred_tool = "beautifulsoup"
|
||||||
|
|
||||||
if extraction_rules and not soup_crawler_config:
|
if extraction_rules and not soup_crawler_config:
|
||||||
soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
|
soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules)
|
||||||
|
|
||||||
if tavily_api_key:
|
if tavily_api_key:
|
||||||
if not tavily_config:
|
if not tavily_config:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
import pytest
|
||||||
|
from cognee.tasks.web_scraper import DefaultUrlCrawler
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_fetch():
|
||||||
|
crawler = DefaultUrlCrawler()
|
||||||
|
url = "https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
results = await crawler.fetch_urls(url)
|
||||||
|
assert len(results) == 1
|
||||||
|
assert isinstance(results, dict)
|
||||||
|
html = results[url]
|
||||||
|
assert isinstance(html, str)
|
||||||
|
|
@ -0,0 +1,19 @@
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from cognee.tasks.web_scraper.utils import fetch_with_tavily
|
||||||
|
|
||||||
|
skip_in_ci = pytest.mark.skipif(
|
||||||
|
os.getenv("GITHUB_ACTIONS") == "true",
|
||||||
|
reason="Skipping in Github for now - before we get TAVILY_API_KEY",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_in_ci
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_fetch():
|
||||||
|
url = "https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
results = await fetch_with_tavily(url)
|
||||||
|
assert isinstance(results, dict)
|
||||||
|
assert len(results) == 1
|
||||||
|
html = results[url]
|
||||||
|
assert isinstance(html, str)
|
||||||
310
cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
Normal file
310
cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
Normal file
|
|
@ -0,0 +1,310 @@
|
||||||
|
import pytest
|
||||||
|
import cognee
|
||||||
|
from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
|
||||||
|
from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
|
||||||
|
from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
|
||||||
|
from cognee.tasks.ingestion import save_data_item_to_storage
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_url_saves_as_html_file():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
original_file_path = await save_data_item_to_storage(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
)
|
||||||
|
file_path = get_data_file_path(original_file_path)
|
||||||
|
assert file_path.endswith(".html")
|
||||||
|
file = Path(file_path)
|
||||||
|
assert file.exists()
|
||||||
|
assert file.stat().st_size > 0
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to save data item to storage: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_saved_html_is_valid():
|
||||||
|
try:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
except ImportError:
|
||||||
|
pytest.fail("Test case requires bs4 installed")
|
||||||
|
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
original_file_path = await save_data_item_to_storage(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
)
|
||||||
|
file_path = get_data_file_path(original_file_path)
|
||||||
|
content = Path(file_path).read_text()
|
||||||
|
|
||||||
|
soup = BeautifulSoup(content, "html.parser")
|
||||||
|
assert soup.find() is not None, "File should contain parseable HTML"
|
||||||
|
|
||||||
|
has_html_elements = any(
|
||||||
|
[
|
||||||
|
soup.find("html"),
|
||||||
|
soup.find("head"),
|
||||||
|
soup.find("body"),
|
||||||
|
soup.find("div"),
|
||||||
|
soup.find("p"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
assert has_html_elements, "File should contain common HTML elements"
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to save data item to storage: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_add_url():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_add_url_without_incremental_loading():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await cognee.add(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model",
|
||||||
|
incremental_loading=False,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to add url: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_add_url_with_incremental_loading():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await cognee.add(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model",
|
||||||
|
incremental_loading=True,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to add url: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_add_url_with_extraction_rules(): # TODO: this'll fail due to not implemented `load()` yet
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
extraction_rules = {
|
||||||
|
"title": {"selector": "title"},
|
||||||
|
"headings": {"selector": "h1, h2, h3", "all": True},
|
||||||
|
"links": {"selector": "a", "attr": "href", "all": True},
|
||||||
|
"paragraphs": {"selector": "p", "all": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
await cognee.add(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model",
|
||||||
|
preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to add url: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_loader_is_none_by_default():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
extraction_rules = {
|
||||||
|
"title": {"selector": "title"},
|
||||||
|
"headings": {"selector": "h1, h2, h3", "all": True},
|
||||||
|
"links": {"selector": "a", "attr": "href", "all": True},
|
||||||
|
"paragraphs": {"selector": "p", "all": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
original_file_path = await save_data_item_to_storage(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
)
|
||||||
|
file_path = get_data_file_path(original_file_path)
|
||||||
|
assert file_path.endswith(".html")
|
||||||
|
file = Path(file_path)
|
||||||
|
assert file.exists()
|
||||||
|
assert file.stat().st_size > 0
|
||||||
|
|
||||||
|
loader_engine = LoaderEngine()
|
||||||
|
preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
|
||||||
|
loader = loader_engine.get_loader(
|
||||||
|
file_path,
|
||||||
|
preferred_loaders=preferred_loaders,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert loader is None
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to save data item to storage: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
extraction_rules = {
|
||||||
|
"title": {"selector": "title"},
|
||||||
|
"headings": {"selector": "h1, h2, h3", "all": True},
|
||||||
|
"links": {"selector": "a", "attr": "href", "all": True},
|
||||||
|
"paragraphs": {"selector": "p", "all": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
original_file_path = await save_data_item_to_storage(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
)
|
||||||
|
file_path = get_data_file_path(original_file_path)
|
||||||
|
assert file_path.endswith(".html")
|
||||||
|
file = Path(file_path)
|
||||||
|
assert file.exists()
|
||||||
|
assert file.stat().st_size > 0
|
||||||
|
|
||||||
|
loader_engine = LoaderEngine()
|
||||||
|
bs_loader = BeautifulSoupLoader()
|
||||||
|
loader_engine.register_loader(bs_loader)
|
||||||
|
preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
|
||||||
|
loader = loader_engine.get_loader(
|
||||||
|
file_path,
|
||||||
|
preferred_loaders=preferred_loaders,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert loader == bs_loader
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to save data item to storage: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
original_file_path = await save_data_item_to_storage(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
)
|
||||||
|
file_path = get_data_file_path(original_file_path)
|
||||||
|
assert file_path.endswith(".html")
|
||||||
|
file = Path(file_path)
|
||||||
|
assert file.exists()
|
||||||
|
assert file.stat().st_size > 0
|
||||||
|
|
||||||
|
loader_engine = LoaderEngine()
|
||||||
|
bs_loader = BeautifulSoupLoader()
|
||||||
|
loader_engine.register_loader(bs_loader)
|
||||||
|
preferred_loaders = {"beautiful_soup_loader": {}}
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
await loader_engine.load_file(
|
||||||
|
file_path,
|
||||||
|
preferred_loaders=preferred_loaders,
|
||||||
|
)
|
||||||
|
extraction_rules = {
|
||||||
|
"title": {"selector": "title"},
|
||||||
|
"headings": {"selector": "h1, h2, h3", "all": True},
|
||||||
|
"links": {"selector": "a", "attr": "href", "all": True},
|
||||||
|
"paragraphs": {"selector": "p", "all": True},
|
||||||
|
}
|
||||||
|
preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
|
||||||
|
await loader_engine.load_file(
|
||||||
|
file_path,
|
||||||
|
preferred_loaders=preferred_loaders,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to save data item to storage: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
original_file_path = await save_data_item_to_storage(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
)
|
||||||
|
file_path = get_data_file_path(original_file_path)
|
||||||
|
assert file_path.endswith(".html")
|
||||||
|
file = Path(file_path)
|
||||||
|
assert file.exists()
|
||||||
|
assert file.stat().st_size > 0
|
||||||
|
|
||||||
|
loader_engine = LoaderEngine()
|
||||||
|
bs_loader = BeautifulSoupLoader()
|
||||||
|
loader_engine.register_loader(bs_loader)
|
||||||
|
extraction_rules = {
|
||||||
|
"title": {"selector": "title"},
|
||||||
|
"headings": {"selector": "h1, h2, h3", "all": True},
|
||||||
|
"links": {"selector": "a", "attr": "href", "all": True},
|
||||||
|
"paragraphs": {"selector": "p", "all": True},
|
||||||
|
}
|
||||||
|
preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
|
||||||
|
await loader_engine.load_file(
|
||||||
|
file_path,
|
||||||
|
preferred_loaders=preferred_loaders,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to save data item to storage: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_beautiful_soup_loads_file_successfully():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
extraction_rules = {
|
||||||
|
"title": {"selector": "title"},
|
||||||
|
"headings": {"selector": "h1, h2, h3", "all": True},
|
||||||
|
"links": {"selector": "a", "attr": "href", "all": True},
|
||||||
|
"paragraphs": {"selector": "p", "all": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
original_file_path = await save_data_item_to_storage(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model"
|
||||||
|
)
|
||||||
|
file_path = get_data_file_path(original_file_path)
|
||||||
|
assert file_path.endswith(".html")
|
||||||
|
original_file = Path(file_path)
|
||||||
|
assert original_file.exists()
|
||||||
|
assert original_file.stat().st_size > 0
|
||||||
|
|
||||||
|
loader_engine = LoaderEngine()
|
||||||
|
bs_loader = BeautifulSoupLoader()
|
||||||
|
loader_engine.register_loader(bs_loader)
|
||||||
|
preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
|
||||||
|
loader = loader_engine.get_loader(
|
||||||
|
file_path,
|
||||||
|
preferred_loaders=preferred_loaders,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert loader == bs_loader
|
||||||
|
|
||||||
|
cognee_loaded_txt_path = await loader_engine.load_file(
|
||||||
|
file_path=file_path, preferred_loaders=preferred_loaders
|
||||||
|
)
|
||||||
|
|
||||||
|
cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
|
||||||
|
|
||||||
|
assert cognee_loaded_txt_path.endswith(".txt")
|
||||||
|
|
||||||
|
extracted_file = Path(cognee_loaded_txt_path)
|
||||||
|
|
||||||
|
assert extracted_file.exists()
|
||||||
|
assert extracted_file.stat().st_size > 0
|
||||||
|
|
||||||
|
original_basename = original_file.stem
|
||||||
|
extracted_basename = extracted_file.stem
|
||||||
|
assert original_basename == extracted_basename, (
|
||||||
|
f"Expected same base name: {original_basename} vs {extracted_basename}"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Failed to save data item to storage: {e}")
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import cognee
|
import cognee
|
||||||
from cognee.tasks.web_scraper.config import SoupCrawlerConfig
|
from cognee.tasks.web_scraper.config import DefaultCrawlerConfig
|
||||||
from cognee.tasks.web_scraper import cron_web_scraper_task
|
from cognee.tasks.web_scraper import cron_web_scraper_task
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -14,7 +14,7 @@ async def test_web_scraping_using_bs4():
|
||||||
"authors": {"selector": ".quote small", "all": True},
|
"authors": {"selector": ".quote small", "all": True},
|
||||||
}
|
}
|
||||||
|
|
||||||
soup_config = SoupCrawlerConfig(
|
soup_config = DefaultCrawlerConfig(
|
||||||
concurrency=5,
|
concurrency=5,
|
||||||
crawl_delay=0.5,
|
crawl_delay=0.5,
|
||||||
timeout=15.0,
|
timeout=15.0,
|
||||||
|
|
@ -47,7 +47,7 @@ async def test_web_scraping_using_bs4_and_incremental_loading():
|
||||||
url = "https://books.toscrape.com/"
|
url = "https://books.toscrape.com/"
|
||||||
rules = {"titles": "article.product_pod h3 a", "prices": "article.product_pod p.price_color"}
|
rules = {"titles": "article.product_pod h3 a", "prices": "article.product_pod p.price_color"}
|
||||||
|
|
||||||
soup_config = SoupCrawlerConfig(
|
soup_config = DefaultCrawlerConfig(
|
||||||
concurrency=1,
|
concurrency=1,
|
||||||
crawl_delay=0.1,
|
crawl_delay=0.1,
|
||||||
timeout=10.0,
|
timeout=10.0,
|
||||||
|
|
|
||||||
37
examples/python/web_url_fetcher_example.py
Normal file
37
examples/python/web_url_fetcher_example.py
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
import cognee
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
print("Data pruned.")
|
||||||
|
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
extraction_rules = {
|
||||||
|
"title": {"selector": "title"},
|
||||||
|
"headings": {"selector": "h1, h2, h3", "all": True},
|
||||||
|
"links": {
|
||||||
|
"selector": "a",
|
||||||
|
"attr": "href",
|
||||||
|
"all": True,
|
||||||
|
},
|
||||||
|
"paragraphs": {"selector": "p", "all": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
await cognee.add(
|
||||||
|
"https://en.wikipedia.org/wiki/Large_language_model",
|
||||||
|
incremental_loading=False,
|
||||||
|
preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
|
||||||
|
)
|
||||||
|
|
||||||
|
await cognee.cognify()
|
||||||
|
print("Knowledge graph created.")
|
||||||
|
|
||||||
|
await cognee.visualize_graph()
|
||||||
|
print("Data visualized")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
Loading…
Add table
Reference in a new issue