ugly fix for the scraper
This commit is contained in:
parent 339e164635
commit e445dd7f8b
2 changed files with 30 additions and 21 deletions
@@ -12,11 +12,15 @@ from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
 )
 from cognee.modules.engine.operations.setup import setup
 from cognee.tasks.ingestion import ingest_data, resolve_data_directories
-from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
-from cognee.context_global_variables import (
-    tavily_config as tavily,
-    soup_crawler_config as soup_crawler,
-)
+try:
+    from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
+    from cognee.context_global_variables import (
+        tavily_config as tavily,
+        soup_crawler_config as soup_crawler,
+    )
+except:
+    pass
+from pydantic import BaseModel
 from urllib.parse import urlparse
 
 
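Note: this import guard is the "ugly" part of the fix. A bare `except: pass` keeps `add()` importable when the optional web-scraper stack is absent, but it also hides any other import-time error. A minimal sketch of a narrower guard (a hypothetical refactor, not part of this commit):

# Catch only the expected failure and record availability in a flag,
# instead of silently swallowing every exception.
try:
    from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
    from cognee.context_global_variables import (
        tavily_config as tavily,
        soup_crawler_config as soup_crawler,
    )

    WEB_SCRAPER_AVAILABLE = True
except ImportError:
    WEB_SCRAPER_AVAILABLE = False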
@@ -31,8 +35,8 @@ async def add(
     preferred_loaders: List[str] = None,
     incremental_loading: bool = True,
     extraction_rules: Optional[Dict[str, Any]] = None,
-    tavily_config: Optional[TavilyConfig] = None,
-    soup_crawler_config: Optional[SoupCrawlerConfig] = None,
+    tavily_config: Optional[BaseModel] = None,
+    soup_crawler_config: Optional[BaseModel] = None,
 ):
     """
     Add data to Cognee for knowledge graph processing.
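The parameter annotations are loosened from the scraper-specific config classes to `pydantic.BaseModel`, so the signature still evaluates even when the guarded imports above failed and `TavilyConfig`/`SoupCrawlerConfig` are undefined. A hypothetical call site, assuming the imports succeeded (the `extraction_rules` payload shape here is illustrative, not taken from this diff):

# Hypothetical usage; api_key mirrors the TavilyConfig(api_key=...) call that
# appears later in this diff, while the extraction_rules dict shape is assumed.
await add(
    "https://example.com/article",
    tavily_config=TavilyConfig(api_key=os.getenv("TAVILY_API_KEY")),
    soup_crawler_config=SoupCrawlerConfig(extraction_rules={"title": "h1"}),
)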
@@ -172,23 +176,27 @@ async def add(
     """
 
-    if not soup_crawler_config and extraction_rules:
-        soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
-    if not tavily_config and os.getenv("TAVILY_API_KEY"):
-        tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))
+    try:
+        if not soup_crawler_config and extraction_rules:
+            soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
+        if not tavily_config and os.getenv("TAVILY_API_KEY"):
+            tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))
 
-    soup_crawler.set(soup_crawler_config)
-    tavily.set(tavily_config)
+        soup_crawler.set(soup_crawler_config)
+        tavily.set(tavily_config)
 
-    http_schemes = {"http", "https"}
+        http_schemes = {"http", "https"}
 
-    def _is_http_url(item: Union[str, BinaryIO]) -> bool:
-        return isinstance(item, str) and urlparse(item).scheme in http_schemes
+        def _is_http_url(item: Union[str, BinaryIO]) -> bool:
+            return isinstance(item, str) and urlparse(item).scheme in http_schemes
 
-    if _is_http_url(data):
-        node_set = ["web_content"] if not node_set else node_set + ["web_content"]
-    elif isinstance(data, list) and any(_is_http_url(item) for item in data):
-        node_set = ["web_content"] if not node_set else node_set + ["web_content"]
+        if _is_http_url(data):
+            node_set = ["web_content"] if not node_set else node_set + ["web_content"]
+        elif isinstance(data, list) and any(_is_http_url(item) for item in data):
+            node_set = ["web_content"] if not node_set else node_set + ["web_content"]
+    except:
+        pass
+
 
     tasks = [
         Task(resolve_data_directories, include_subdirectories=True),
 
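The entire config-and-routing block is now wrapped in a single bare `try`/`except: pass`, so any failure inside it, including a genuine bug, is dropped without a log entry. For orientation: `tavily_config` and `soup_crawler_config` from `cognee.context_global_variables` are used through a `.set()`/`.get()` API, which suggests `contextvars.ContextVar` instances. A minimal standalone model of that handoff (an assumption; the real definitions are not in this diff):

from contextvars import ContextVar

# Producer/consumer handoff as assumed here: add() publishes the config,
# and a downstream ingestion task reads it back in the same context.
soup_crawler_config: ContextVar = ContextVar("soup_crawler_config", default=None)

soup_crawler_config.set({"extraction_rules": None})  # producer side: add()
current = soup_crawler_config.get()                  # consumer side: ingestion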
@@ -7,7 +7,7 @@ from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.modules.ingestion import save_data_to_file
 from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from cognee.context_global_variables import tavily_config, soup_crawler_config
+
 
 logger = get_logger()
 
@@ -59,6 +59,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
         elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
             # Validate URL by sending a HEAD request
             try:
+                from cognee.context_global_variables import tavily_config, soup_crawler_config
                 from cognee.tasks.web_scraper import fetch_page_content
 
                 tavily = tavily_config.get()
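Net effect in this second file: the module-level import of the context variables is removed and re-done lazily inside the existing `try` block, presumably to break an import cycle or to avoid a hard module-load dependency on the scraper stack. A sketch of the deferred-import pattern in isolation (the wrapper function is hypothetical; the imports and the `tavily_config.get()` call are from the diff):

async def _load_scraper_context():  # hypothetical wrapper for illustration
    # Importing inside the function means the module resolves only when this
    # code path runs, and a failure lands in the caller's existing try/except.
    from cognee.context_global_variables import tavily_config, soup_crawler_config

    tavily = tavily_config.get()  # config published earlier by add()
    soup_crawler = soup_crawler_config.get()
    return tavily, soup_crawler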