cognee/cognee/infrastructure/loaders/external/web_url_loader.py
Daulet Amirkhanov b5190c90f1 add logging for crawling status; add cap to the crawl_delay from robots.txt
- Not advising use of the cap, but providing an option so it can be configured
2025-10-21 22:46:49 +01:00


from typing import List

from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
from cognee.modules.ingestion import save_data_to_file
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.shared.logging_utils import get_logger
from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig

logger = get_logger()


class WebUrlLoader(LoaderInterface):
    """Loads web URLs by scraping page content with Tavily or BeautifulSoup."""

    @property
    def supported_extensions(self) -> List[str]:
        """
        List of file extensions this loader supports.

        Returns:
            List of extensions including the dot (e.g., ['.txt', '.md'])
        """
        # N/A: only used in register and get_loader_info, so an empty list
        # does not affect functionality.
        return []

    @property
    def supported_mime_types(self) -> List[str]:
        """
        List of MIME types this loader supports.

        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
        """
        # N/A: only used in register and get_loader_info, so an empty list
        # does not affect functionality.
        return []

    @property
    def loader_name(self) -> str:
        """
        Unique name identifier for this loader.

        Returns:
            String identifier used for registration and configuration
        """
        return "web_url_loader"

    def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
        """
        Check if this loader can handle the given file.

        Args:
            extension: File extension (unused for URL detection)
            mime_type: MIME type of the file (unused for URL detection)
            data_item_path: Original path or URL of the data item

        Returns:
            True if this loader can process the file, False otherwise
        """
        if data_item_path is None:
            # TODO: data_item_path temporarily defaults to None so the other
            # loaders don't need updating yet; see the TODO in LoaderEngine.py.
            raise IngestionError("data_item_path should not be None")
        return data_item_path.startswith(("http://", "https://"))
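
    # Note: extension and mime_type are ignored for URL routing; only the
    # scheme of data_item_path matters, e.g.
    #   can_handle("", "", data_item_path="https://example.com")       -> True
    #   can_handle(".txt", "text/plain", data_item_path="/tmp/a.txt")  -> False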

    async def load(self, file_path: str, **kwargs):
        """
        Load and process the file, returning standardized result.

        Args:
            file_path: URL of the page(s) to be fetched and processed
            **kwargs: Additional loader-specific configuration; must contain a
                loaders_config dictionary with an entry for this loader

        Returns:
            File path to the stored file

        Raises:
            IngestionError: If the URL cannot be processed
        """
        loaders_config = kwargs.get("loaders_config")
        if not isinstance(loaders_config, dict):
            raise IngestionError("loaders_config must be a valid dictionary")

        web_url_loader_config = loaders_config.get(self.loader_name)
        if not isinstance(web_url_loader_config, dict):
            raise IngestionError(f"{self.loader_name} configuration must be a valid dictionary")
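
        # Expected shape (a sketch -- the inner keys of each config follow
        # TavilyConfig / SoupCrawlerConfig, so the "..." fields are elided
        # rather than confirmed names):
        #
        # loaders_config = {
        #     "web_url_loader": {
        #         "tavily_config": {...},  # parsed into TavilyConfig
        #         "soup_config": {...},    # parsed into SoupCrawlerConfig
        #     }
        # }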

        try:
            from cognee.context_global_variables import tavily_config, soup_crawler_config
            from cognee.tasks.web_scraper import fetch_page_content

            tavily_dict = web_url_loader_config.get("tavily_config")
            _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None
            soup_dict = web_url_loader_config.get("soup_config")
            _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None

            # Set global configs for downstream access
            tavily_config.set(_tavily_config)
            soup_crawler_config.set(_soup_config)

            # Prefer BeautifulSoup when a soup config is provided; otherwise
            # fall back to Tavily, which then requires a Tavily config.
            preferred_tool = "beautifulsoup" if _soup_config else "tavily"
            if preferred_tool == "tavily" and _tavily_config is None:
                raise IngestionError(
                    message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
                )
            if preferred_tool == "beautifulsoup" and _soup_config is None:
                raise IngestionError(
                    message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
                )
logger.info(f"Starting web URL crawling for: {file_path}")
logger.info(f"Using scraping tool: {preferred_tool}")
data = await fetch_page_content(
file_path,
preferred_tool=preferred_tool,
tavily_config=_tavily_config,
soup_crawler_config=_soup_config,
)
logger.info(f"Successfully fetched content from {len(data)} URL(s)")
logger.info("Processing and concatenating fetched content")
content = ""
for key, value in data.items():
content += f"{key}:\n{value}\n\n"
logger.info(f"Saving content to file (total size: {len(content)} characters)")
stored_path = await save_data_to_file(content)
logger.info(f"Successfully saved content to: {stored_path}")
return stored_path
        except IngestionError:
            raise
        except Exception as e:
            raise IngestionError(
                message=f"Error ingesting webpage from URL {file_path}: {str(e)}"
            ) from e
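

# Minimal usage sketch (commented out; not part of the loader). Assumptions:
# the SoupCrawlerConfig fields are elided because they depend on the real
# dataclass, and a falsy/empty soup_config would make the loader fall back
# to Tavily instead.
#
# import asyncio
#
# async def main():
#     loader = WebUrlLoader()
#     if loader.can_handle("", "", data_item_path="https://example.com"):
#         stored_path = await loader.load(
#             "https://example.com",
#             loaders_config={"web_url_loader": {"soup_config": {...}}},
#         )
#         print(stored_path)
#
# asyncio.run(main())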