from typing import List

from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
from cognee.modules.ingestion import save_data_to_file
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.shared.logging_utils import get_logger
from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig

logger = get_logger()

class WebUrlLoader(LoaderInterface):
    """Loader that ingests web pages from HTTP(S) URLs instead of local files."""

    @property
    def supported_extensions(self) -> List[str]:
        """
        List of file extensions this loader supports.

        Returns:
            List of extensions including the dot (e.g., ['.txt', '.md'])
        """
        # N/A for URLs. An empty list is safe here: this property is only used
        # by register() and get_loader_info() and has no effect on routing,
        # which is decided by can_handle() instead.
        return []

    @property
    def supported_mime_types(self) -> List[str]:
        """
        List of MIME types this loader supports.

        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
        """
        # N/A for URLs; see supported_extensions above.
        return []

    @property
    def loader_name(self) -> str:
        """
        Unique name identifier for this loader.

        Returns:
            String identifier used for registration and configuration
        """
        return "web_url_loader"

    def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
        """
        Check if this loader can handle the given item.

        Args:
            extension: File extension (unused; URLs are detected by scheme)
            mime_type: MIME type of the file (unused; URLs are detected by scheme)
            data_item_path: Path or URL of the data item being ingested

        Returns:
            True if this loader can process the item, False otherwise
        """
        # TODO: data_item_path temporarily defaults to None so the other loaders
        # don't have to be updated yet; see the TODO in LoaderEngine.py.
        if data_item_path is None:
            raise IngestionError(message="data_item_path should not be None")
        return data_item_path.startswith(("http://", "https://"))
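
    # Illustrative behavior (not part of the loader API): routing depends only
    # on the URL scheme, so extension and mime_type may be empty strings:
    #
    #   loader = WebUrlLoader()
    #   loader.can_handle("", "", data_item_path="https://example.com")   # True
    #   loader.can_handle(".txt", "text/plain", data_item_path="a.txt")   # False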

    async def load(self, file_path: str, **kwargs):
        """
        Fetch the web page(s) at the given URL, store the scraped content, and
        return a standardized result.

        Args:
            file_path: URL of the page to be processed
            **kwargs: Additional loader-specific configuration; must include a
                "loaders_config" dictionary with an entry for this loader

        Returns:
            File path to the stored file

        Raises:
            IngestionError: If the page cannot be fetched or processed
        """
        loaders_config = kwargs.get("loaders_config")
        if not isinstance(loaders_config, dict):
            raise IngestionError(message="loaders_config must be a valid dictionary")

        web_url_loader_config = loaders_config.get(self.loader_name)
        if not isinstance(web_url_loader_config, dict):
            raise IngestionError(
                message=f"{self.loader_name} configuration must be a valid dictionary"
            )
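
        # For reference, loaders_config is expected to look like (values elided):
        #   {"web_url_loader": {"tavily_config": {...}, "soup_config": {...}}}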

        try:
            # Imported lazily, only when a URL is actually loaded.
            from cognee.context_global_variables import tavily_config, soup_crawler_config
            from cognee.tasks.web_scraper import fetch_page_content

            tavily_dict = web_url_loader_config.get("tavily_config")
            _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None

            soup_dict = web_url_loader_config.get("soup_config")
            _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None

            # Set global configs for downstream access
            tavily_config.set(_tavily_config)
            soup_crawler_config.set(_soup_config)

            # Prefer the BeautifulSoup crawler when it is configured; otherwise
            # fall back to Tavily, which must then be configured itself.
            preferred_tool = "beautifulsoup" if _soup_config else "tavily"
            if preferred_tool == "tavily" and _tavily_config is None:
                raise IngestionError(
                    message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
                )
            if preferred_tool == "beautifulsoup" and _soup_config is None:
                raise IngestionError(
                    message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
                )

            logger.info(f"Starting web URL crawling for: {file_path}")
            logger.info(f"Using scraping tool: {preferred_tool}")

            data = await fetch_page_content(
                file_path,
                preferred_tool=preferred_tool,
                tavily_config=_tavily_config,
                soup_crawler_config=_soup_config,
            )

            logger.info(f"Successfully fetched content from {len(data)} URL(s)")
            logger.info("Processing and concatenating fetched content")

            content = ""
            for key, value in data.items():
                content += f"{key}:\n{value}\n\n"

            logger.info(f"Saving content to file (total size: {len(content)} characters)")
            stored_path = await save_data_to_file(content)
            logger.info(f"Successfully saved content to: {stored_path}")

            return stored_path
        except IngestionError:
            raise
        except Exception as e:
            raise IngestionError(
                message=f"Error ingesting webpage from URL {file_path}: {str(e)}"
            ) from e
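

# Minimal usage sketch (illustrative assumption, not part of this module). The
# exact fields accepted under "soup_config" / "tavily_config" are defined by
# SoupCrawlerConfig and TavilyConfig in cognee.tasks.web_scraper.config and are
# elided here:
#
#   import asyncio
#
#   async def main():
#       loader = WebUrlLoader()
#       stored_path = await loader.load(
#           "https://example.com",
#           loaders_config={
#               loader.loader_name: {
#                   "soup_config": {...},  # BeautifulSoup crawler settings
#                   # or "tavily_config": {...} to scrape via the Tavily API
#               }
#           },
#       )
#       print(stored_path)
#
#   asyncio.run(main())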