extend LoaderInterface to support web_url_loader, implement load()

Daulet Amirkhanov 2025-10-16 12:40:07 +01:00
parent 305969c61b
commit d884867d2c
5 changed files with 71 additions and 9 deletions


@@ -64,7 +64,7 @@ class LoaderEngine:
return True
def get_loader(
self, file_path: str, preferred_loaders: List[str] = None
self, data_item_path: str, preferred_loaders: List[str] = None
) -> Optional[LoaderInterface]:
"""
Get appropriate loader for a file.
@@ -77,20 +77,26 @@ class LoaderEngine:
LoaderInterface that can handle the file, or None if not found
"""
file_info = filetype.guess(file_path)
file_info = filetype.guess(data_item_path)
# Try preferred loaders first
if preferred_loaders:
for loader_name in preferred_loaders:
if loader_name in self._loaders:
loader = self._loaders[loader_name]
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
if loader.can_handle(
extension=file_info.extension,
mime_type=file_info.mime,
data_item_path=data_item_path,
): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time
return loader
else:
logger.info(f"Skipping {loader_name}: Preferred Loader not registered")
# Try default priority order
for loader_name in self.default_loader_priority:
for loader_name in (
self.default_loader_priority
): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review
if loader_name in self._loaders:
loader = self._loaders[loader_name]
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
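For reference, a minimal sketch of what the extended dispatch looks like from a loader's point of view. Only the keyword names come from this diff; the metadata values and the direct instantiation of WebUrlLoader are illustrative:

from cognee.infrastructure.loaders.external import WebUrlLoader

loader = WebUrlLoader()

# The engine now forwards the original data item path alongside the
# filetype-derived metadata, so path-aware loaders can match on it.
matches = loader.can_handle(
    extension="html",        # illustrative value, as returned by filetype.guess
    mime_type="text/html",   # illustrative value, as returned by filetype.guess
    data_item_path="https://example.com/docs/intro",
)
# matches is True: WebUrlLoader matches purely on the http(s) prefix.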


@@ -44,7 +44,9 @@ class LoaderInterface(ABC):
pass
@abstractmethod
def can_handle(self, extension: str, mime_type: str) -> bool:
def can_handle(
self, extension: str, mime_type: str, data_item_path: str = None
) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
"""
Check if this loader can handle the given file.
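Because the new parameter defaults to None, loaders that do not care about the original path only need to accept the extra keyword; a hypothetical loader method, shown for illustration and not part of this commit:

def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
    # Path-agnostic loaders can ignore data_item_path and keep their existing checks.
    return extension in self.supported_extensions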


@@ -10,6 +10,7 @@ These loaders are optional and only available if their dependencies are installe
"""
from .pypdf_loader import PyPdfLoader
from .web_url_loader import WebUrlLoader
__all__ = ["PyPdfLoader"]
@@ -27,3 +28,10 @@ try:
__all__.append("AdvancedPdfLoader")
except ImportError:
pass
try:
from .web_url_loader import WebUrlLoader
__all__.append("WebUrlLoader")
except ImportError:
pass
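As with the other external loaders, the import is wrapped so the package still loads when the web-scraping extras are missing; a small consumer-side sketch of the same guarded-import pattern (the None fallback is illustrative):

try:
    from cognee.infrastructure.loaders.external import WebUrlLoader
except ImportError:
    WebUrlLoader = None  # optional web-scraping dependencies are not installed

url_loading_available = WebUrlLoader is not None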


@@ -1,8 +1,11 @@
from cognee.infrastructure.loaders import LoaderInterface
from typing import List
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.modules.ingestion import save_data_to_file
class WebLoader(LoaderInterface):
class WebUrlLoader(LoaderInterface):
@property
def supported_extensions(self) -> List[str]:
"""
@@ -31,9 +34,9 @@ class WebLoader(LoaderInterface):
Returns:
String identifier used for registration and configuration
"""
raise NotImplementedError
return "web_url_loader"
def can_handle(self, extension: str, mime_type: str) -> bool:
def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
"""
Check if this loader can handle the given file.
@@ -44,7 +47,9 @@ class WebLoader(LoaderInterface):
Returns:
True if this loader can process the file, False otherwise
"""
raise NotImplementedError
if data_item_path is None:
raise ValueError("WebUrlLoader.can_handle requires data_item_path") # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
return data_item_path.startswith(("http://", "https://"))
async def load(self, file_path: str, **kwargs):
"""
@@ -58,4 +63,38 @@ class WebLoader(LoaderInterface):
Raises:
Exception: If file cannot be processed
"""
try:
from cognee.context_global_variables import tavily_config, soup_crawler_config
from cognee.tasks.web_scraper import fetch_page_content
tavily = tavily_config.get()
soup_crawler = soup_crawler_config.get()
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
if preferred_tool == "tavily" and tavily is None:
raise IngestionError(
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
)
if preferred_tool == "beautifulsoup" and soup_crawler is None:
raise IngestionError(
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
)
data = await fetch_page_content(
file_path,
preferred_tool=preferred_tool,
tavily_config=tavily,
soup_crawler_config=soup_crawler,
)
content = ""
for key, value in data.items():
content += f"{key}:\n{value}\n\n"
await save_data_to_file(content)
return content
except IngestionError:
raise
except Exception as e:
raise IngestionError(
message=f"Error ingesting webpage results of url {file_path}: {str(e)}"
)
raise NotImplementedError
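A minimal sketch of driving the new load() path from an async context; the URL is illustrative, and a TavilyConfig or SoupCrawlerConfig must already be set on the ingestion context (otherwise load() raises IngestionError as shown above):

import asyncio
from cognee.infrastructure.loaders.external import WebUrlLoader

async def ingest_url(url: str) -> str:
    loader = WebUrlLoader()
    # Delegates to fetch_page_content and flattens the result into
    # "key:\nvalue\n\n" blocks before saving it via save_data_to_file.
    return await loader.load(url)

# asyncio.run(ingest_url("https://example.com/article"))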


@@ -23,3 +23,10 @@ try:
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
except ImportError:
pass
try:
from cognee.infrastructure.loaders.external import WebUrlLoader
supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader
except ImportError:
pass
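Once registered, the loader can be resolved from the supported_loaders mapping under the same key used at registration time; an illustrative lookup (the registry's module path is assumed here, only the registration itself appears in this diff):

from cognee.infrastructure.loaders.external import WebUrlLoader
from cognee.infrastructure.loaders.supported_loaders import supported_loaders  # assumed module path

# Look the class up by the same key it was registered under.
loader_cls = supported_loaders.get(WebUrlLoader.loader_name)
assert loader_cls is WebUrlLoader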