extend LoaderInterface to support web_url_loader, implement load()
This commit is contained in:
parent
305969c61b
commit
d884867d2c
5 changed files with 71 additions and 9 deletions
|
|
@ -64,7 +64,7 @@ class LoaderEngine:
|
|||
return True
|
||||
|
||||
def get_loader(
|
||||
self, file_path: str, preferred_loaders: List[str] = None
|
||||
self, data_item_path: str, preferred_loaders: List[str] = None
|
||||
) -> Optional[LoaderInterface]:
|
||||
"""
|
||||
Get appropriate loader for a file.
|
||||
|
|
@ -77,20 +77,26 @@ class LoaderEngine:
|
|||
LoaderInterface that can handle the file, or None if not found
|
||||
"""
|
||||
|
||||
file_info = filetype.guess(file_path)
|
||||
file_info = filetype.guess(data_item_path)
|
||||
|
||||
# Try preferred loaders first
|
||||
if preferred_loaders:
|
||||
for loader_name in preferred_loaders:
|
||||
if loader_name in self._loaders:
|
||||
loader = self._loaders[loader_name]
|
||||
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
||||
if loader.can_handle(
|
||||
extension=file_info.extension,
|
||||
mime_type=file_info.mime,
|
||||
data_item_path=data_item_path,
|
||||
): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time
|
||||
return loader
|
||||
else:
|
||||
logger.info(f"Skipping {loader_name}: Preferred Loader not registered")
|
||||
|
||||
# Try default priority order
|
||||
for loader_name in self.default_loader_priority:
|
||||
for loader_name in (
|
||||
self.default_loader_priority
|
||||
): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review
|
||||
if loader_name in self._loaders:
|
||||
loader = self._loaders[loader_name]
|
||||
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
||||
|
|
|
|||
|
|
@ -44,7 +44,9 @@ class LoaderInterface(ABC):
|
|||
pass
|
||||
|
||||
@abstractmethod
|
||||
def can_handle(self, extension: str, mime_type: str) -> bool:
|
||||
def can_handle(
|
||||
self, extension: str, mime_type: str, data_item_path: str = None
|
||||
) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
|
||||
"""
|
||||
Check if this loader can handle the given file.
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ These loaders are optional and only available if their dependencies are installe
|
|||
"""
|
||||
|
||||
from .pypdf_loader import PyPdfLoader
|
||||
from .web_url_loader import WebUrlLoader
|
||||
|
||||
__all__ = ["PyPdfLoader"]
|
||||
|
||||
|
|
@ -27,3 +28,10 @@ try:
|
|||
__all__.append("AdvancedPdfLoader")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from .web_url_loader import WebUrlLoader
|
||||
|
||||
__all__.append("WebUrlLoader")
|
||||
except ImportError:
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -1,8 +1,11 @@
|
|||
from cognee.infrastructure.loaders import LoaderInterface
|
||||
from typing import List
|
||||
|
||||
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
|
||||
from cognee.modules.ingestion import save_data_to_file
|
||||
|
||||
class WebLoader(LoaderInterface):
|
||||
|
||||
class WebUrlLoader(LoaderInterface):
|
||||
@property
|
||||
def supported_extensions(self) -> List[str]:
|
||||
"""
|
||||
|
|
@ -31,9 +34,9 @@ class WebLoader(LoaderInterface):
|
|||
Returns:
|
||||
String identifier used for registration and configuration
|
||||
"""
|
||||
raise NotImplementedError
|
||||
return "web_url_loader"
|
||||
|
||||
def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
    """
    Check if this loader can handle the given data item.

    Only ``data_item_path`` is inspected; ``extension`` and ``mime_type``
    are accepted for interface compatibility and are not used here.

    Args:
        extension: File extension supplied by the caller (unused).
        mime_type: MIME type supplied by the caller (unused).
        data_item_path: Path or URL of the data item. Defaults to None
            because some callers do not pass it yet.

    Returns:
        True if data_item_path starts with "http://" or "https://",
        False otherwise, including when no path was supplied.
    """
    # TODO: data_item_path is optional only until every loader and caller
    # passes it; see the matching TODO in LoaderEngine.
    if data_item_path is None:
        # A bare `raise` with no active exception fails with RuntimeError.
        # Without a path this loader cannot claim the item, so decline it
        # and let the engine try the next loader.
        return False
    return data_item_path.startswith(("http://", "https://"))
|
||||
|
||||
async def load(self, file_path: str, **kwargs):
    """
    Fetch a web page and return its content as text.

    Args:
        file_path: URL of the page to fetch; passed to fetch_page_content.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The fetched data rendered as one string: for every key/value pair
        returned by fetch_page_content, the key followed by a colon, a
        newline, the value, and a blank line.

    Raises:
        IngestionError: If Tavily would be used but no TavilyConfig is set,
            or if importing, fetching or saving fails for any other reason.
    """
    try:
        # Imported inside the try so that an import failure is reported as
        # an IngestionError like every other failure of this method.
        from cognee.context_global_variables import tavily_config, soup_crawler_config
        from cognee.tasks.web_scraper import fetch_page_content

        tavily = tavily_config.get()
        soup_crawler = soup_crawler_config.get()

        # BeautifulSoup is chosen whenever a soup crawler config is present,
        # so only the Tavily fallback can end up without its config.
        preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
        if preferred_tool == "tavily" and tavily is None:
            raise IngestionError(
                message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
            )

        data = await fetch_page_content(
            file_path,
            preferred_tool=preferred_tool,
            tavily_config=tavily,
            soup_crawler_config=soup_crawler,
        )

        # One "key:\nvalue\n\n" section per fetched item, joined in one pass.
        content = "".join(f"{key}:\n{value}\n\n" for key, value in data.items())
        await save_data_to_file(content)

        return content
    except IngestionError:
        # Already the right type and message; do not re-wrap.
        raise
    except Exception as e:
        # Chain the original exception so its traceback is preserved.
        raise IngestionError(
            message=f"Error ingesting webpage results of url {file_path}: {str(e)}"
        ) from e
|
||||
|
|
|
|||
|
|
@ -23,3 +23,10 @@ try:
|
|||
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from cognee.infrastructure.loaders.external import WebUrlLoader
|
||||
|
||||
supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader
|
||||
except ImportError:
|
||||
pass
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue