Extend LoaderInterface.can_handle() with data_item_path to support web_url_loader; implement WebUrlLoader.load()
This commit is contained in:
parent
305969c61b
commit
d884867d2c
5 changed files with 71 additions and 9 deletions
|
|
@ -64,7 +64,7 @@ class LoaderEngine:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def get_loader(
|
def get_loader(
|
||||||
self, file_path: str, preferred_loaders: List[str] = None
|
self, data_item_path: str, preferred_loaders: List[str] = None
|
||||||
) -> Optional[LoaderInterface]:
|
) -> Optional[LoaderInterface]:
|
||||||
"""
|
"""
|
||||||
Get appropriate loader for a file.
|
Get appropriate loader for a file.
|
||||||
|
|
@ -77,20 +77,26 @@ class LoaderEngine:
|
||||||
LoaderInterface that can handle the file, or None if not found
|
LoaderInterface that can handle the file, or None if not found
|
||||||
"""
|
"""
|
||||||
|
|
||||||
file_info = filetype.guess(file_path)
|
file_info = filetype.guess(data_item_path)
|
||||||
|
|
||||||
# Try preferred loaders first
|
# Try preferred loaders first
|
||||||
if preferred_loaders:
|
if preferred_loaders:
|
||||||
for loader_name in preferred_loaders:
|
for loader_name in preferred_loaders:
|
||||||
if loader_name in self._loaders:
|
if loader_name in self._loaders:
|
||||||
loader = self._loaders[loader_name]
|
loader = self._loaders[loader_name]
|
||||||
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
if loader.can_handle(
|
||||||
|
extension=file_info.extension,
|
||||||
|
mime_type=file_info.mime,
|
||||||
|
data_item_path=data_item_path,
|
||||||
|
): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time
|
||||||
return loader
|
return loader
|
||||||
else:
|
else:
|
||||||
logger.info(f"Skipping {loader_name}: Preferred Loader not registered")
|
logger.info(f"Skipping {loader_name}: Preferred Loader not registered")
|
||||||
|
|
||||||
# Try default priority order
|
# Try default priority order
|
||||||
for loader_name in self.default_loader_priority:
|
for loader_name in (
|
||||||
|
self.default_loader_priority
|
||||||
|
): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review
|
||||||
if loader_name in self._loaders:
|
if loader_name in self._loaders:
|
||||||
loader = self._loaders[loader_name]
|
loader = self._loaders[loader_name]
|
||||||
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,9 @@ class LoaderInterface(ABC):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def can_handle(self, extension: str, mime_type: str) -> bool:
|
def can_handle(
|
||||||
|
self, extension: str, mime_type: str, data_item_path: str = None
|
||||||
|
) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
|
||||||
"""
|
"""
|
||||||
Check if this loader can handle the given file.
|
Check if this loader can handle the given file.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ These loaders are optional and only available if their dependencies are installe
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from .pypdf_loader import PyPdfLoader
|
from .pypdf_loader import PyPdfLoader
|
||||||
|
from .web_url_loader import WebUrlLoader
|
||||||
|
|
||||||
__all__ = ["PyPdfLoader"]
|
__all__ = ["PyPdfLoader"]
|
||||||
|
|
||||||
|
|
@ -27,3 +28,10 @@ try:
|
||||||
__all__.append("AdvancedPdfLoader")
|
__all__.append("AdvancedPdfLoader")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .web_url_loader import WebUrlLoader
|
||||||
|
|
||||||
|
__all__.append("WebUrlLoader")
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,11 @@
|
||||||
from cognee.infrastructure.loaders import LoaderInterface
|
from cognee.infrastructure.loaders import LoaderInterface
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
|
||||||
|
from cognee.modules.ingestion import save_data_to_file
|
||||||
|
|
||||||
class WebLoader(LoaderInterface):
|
|
||||||
|
class WebUrlLoader(LoaderInterface):
|
||||||
@property
|
@property
|
||||||
def supported_extensions(self) -> List[str]:
|
def supported_extensions(self) -> List[str]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -31,9 +34,9 @@ class WebLoader(LoaderInterface):
|
||||||
Returns:
|
Returns:
|
||||||
String identifier used for registration and configuration
|
String identifier used for registration and configuration
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
return "web_url_loader"
|
||||||
|
|
||||||
def can_handle(self, extension: str, mime_type: str) -> bool:
|
def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if this loader can handle the given file.
|
Check if this loader can handle the given file.
|
||||||
|
|
||||||
|
|
@ -44,7 +47,9 @@ class WebLoader(LoaderInterface):
|
||||||
Returns:
|
Returns:
|
||||||
True if this loader can process the file, False otherwise
|
True if this loader can process the file, False otherwise
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
if data_item_path is None:
|
||||||
|
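raise # TODO: a bare `raise` outside an `except` block has no active exception to re-raise and surfaces as RuntimeError; raise an explicit error here instead. data_item_path only defaults to None temporarily so that other loaders need not be updated yet, see TODO in LoaderEngine.py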
|
||||||
|
return data_item_path.startswith(("http://", "https://"))
|
||||||
|
|
||||||
async def load(self, file_path: str, **kwargs):
|
async def load(self, file_path: str, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|
@ -58,4 +63,38 @@ class WebLoader(LoaderInterface):
|
||||||
Raises:
|
Raises:
|
||||||
Exception: If file cannot be processed
|
Exception: If file cannot be processed
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
|
from cognee.context_global_variables import tavily_config, soup_crawler_config
|
||||||
|
from cognee.tasks.web_scraper import fetch_page_content
|
||||||
|
|
||||||
|
tavily = tavily_config.get()
|
||||||
|
soup_crawler = soup_crawler_config.get()
|
||||||
|
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
|
||||||
|
if preferred_tool == "tavily" and tavily is None:
|
||||||
|
raise IngestionError(
|
||||||
|
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
|
||||||
|
)
|
||||||
|
if preferred_tool == "beautifulsoup" and soup_crawler is None:
|
||||||
|
raise IngestionError(
|
||||||
|
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
|
||||||
|
)
|
||||||
|
|
||||||
|
data = await fetch_page_content(
|
||||||
|
file_path,
|
||||||
|
preferred_tool=preferred_tool,
|
||||||
|
tavily_config=tavily,
|
||||||
|
soup_crawler_config=soup_crawler,
|
||||||
|
)
|
||||||
|
content = ""
|
||||||
|
for key, value in data.items():
|
||||||
|
content += f"{key}:\n{value}\n\n"
|
||||||
|
await save_data_to_file(content)
|
||||||
|
|
||||||
|
return content
|
||||||
|
except IngestionError:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
raise IngestionError(
|
||||||
|
message=f"Error ingesting webpage results of url {file_path}: {str(e)}"
|
||||||
|
)
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
|
||||||
|
|
@ -23,3 +23,10 @@ try:
|
||||||
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
|
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from cognee.infrastructure.loaders.external import WebUrlLoader
|
||||||
|
|
||||||
|
supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue