Remove web_url_loader, since the loader performs no logic after fetching
This commit is contained in:
parent
d7417d9b06
commit
fc660b46bb
5 changed files with 3 additions and 115 deletions
|
|
@ -27,10 +27,3 @@ try:
|
|||
__all__.append("AdvancedPdfLoader")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from .web_url_loader import WebUrlLoader
|
||||
|
||||
__all__.append("WebUrlLoader")
|
||||
except ImportError:
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -1,73 +0,0 @@
|
|||
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||
from typing import List
|
||||
|
||||
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
class WebUrlLoader(LoaderInterface):
    """
    Loader for data items addressed by an ``http://`` or ``https://`` URL.

    The loader performs no processing of its own: ``load`` hands back the
    path it was given.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """
        File extensions handled by this loader.

        Returns:
            An empty list; this loader is not selected by file extension.
        """
        # N/A: the value is only used in register and get_loader_info, so an
        # empty list does not affect functionality.
        return []

    @property
    def supported_mime_types(self) -> List[str]:
        """
        MIME types handled by this loader.

        Returns:
            An empty list; this loader is not selected by MIME type.
        """
        # N/A: the value is only used in register and get_loader_info, so an
        # empty list does not affect functionality.
        return []

    @property
    def loader_name(self) -> str:
        """
        Unique identifier of this loader.

        Returns:
            The name used for registration and configuration.
        """
        return "web_url_loader"

    def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
        """
        Decide whether this loader applies to the given data item.

        Only ``data_item_path`` is inspected; ``extension`` and ``mime_type``
        are accepted but not used by this method.

        Args:
            extension: File extension (unused here).
            mime_type: MIME type of the file (unused here).
            data_item_path: Location of the data item.

        Returns:
            True when ``data_item_path`` starts with "http://" or "https://",
            False otherwise.

        Raises:
            IngestionError: If ``data_item_path`` is None.
        """
        if data_item_path is not None:
            return data_item_path.startswith(("http://", "https://"))

        # TODO: data_item_path temporarily defaults to None so that other
        # loaders do not need updating yet, see TODO in LoaderEngine.py
        raise IngestionError("data_item_path should not be None")

    async def load(self, file_path: str, **kwargs):
        """
        Return the location of the content without modifying it.

        Args:
            file_path: Path to the file to be processed (already saved by
                the fetcher).
            **kwargs: Additional loader-specific configuration; ignored.

        Returns:
            ``file_path``, unchanged.
        """
        return file_path
|
||||
|
|
@ -76,16 +76,5 @@ async def data_item_to_text_file(
|
|||
)
|
||||
else:
|
||||
raise IngestionError(message="Local files are not accepted.")
|
||||
|
||||
elif data_item_path.startswith(("http://", "https://")):
|
||||
loader = get_loader_engine()
|
||||
return (
|
||||
await loader.load_file(
|
||||
data_item_path,
|
||||
preferred_loaders,
|
||||
loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal
|
||||
),
|
||||
loader.get_loader(data_item_path, preferred_loaders),
|
||||
)
|
||||
# data is not a supported type
|
||||
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
|
||||
|
|
|
|||
|
|
@ -1,20 +0,0 @@
|
|||
import pytest
|
||||
|
||||
from cognee.infrastructure.loaders import get_loader_engine
|
||||
from cognee.infrastructure.loaders.external.web_url_loader import WebUrlLoader
|
||||
|
||||
|
||||
def test_get_loader_returns_none_by_default_for_web_urls():
    """With no preferred loaders given, get_loader returns None for web URLs."""
    engine = get_loader_engine()

    # Cover both the secure and the plain-HTTP scheme.
    for web_url in ["https://cognee.ai", "http://cognee.ai"]:
        assert engine.get_loader(web_url) is None
|
||||
|
||||
|
||||
def test_get_loader_returns_valid_loader_when_preferred_loaders_specified():
    """Requesting "web_url_loader" explicitly yields a WebUrlLoader for web URLs."""
    engine = get_loader_engine()

    # Cover both the secure and the plain-HTTP scheme.
    for web_url in ["https://cognee.ai", "http://cognee.ai"]:
        selected = engine.get_loader(web_url, preferred_loaders=["web_url_loader"])
        assert isinstance(selected, WebUrlLoader)
|
||||
|
|
@ -20,7 +20,7 @@ async def main():
|
|||
"paragraphs": {"selector": "p", "all": True},
|
||||
}
|
||||
|
||||
loaders_config = {
|
||||
fetchers_config = {
|
||||
"web_url_loader": {
|
||||
"soup_config": {
|
||||
"max_depth": 1,
|
||||
|
|
@ -32,9 +32,8 @@ async def main():
|
|||
|
||||
await cognee.add(
|
||||
"https://en.wikipedia.org/wiki/Large_language_model",
|
||||
preferred_loaders=["web_url_loader"],
|
||||
incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix
|
||||
loaders_config=loaders_config,
|
||||
incremental_loading=False,
|
||||
fetchers_config=fetchers_config,
|
||||
)
|
||||
|
||||
await cognee.cognify()
|
||||
Loading…
Add table
Reference in a new issue