remove web_url_loader, since the loader contains no processing logic beyond fetching

This commit is contained in:
Daulet Amirkhanov 2025-10-20 21:50:10 +01:00
parent d7417d9b06
commit fc660b46bb
5 changed files with 3 additions and 115 deletions

View file

@ -27,10 +27,3 @@ try:
__all__.append("AdvancedPdfLoader")
except ImportError:
pass
try:
from .web_url_loader import WebUrlLoader
__all__.append("WebUrlLoader")
except ImportError:
pass

View file

@ -1,73 +0,0 @@
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
from typing import List
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
from cognee.shared.logging_utils import get_logger
logger = get_logger()
class WebUrlLoader(LoaderInterface):
    """Loader that accepts web URLs and passes the already-fetched file through.

    Fetching happens upstream; this loader performs no post-fetch processing,
    it simply hands back the path the fetcher stored the content at.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """
        File extensions handled by this loader.

        Returns:
            An empty list. Extension matching is not meaningful for URLs;
            this value only feeds register/get_loader_info and does not
            affect loader behavior.
        """
        return []  # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality

    @property
    def supported_mime_types(self) -> List[str]:
        """
        MIME types handled by this loader.

        Returns:
            An empty list. MIME matching is not meaningful for URLs;
            this value only feeds register/get_loader_info and does not
            affect loader behavior.
        """
        return []  # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality

    @property
    def loader_name(self) -> str:
        """
        Unique identifier used for registration and configuration lookup.

        Returns:
            The string "web_url_loader".
        """
        return "web_url_loader"

    def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
        """
        Decide whether this loader applies to the given data item.

        Args:
            extension: File extension (unused; URLs have no reliable extension).
            mime_type: MIME type (unused for the same reason).
            data_item_path: The raw data item path; must be provided.

        Returns:
            True when the path is an http(s) URL, False otherwise.

        Raises:
            IngestionError: If data_item_path was not supplied.
        """
        if data_item_path is None:
            # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py
            raise IngestionError("data_item_path should not be None")
        url_schemes = ("http://", "https://")
        return data_item_path.startswith(url_schemes)

    async def load(self, file_path: str, **kwargs):
        """
        Return the stored file path unchanged.

        The fetcher has already persisted the URL's content, so there is
        nothing left to process here.

        Args:
            file_path: Path to the file already saved by the fetcher.
            **kwargs: Additional loader-specific configuration (ignored).

        Returns:
            The same file path that was passed in.
        """
        return file_path

View file

@ -76,16 +76,5 @@ async def data_item_to_text_file(
)
else:
raise IngestionError(message="Local files are not accepted.")
elif data_item_path.startswith(("http://", "https://")):
loader = get_loader_engine()
return (
await loader.load_file(
data_item_path,
preferred_loaders,
loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal
),
loader.get_loader(data_item_path, preferred_loaders),
)
# data is not a supported type
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")

View file

@ -1,20 +0,0 @@
import pytest
from cognee.infrastructure.loaders import get_loader_engine
from cognee.infrastructure.loaders.external.web_url_loader import WebUrlLoader
def test_get_loader_returns_none_by_default_for_web_urls():
    """Without preferred_loaders, web URLs should resolve to no loader."""
    engine = get_loader_engine()
    for candidate_url in ("https://cognee.ai", "http://cognee.ai"):
        assert engine.get_loader(candidate_url) is None
def test_get_loader_returns_valid_loader_when_preferred_loaders_specified():
    """Explicitly preferring web_url_loader should yield a WebUrlLoader for web URLs."""
    engine = get_loader_engine()
    for candidate_url in ("https://cognee.ai", "http://cognee.ai"):
        resolved = engine.get_loader(candidate_url, preferred_loaders=["web_url_loader"])
        assert isinstance(resolved, WebUrlLoader)

View file

@ -20,7 +20,7 @@ async def main():
"paragraphs": {"selector": "p", "all": True},
}
loaders_config = {
fetchers_config = {
"web_url_loader": {
"soup_config": {
"max_depth": 1,
@ -32,9 +32,8 @@ async def main():
await cognee.add(
"https://en.wikipedia.org/wiki/Large_language_model",
preferred_loaders=["web_url_loader"],
incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix
loaders_config=loaders_config,
incremental_loading=False,
fetchers_config=fetchers_config,
)
await cognee.cognify()