Remove web_url_loader, since the loader has no logic to run after fetching
This commit is contained in:
parent
d7417d9b06
commit
fc660b46bb
5 changed files with 3 additions and 115 deletions
|
|
@ -27,10 +27,3 @@ try:
|
||||||
__all__.append("AdvancedPdfLoader")
|
__all__.append("AdvancedPdfLoader")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
|
||||||
from .web_url_loader import WebUrlLoader
|
|
||||||
|
|
||||||
__all__.append("WebUrlLoader")
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
|
||||||
|
|
@ -1,73 +0,0 @@
|
||||||
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from cognee.modules.ingestion.exceptions.exceptions import IngestionError
|
|
||||||
from cognee.shared.logging_utils import get_logger
|
|
||||||
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
class WebUrlLoader(LoaderInterface):
    """
    Pass-through loader selected for ``http://`` / ``https://`` data items.

    The loader does no processing of its own: ``load`` returns the path it
    was given unchanged.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """
        List of file extensions this loader supports.

        Returns:
            An empty list; this loader is matched on the URL scheme in
            ``can_handle`` rather than on a file extension.
        """
        # N/A, we can safely return empty since it's used in register and
        # get_loader_info, doesn't reflect on functionality
        return []

    @property
    def supported_mime_types(self) -> List[str]:
        """
        List of MIME types this loader supports.

        Returns:
            An empty list; this loader is matched on the URL scheme in
            ``can_handle`` rather than on a MIME type.
        """
        # N/A, we can safely return empty since it's used in register and
        # get_loader_info, doesn't reflect on functionality
        return []

    @property
    def loader_name(self) -> str:
        """
        Unique name identifier for this loader.

        Returns:
            String identifier used for registration and configuration
        """
        return "web_url_loader"

    def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool:
        """
        Check if this loader can handle the given data item.

        Args:
            extension: File extension (not consulted by this loader)
            mime_type: MIME type of the file (not consulted by this loader)
            data_item_path: Path or URL of the data item; must not be None

        Returns:
            True if data_item_path starts with "http://" or "https://",
            False otherwise

        Raises:
            IngestionError: If data_item_path is None
        """
        if data_item_path is None:
            # TODO: Temporarily set this to default to None so that I don't
            # update other loaders unnecessarily yet, see TODO in LoaderEngine.py
            raise IngestionError("data_item_path should not be None")
        return data_item_path.startswith(("http://", "https://"))

    async def load(self, file_path: str, **kwargs):
        """
        Return the given file path unchanged.

        Args:
            file_path: Path to the file to be processed (already saved by fetcher)
            **kwargs: Additional loader-specific configuration (currently ignored)

        Returns:
            The same file_path that was passed in
        """
        return file_path
|
|
||||||
|
|
@ -76,16 +76,5 @@ async def data_item_to_text_file(
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise IngestionError(message="Local files are not accepted.")
|
raise IngestionError(message="Local files are not accepted.")
|
||||||
|
|
||||||
elif data_item_path.startswith(("http://", "https://")):
|
|
||||||
loader = get_loader_engine()
|
|
||||||
return (
|
|
||||||
await loader.load_file(
|
|
||||||
data_item_path,
|
|
||||||
preferred_loaders,
|
|
||||||
loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal
|
|
||||||
),
|
|
||||||
loader.get_loader(data_item_path, preferred_loaders),
|
|
||||||
)
|
|
||||||
# data is not a supported type
|
# data is not a supported type
|
||||||
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
|
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
|
||||||
|
|
|
||||||
|
|
@ -1,20 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from cognee.infrastructure.loaders import get_loader_engine
|
|
||||||
from cognee.infrastructure.loaders.external.web_url_loader import WebUrlLoader
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_loader_returns_none_by_default_for_web_urls():
    """Without preferred loaders, the engine resolves no loader for web URLs."""
    engine = get_loader_engine()
    # Check both the secure and the plain scheme.
    for address in ("https://cognee.ai", "http://cognee.ai"):
        resolved = engine.get_loader(address)
        assert resolved is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_loader_returns_valid_loader_when_preferred_loaders_specified():
    """Requesting "web_url_loader" explicitly yields a WebUrlLoader for web URLs."""
    engine = get_loader_engine()
    # Check both the secure and the plain scheme.
    for address in ("https://cognee.ai", "http://cognee.ai"):
        resolved = engine.get_loader(address, preferred_loaders=["web_url_loader"])
        assert isinstance(resolved, WebUrlLoader)
|
|
||||||
|
|
@ -20,7 +20,7 @@ async def main():
|
||||||
"paragraphs": {"selector": "p", "all": True},
|
"paragraphs": {"selector": "p", "all": True},
|
||||||
}
|
}
|
||||||
|
|
||||||
loaders_config = {
|
fetchers_config = {
|
||||||
"web_url_loader": {
|
"web_url_loader": {
|
||||||
"soup_config": {
|
"soup_config": {
|
||||||
"max_depth": 1,
|
"max_depth": 1,
|
||||||
|
|
@ -32,9 +32,8 @@ async def main():
|
||||||
|
|
||||||
await cognee.add(
|
await cognee.add(
|
||||||
"https://en.wikipedia.org/wiki/Large_language_model",
|
"https://en.wikipedia.org/wiki/Large_language_model",
|
||||||
preferred_loaders=["web_url_loader"],
|
incremental_loading=False,
|
||||||
incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix
|
fetchers_config=fetchers_config,
|
||||||
loaders_config=loaders_config,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
await cognee.cognify()
|
await cognee.cognify()
|
||||||
Loading…
Add table
Reference in a new issue