diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index af6b53e93..d6c4d4d8c 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -76,8 +76,15 @@ class LoaderEngine: Returns: LoaderInterface that can handle the file, or None if not found """ + is_url = data_item_path.startswith(("http://", "https://")) - file_info = filetype.guess(data_item_path) + if is_url: + extension = None + mime_type = None + else: + file_info = filetype.guess(data_item_path) + extension = file_info.extension if file_info else None + mime_type = file_info.mime if file_info else None # Try preferred loaders first if preferred_loaders: @@ -85,8 +92,8 @@ class LoaderEngine: if loader_name in self._loaders: loader = self._loaders[loader_name] if loader.can_handle( - extension=file_info.extension, - mime_type=file_info.mime, + extension=extension, + mime_type=mime_type, data_item_path=data_item_path, ): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time return loader @@ -99,7 +106,7 @@ class LoaderEngine: ): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): + if loader.can_handle(extension=extension, mime_type=mime_type): return loader else: logger.info( diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 5e0cf07f1..38bca2523 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -1,4 +1,4 @@ -from cognee.infrastructure.loaders import LoaderInterface +from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 91d09059a..f82d9a0dc 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -50,17 +50,17 @@ async def data_item_to_text_file( await pull_from_s3(data_item_path, temp_file) temp_file.flush() # Data needs to be saved to local storage loader = get_loader_engine() - return await loader.load_file(temp_file.name, preferred_loaders), loader.get_loader( - temp_file.name, preferred_loaders - ) + return await loader.load_file( + temp_file.name, None, preferred_loaders + ), loader.get_loader(temp_file.name, preferred_loaders) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( - data_item_path, preferred_loaders - ) + return await loader.load_file( + data_item_path, None, preferred_loaders + ), loader.get_loader(data_item_path, preferred_loaders) else: raise IngestionError(message="Local files are not accepted.") @@ -71,9 +71,9 @@ async def data_item_to_text_file( # Handle both Unix absolute paths (/path) and Windows absolute paths (C:\path) if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( - data_item_path, preferred_loaders - ) + return await loader.load_file( + data_item_path, None, preferred_loaders + ), loader.get_loader(data_item_path, preferred_loaders) else: raise IngestionError(message="Local files are not accepted.") @@ -82,8 +82,9 @@ async def data_item_to_text_file( return ( await loader.load_file( data_item_path, + None, preferred_loaders, - loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal + loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal ), loader.get_loader(data_item_path, preferred_loaders), ) diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index b9840df3d..0c4332c6d 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -6,10 +6,10 @@ import cognee async def test_add_fails_when_preferred_loader_not_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - with pytest.raises: + with pytest.raises(ValueError): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], + incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix ) @@ -17,10 +17,22 @@ async def test_add_fails_when_preferred_loader_not_specified(): async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) + + loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + } + } + } + try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", preferred_loaders=["web_url_loader"], + incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix + loaders_config=loaders_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}")