diff --git a/cognee/infrastructure/files/utils/get_data_file_path.py b/cognee/infrastructure/files/utils/get_data_file_path.py index 7ffda79bd..242d130a9 100644 --- a/cognee/infrastructure/files/utils/get_data_file_path.py +++ b/cognee/infrastructure/files/utils/get_data_file_path.py @@ -38,6 +38,9 @@ def get_data_file_path(file_path: str): return normalized_url + elif file_path.startswith(("http://", "https://")): + return file_path + else: # Regular file path - normalize separators normalized_path = os.path.normpath(file_path) diff --git a/cognee/infrastructure/loaders/external/WebLoader.py b/cognee/infrastructure/loaders/external/WebLoader.py index 609ade2e0..db24c86e6 100644 --- a/cognee/infrastructure/loaders/external/WebLoader.py +++ b/cognee/infrastructure/loaders/external/WebLoader.py @@ -11,7 +11,7 @@ class WebLoader(LoaderInterface): Returns: List of extensions including the dot (e.g., ['.txt', '.md']) """ - raise NotImplementedError + return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality @property def supported_mime_types(self) -> List[str]: @@ -21,7 +21,7 @@ class WebLoader(LoaderInterface): Returns: List of MIME type strings (e.g., ['text/plain', 'application/pdf']) """ - raise NotImplementedError + return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality @property def loader_name(self) -> str: diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 9fcafca57..cd722bd76 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -75,5 +75,10 @@ async def data_item_to_text_file( else: raise IngestionError(message="Local files are not accepted.") + elif data_item_path.startswith(("http://", "https://")): + loader = get_loader_engine() + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) # data is not a supported type raise IngestionError(message=f"Data type not supported: {type(data_item_path)}") diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index b6e1f7d00..d9f1beae7 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -63,40 +63,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - # Validate URL by sending a HEAD request - try: - from cognee.context_global_variables import tavily_config, soup_crawler_config - from cognee.tasks.web_scraper import fetch_page_content - - tavily = tavily_config.get() - soup_crawler = soup_crawler_config.get() - preferred_tool = "beautifulsoup" if soup_crawler else "tavily" - if preferred_tool == "tavily" and tavily is None: - raise IngestionError( - message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." - ) - if preferred_tool == "beautifulsoup" and soup_crawler is None: - raise IngestionError( - message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." - ) - - data = await fetch_page_content( - data_item, - preferred_tool=preferred_tool, - tavily_config=tavily, - soup_crawler_config=soup_crawler, - ) - content = "" - for key, value in data.items(): - content += f"{key}:\n{value}\n\n" - return await save_data_to_file(content) - except IngestionError: - raise - except Exception as e: - raise IngestionError( - message=f"Error ingesting webpage results of url {data_item}: {str(e)}" - ) - + return data_item # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: