fix: ensure web urls correctly go through ingest_data and reach loaders

This commit is contained in:
Daulet Amirkhanov 2025-10-16 12:16:37 +01:00
parent 9395539868
commit 95106d5914
4 changed files with 11 additions and 36 deletions

View file

@@ -38,6 +38,9 @@ def get_data_file_path(file_path: str):
return normalized_url
elif file_path.startswith(("http://", "https://")):
return file_path
else:
# Regular file path - normalize separators
normalized_path = os.path.normpath(file_path)

View file

@@ -11,7 +11,7 @@ class WebLoader(LoaderInterface):
Returns:
List of extensions including the dot (e.g., ['.txt', '.md'])
"""
raise NotImplementedError
return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality
@property
def supported_mime_types(self) -> List[str]:
@@ -21,7 +21,7 @@ class WebLoader(LoaderInterface):
Returns:
List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
"""
raise NotImplementedError
return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality
@property
def loader_name(self) -> str:

View file

@@ -75,5 +75,10 @@ async def data_item_to_text_file(
else:
raise IngestionError(message="Local files are not accepted.")
elif data_item_path.startswith(("http://", "https://")):
loader = get_loader_engine()
return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
data_item_path, preferred_loaders
)
# data is not a supported type
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")

View file

@@ -63,40 +63,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
if parsed_url.scheme == "s3":
return data_item
elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
# Validate URL by sending a HEAD request
try:
from cognee.context_global_variables import tavily_config, soup_crawler_config
from cognee.tasks.web_scraper import fetch_page_content
tavily = tavily_config.get()
soup_crawler = soup_crawler_config.get()
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
if preferred_tool == "tavily" and tavily is None:
raise IngestionError(
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
)
if preferred_tool == "beautifulsoup" and soup_crawler is None:
raise IngestionError(
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
)
data = await fetch_page_content(
data_item,
preferred_tool=preferred_tool,
tavily_config=tavily,
soup_crawler_config=soup_crawler,
)
content = ""
for key, value in data.items():
content += f"{key}:\n{value}\n\n"
return await save_data_to_file(content)
except IngestionError:
raise
except Exception as e:
raise IngestionError(
message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
)
return data_item
# data is local file path
elif parsed_url.scheme == "file":
if settings.accept_local_file_path: