fix: ensure web urls correctly go through ingest_data and reach loaders
parent 9395539868
commit 95106d5914
4 changed files with 11 additions and 36 deletions
@@ -38,6 +38,9 @@ def get_data_file_path(file_path: str):
        return normalized_url

    elif file_path.startswith(("http://", "https://")):
        return file_path

    else:
        # Regular file path - normalize separators
        normalized_path = os.path.normpath(file_path)
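Taken on its own, this hunk makes get_data_file_path hand web URLs back untouched instead of running them through path normalization. A minimal standalone sketch of that branching, assuming nothing beyond what the hunk shows (the normalized_url branch above it is omitted, and the helper name here is illustrative only):

import os


def get_data_file_path_sketch(file_path: str) -> str:
    # Web URLs are returned as-is so normalization cannot collapse the
    # "//" after the scheme or rewrite the separators.
    if file_path.startswith(("http://", "https://")):
        return file_path
    # Regular file path - normalize separators, as in the hunk above.
    return os.path.normpath(file_path)


print(get_data_file_path_sketch("https://example.com/docs/page"))  # returned unchanged
print(get_data_file_path_sketch("data//files/../notes.txt"))       # normalized local path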
@@ -11,7 +11,7 @@ class WebLoader(LoaderInterface):
        Returns:
            List of extensions including the dot (e.g., ['.txt', '.md'])
        """
        raise NotImplementedError
        return []  # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality

    @property
    def supported_mime_types(self) -> List[str]:
@@ -21,7 +21,7 @@ class WebLoader(LoaderInterface):
        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
        """
        raise NotImplementedError
        return []  # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality

    @property
    def loader_name(self) -> str:
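The two WebLoader hunks each swap a single line (the -11,7 +11,7 and -21,7 +21,7 counts): the raise NotImplementedError stubs appear to give way to return [], so the loader can be registered even though file extensions and MIME types are meaningless for URLs. A hedged, self-contained sketch of that shape (the first hunk's docstring suggests a supported_extensions-style property; LoaderInterface itself is not reproduced here):

from typing import List


class WebLoaderSketch:
    # Illustrative stand-in for the real WebLoader; only the two properties
    # touched by this commit are shown.

    @property
    def supported_extensions(self) -> List[str]:
        # Empty on purpose: the value is only read by register() /
        # get_loader_info() and does not affect how URLs are loaded.
        return []

    @property
    def supported_mime_types(self) -> List[str]:
        # Same reasoning as above.
        return []


loader = WebLoaderSketch()
print(loader.supported_extensions, loader.supported_mime_types)  # [] []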
@@ -75,5 +75,10 @@ async def data_item_to_text_file(
        else:
            raise IngestionError(message="Local files are not accepted.")

    elif data_item_path.startswith(("http://", "https://")):
        loader = get_loader_engine()
        return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
            data_item_path, preferred_loaders
        )
    # data is not a supported type
    raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
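The hunk above adds an http(s) branch to data_item_to_text_file, so web URLs are handed to the loader engine rather than falling through to the unsupported-type error. A runnable sketch of that dispatch with a stubbed engine (the IngestionError class, the engine, and its return values here are stand-ins, not cognee's actual API):

import asyncio


class IngestionError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class StubLoaderEngine:
    # Stand-in for whatever get_loader_engine() returns; a real loader
    # would fetch the URL and extract text from it.
    async def load_file(self, path, preferred_loaders=None):
        return f"<text extracted from {path}>"

    def get_loader(self, path, preferred_loaders=None):
        return "WebLoader"


async def data_item_to_text_file_sketch(data_item_path: str, preferred_loaders=None):
    if data_item_path.startswith(("http://", "https://")):
        loader = StubLoaderEngine()
        # Mirrors the added branch: return both the loaded content and the
        # loader that was selected for this URL.
        return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
            data_item_path, preferred_loaders
        )
    # data is not a supported type
    raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")


print(asyncio.run(data_item_to_text_file_sketch("https://example.com")))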
@@ -63,40 +63,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
    if parsed_url.scheme == "s3":
        return data_item
    elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
        # Validate URL by sending a HEAD request
        try:
            from cognee.context_global_variables import tavily_config, soup_crawler_config
            from cognee.tasks.web_scraper import fetch_page_content

            tavily = tavily_config.get()
            soup_crawler = soup_crawler_config.get()
            preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
            if preferred_tool == "tavily" and tavily is None:
                raise IngestionError(
                    message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
                )
            if preferred_tool == "beautifulsoup" and soup_crawler is None:
                raise IngestionError(
                    message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
                )

            data = await fetch_page_content(
                data_item,
                preferred_tool=preferred_tool,
                tavily_config=tavily,
                soup_crawler_config=soup_crawler,
            )
            content = ""
            for key, value in data.items():
                content += f"{key}:\n{value}\n\n"
            return await save_data_to_file(content)
        except IngestionError:
            raise
        except Exception as e:
            raise IngestionError(
                message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
            )

        return data_item
    # data is local file path
    elif parsed_url.scheme == "file":
        if settings.accept_local_file_path:
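With the Tavily/BeautifulSoup fetch removed, the hunk counts (-63,40 +63,7) suggest the http(s) branch of save_data_item_to_storage now simply returns the URL, leaving the actual page fetch to the loaders later in the pipeline. A hedged sketch of the surviving control flow (the real function is async and handles local files via settings.accept_local_file_path; this synchronous version simplifies both):

from urllib.parse import urlparse


def save_data_item_to_storage_sketch(data_item: str) -> str:
    parsed_url = urlparse(data_item)
    if parsed_url.scheme == "s3":
        return data_item
    elif parsed_url.scheme in ("http", "https"):
        # The in-place scraping that used to live here was removed; the URL
        # itself is now the stored reference and loaders fetch it later.
        return data_item
    elif parsed_url.scheme == "file":
        # assumption: local-file handling follows here in the real function
        raise NotImplementedError("local file handling not shown in this sketch")
    raise ValueError(f"Unsupported scheme: {parsed_url.scheme!r}")


print(save_data_item_to_storage_sketch("https://example.com/article"))  # returns the URL
print(save_data_item_to_storage_sketch("s3://bucket/key"))              # returns the URI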