fix: ensure web URLs correctly go through ingest_data and reach loaders
This commit is contained in:
parent
9395539868
commit
95106d5914
4 changed files with 11 additions and 36 deletions
|
|
@ -38,6 +38,9 @@ def get_data_file_path(file_path: str):
|
||||||
|
|
||||||
return normalized_url
|
return normalized_url
|
||||||
|
|
||||||
|
elif file_path.startswith(("http://", "https://")):
|
||||||
|
return file_path
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Regular file path - normalize separators
|
# Regular file path - normalize separators
|
||||||
normalized_path = os.path.normpath(file_path)
|
normalized_path = os.path.normpath(file_path)
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ class WebLoader(LoaderInterface):
|
||||||
Returns:
|
Returns:
|
||||||
List of extensions including the dot (e.g., ['.txt', '.md'])
|
List of extensions including the dot (e.g., ['.txt', '.md'])
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def supported_mime_types(self) -> List[str]:
|
def supported_mime_types(self) -> List[str]:
|
||||||
|
|
@ -21,7 +21,7 @@ class WebLoader(LoaderInterface):
|
||||||
Returns:
|
Returns:
|
||||||
List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
|
List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def loader_name(self) -> str:
|
def loader_name(self) -> str:
|
||||||
|
|
|
||||||
|
|
@ -75,5 +75,10 @@ async def data_item_to_text_file(
|
||||||
else:
|
else:
|
||||||
raise IngestionError(message="Local files are not accepted.")
|
raise IngestionError(message="Local files are not accepted.")
|
||||||
|
|
||||||
|
elif data_item_path.startswith(("http://", "https://")):
|
||||||
|
loader = get_loader_engine()
|
||||||
|
return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
|
||||||
|
data_item_path, preferred_loaders
|
||||||
|
)
|
||||||
# data is not a supported type
|
# data is not a supported type
|
||||||
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
|
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
|
||||||
|
|
|
||||||
|
|
@ -63,40 +63,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
|
||||||
if parsed_url.scheme == "s3":
|
if parsed_url.scheme == "s3":
|
||||||
return data_item
|
return data_item
|
||||||
elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
|
elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
|
||||||
# Validate URL by sending a HEAD request
|
return data_item
|
||||||
try:
|
|
||||||
from cognee.context_global_variables import tavily_config, soup_crawler_config
|
|
||||||
from cognee.tasks.web_scraper import fetch_page_content
|
|
||||||
|
|
||||||
tavily = tavily_config.get()
|
|
||||||
soup_crawler = soup_crawler_config.get()
|
|
||||||
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
|
|
||||||
if preferred_tool == "tavily" and tavily is None:
|
|
||||||
raise IngestionError(
|
|
||||||
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
|
|
||||||
)
|
|
||||||
if preferred_tool == "beautifulsoup" and soup_crawler is None:
|
|
||||||
raise IngestionError(
|
|
||||||
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
|
|
||||||
)
|
|
||||||
|
|
||||||
data = await fetch_page_content(
|
|
||||||
data_item,
|
|
||||||
preferred_tool=preferred_tool,
|
|
||||||
tavily_config=tavily,
|
|
||||||
soup_crawler_config=soup_crawler,
|
|
||||||
)
|
|
||||||
content = ""
|
|
||||||
for key, value in data.items():
|
|
||||||
content += f"{key}:\n{value}\n\n"
|
|
||||||
return await save_data_to_file(content)
|
|
||||||
except IngestionError:
|
|
||||||
raise
|
|
||||||
except Exception as e:
|
|
||||||
raise IngestionError(
|
|
||||||
message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# data is local file path
|
# data is local file path
|
||||||
elif parsed_url.scheme == "file":
|
elif parsed_url.scheme == "file":
|
||||||
if settings.accept_local_file_path:
|
if settings.accept_local_file_path:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue