fix: fixes s3 access in get_raw_data endpoint

This commit is contained in:
hajdul88 2025-12-17 14:47:10 +01:00
parent 8340c69f85
commit b975161d68

View file

@ -7,7 +7,7 @@ from fastapi import status
from fastapi import APIRouter from fastapi import APIRouter
from fastapi.encoders import jsonable_encoder from fastapi.encoders import jsonable_encoder
from fastapi import HTTPException, Query, Depends from fastapi import HTTPException, Query, Depends
from fastapi.responses import JSONResponse, FileResponse from fastapi.responses import JSONResponse, FileResponse, StreamingResponse
from urllib.parse import urlparse from urllib.parse import urlparse
from pathlib import Path from pathlib import Path
@ -482,12 +482,32 @@ def get_datasets_router() -> APIRouter:
if raw_location.startswith("file:"): if raw_location.startswith("file:"):
raw_location = urlparse(raw_location).path raw_location = urlparse(raw_location).path
if raw_location.startswith("s3://"):
from cognee.infrastructure.files.utils.open_data_file import open_data_file
from cognee.infrastructure.utils.run_async import run_async
parsed = urlparse(raw_location)
download_name = Path(parsed.path).name or data.name
media_type = data.mime_type or "application/octet-stream"
async def file_iterator(chunk_size: int = 1024 * 1024):
    """Asynchronously yield the data stored at ``raw_location`` piece by piece.

    The location is opened in binary mode through ``open_data_file`` and
    read with ``file.read(chunk_size)`` until a read returns an empty
    (falsy) result. The handle is released when the ``async with`` block
    exits.

    Args:
        chunk_size: Size argument passed to every ``read`` call
            (defaults to 1024 * 1024).

    Yields:
        Each non-empty chunk returned by ``read``, in order.
    """
    async with open_data_file(raw_location, mode="rb") as stream:
        # Each read goes through run_async — presumably to keep the blocking
        # read off the event loop; confirm against run_async's implementation.
        piece = await run_async(stream.read, chunk_size)
        while piece:
            yield piece
            piece = await run_async(stream.read, chunk_size)
return StreamingResponse(
file_iterator(),
media_type=media_type,
headers={"Content-Disposition": f'attachment; filename="{download_name}"'},
)
path = Path(raw_location) path = Path(raw_location)
if not path.is_file(): if not path.is_file():
raise DataNotFoundError( raise DataNotFoundError(message=f"Raw file not found on disk for data ({data_id}).")
message=f"Raw file not found on disk for data ({data_id})."
)
return FileResponse(path=path) return FileResponse(path=path)