Merge branch 'dev' into feature/cog-3532-empower-test_search-db-retrievers-tests-reorg-4

This commit is contained in:
hajdul88 2025-12-18 16:10:16 +01:00 committed by GitHub
commit ef51dcfb7a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 195 additions and 23 deletions

View file

@ -7,7 +7,9 @@ from fastapi import status
from fastapi import APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi import HTTPException, Query, Depends
from fastapi.responses import JSONResponse, FileResponse
from fastapi.responses import JSONResponse, FileResponse, StreamingResponse
from urllib.parse import urlparse
from pathlib import Path
from cognee.api.DTO import InDTO, OutDTO
from cognee.infrastructure.databases.relational import get_relational_engine
@ -476,6 +478,40 @@ def get_datasets_router() -> APIRouter:
message=f"Data ({data_id}) not found in dataset ({dataset_id})."
)
return data.raw_data_location
raw_location = data.raw_data_location
if raw_location.startswith("file://"):
from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
raw_location = get_data_file_path(raw_location)
if raw_location.startswith("s3://"):
from cognee.infrastructure.files.utils.open_data_file import open_data_file
from cognee.infrastructure.utils.run_async import run_async
parsed = urlparse(raw_location)
download_name = Path(parsed.path).name or data.name
media_type = data.mime_type or "application/octet-stream"
async def file_iterator(chunk_size: int = 1024 * 1024):
async with open_data_file(raw_location, mode="rb") as file:
while True:
chunk = await run_async(file.read, chunk_size)
if not chunk:
break
yield chunk
return StreamingResponse(
file_iterator(),
media_type=media_type,
headers={"Content-Disposition": f'attachment; filename="{download_name}"'},
)
path = Path(raw_location)
if not path.is_file():
raise DataNotFoundError(message=f"Raw file not found on disk for data ({data_id}).")
return FileResponse(path=path)
return router

View file

@ -0,0 +1,136 @@
import io
import uuid
from contextlib import asynccontextmanager
from types import SimpleNamespace
from unittest.mock import AsyncMock
import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from cognee.modules.users.methods import get_authenticated_user
@pytest.fixture(scope="session")
def test_client():
from cognee.api.v1.datasets.routers.get_datasets_router import get_datasets_router
app = FastAPI()
app.include_router(get_datasets_router(), prefix="/api/v1/datasets")
with TestClient(app) as c:
yield c
@pytest.fixture
def client(test_client):
async def override_get_authenticated_user():
return SimpleNamespace(
id=str(uuid.uuid4()),
email="default@example.com",
is_active=True,
tenant_id=str(uuid.uuid4()),
)
import importlib
datasets_router_module = importlib.import_module(
"cognee.api.v1.datasets.routers.get_datasets_router"
)
datasets_router_module.send_telemetry = lambda *args, **kwargs: None
test_client.app.dependency_overrides[get_authenticated_user] = override_get_authenticated_user
yield test_client
test_client.app.dependency_overrides.pop(get_authenticated_user, None)
def _patch_raw_download_dependencies(
monkeypatch, *, dataset_id, data_id, raw_data_location, name, mime_type
):
"""
Patch the internal dataset/data lookups used by GET /datasets/{dataset_id}/data/{data_id}/raw.
Keeps the test focused on response behavior (FileResponse vs StreamingResponse).
"""
import importlib
datasets_router_module = importlib.import_module(
"cognee.api.v1.datasets.routers.get_datasets_router"
)
monkeypatch.setattr(
datasets_router_module,
"get_authorized_existing_datasets",
AsyncMock(return_value=[SimpleNamespace(id=dataset_id)]),
)
import cognee.modules.data.methods as data_methods_module
monkeypatch.setattr(
data_methods_module,
"get_dataset_data",
AsyncMock(return_value=[SimpleNamespace(id=data_id)]),
)
monkeypatch.setattr(
data_methods_module,
"get_data",
AsyncMock(
return_value=SimpleNamespace(
id=data_id,
raw_data_location=raw_data_location,
name=name,
mime_type=mime_type,
)
),
)
def test_get_raw_data_local_file_downloads_bytes(client, monkeypatch, tmp_path):
"""Downloads bytes from a file:// raw_data_location."""
dataset_id = uuid.uuid4()
data_id = uuid.uuid4()
file_path = tmp_path / "example.txt"
content = b"hello from disk"
file_path.write_bytes(content)
_patch_raw_download_dependencies(
monkeypatch,
dataset_id=dataset_id,
data_id=data_id,
raw_data_location=file_path.as_uri(),
name="example.txt",
mime_type="text/plain",
)
response = client.get(f"/api/v1/datasets/{dataset_id}/data/{data_id}/raw")
assert response.status_code == 200
assert response.content == content
def test_get_raw_data_s3_streams_bytes_without_s3_dependency(client, monkeypatch):
"""Streams bytes from an s3:// raw_data_location (mocked)."""
dataset_id = uuid.uuid4()
data_id = uuid.uuid4()
_patch_raw_download_dependencies(
monkeypatch,
dataset_id=dataset_id,
data_id=data_id,
raw_data_location="s3://bucket/path/to/file.txt",
name="file.txt",
mime_type="text/plain",
)
import cognee.infrastructure.files.utils.open_data_file as open_data_file_module
@asynccontextmanager
async def fake_open_data_file(_file_path: str, mode: str = "rb", **_kwargs):
assert mode == "rb"
yield io.BytesIO(b"hello from s3")
monkeypatch.setattr(open_data_file_module, "open_data_file", fake_open_data_file)
response = client.get(f"/api/v1/datasets/{dataset_id}/data/{data_id}/raw")
assert response.status_code == 200
assert response.content == b"hello from s3"
assert response.headers.get("content-disposition") == 'attachment; filename="file.txt"'

View file

@ -1,7 +1,7 @@
[project]
name = "cognee"
version = "0.5.0"
version = "0.5.1.dev0"
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
authors = [
{ name = "Vasilije Markovic" },

40
uv.lock generated
View file

@ -1,5 +1,5 @@
version = 1
revision = 2
revision = 3
requires-python = ">=3.10, <3.14"
resolution-markers = [
"python_full_version >= '3.13' and sys_platform == 'darwin'",
@ -927,7 +927,7 @@ wheels = [
[[package]]
name = "cognee"
version = "0.5.0"
version = "0.5.1.dev0"
source = { editable = "." }
dependencies = [
{ name = "aiofiles" },
@ -5216,7 +5216,7 @@ name = "nvidia-cudnn-cu12"
version = "9.10.2.21"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nvidia-cublas-cu12" },
{ name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
@ -5227,7 +5227,7 @@ name = "nvidia-cufft-cu12"
version = "11.3.3.83"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nvidia-nvjitlink-cu12" },
{ name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" },
@ -5254,9 +5254,9 @@ name = "nvidia-cusolver-cu12"
version = "11.7.3.90"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nvidia-cublas-cu12" },
{ name = "nvidia-cusparse-cu12" },
{ name = "nvidia-nvjitlink-cu12" },
{ name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" },
@ -5267,7 +5267,7 @@ name = "nvidia-cusparse-cu12"
version = "12.5.8.93"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nvidia-nvjitlink-cu12" },
{ name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" },
@ -5327,9 +5327,9 @@ name = "ocrmac"
version = "1.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "pillow" },
{ name = "pyobjc-framework-vision" },
{ name = "click", marker = "sys_platform == 'darwin'" },
{ name = "pillow", marker = "sys_platform == 'darwin'" },
{ name = "pyobjc-framework-vision", marker = "sys_platform == 'darwin'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" }
wheels = [
@ -6937,7 +6937,7 @@ name = "pyobjc-framework-cocoa"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-core", marker = "sys_platform == 'darwin'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/02/a3/16ca9a15e77c061a9250afbae2eae26f2e1579eb8ca9462ae2d2c71e1169/pyobjc_framework_cocoa-12.1.tar.gz", hash = "sha256:5556c87db95711b985d5efdaaf01c917ddd41d148b1e52a0c66b1a2e2c5c1640", size = 2772191, upload-time = "2025-11-14T10:13:02.069Z" }
wheels = [
@ -6953,8 +6953,8 @@ name = "pyobjc-framework-coreml"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-core", marker = "sys_platform == 'darwin'" },
{ name = "pyobjc-framework-cocoa", marker = "sys_platform == 'darwin'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/30/2d/baa9ea02cbb1c200683cb7273b69b4bee5070e86f2060b77e6a27c2a9d7e/pyobjc_framework_coreml-12.1.tar.gz", hash = "sha256:0d1a4216891a18775c9e0170d908714c18e4f53f9dc79fb0f5263b2aa81609ba", size = 40465, upload-time = "2025-11-14T10:14:02.265Z" }
wheels = [
@ -6970,8 +6970,8 @@ name = "pyobjc-framework-quartz"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-core", marker = "sys_platform == 'darwin'" },
{ name = "pyobjc-framework-cocoa", marker = "sys_platform == 'darwin'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/94/18/cc59f3d4355c9456fc945eae7fe8797003c4da99212dd531ad1b0de8a0c6/pyobjc_framework_quartz-12.1.tar.gz", hash = "sha256:27f782f3513ac88ec9b6c82d9767eef95a5cf4175ce88a1e5a65875fee799608", size = 3159099, upload-time = "2025-11-14T10:21:24.31Z" }
wheels = [
@ -6987,10 +6987,10 @@ name = "pyobjc-framework-vision"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-framework-coreml" },
{ name = "pyobjc-framework-quartz" },
{ name = "pyobjc-core", marker = "sys_platform == 'darwin'" },
{ name = "pyobjc-framework-cocoa", marker = "sys_platform == 'darwin'" },
{ name = "pyobjc-framework-coreml", marker = "sys_platform == 'darwin'" },
{ name = "pyobjc-framework-quartz", marker = "sys_platform == 'darwin'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c2/5a/08bb3e278f870443d226c141af14205ff41c0274da1e053b72b11dfc9fb2/pyobjc_framework_vision-12.1.tar.gz", hash = "sha256:a30959100e85dcede3a786c544e621ad6eb65ff6abf85721f805822b8c5fe9b0", size = 59538, upload-time = "2025-11-14T10:23:21.979Z" }
wheels = [