From a70ce2785b6b2b5aebec120d2935c965f0416de9 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 18 Dec 2025 16:07:19 +0100 Subject: [PATCH 1/2] Release v0.5.1.dev0 --- pyproject.toml | 2 +- uv.lock | 40 ++++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 24ea6ca9b..1ae95c9cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "cognee" -version = "0.5.0" +version = "0.5.1.dev0" description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning." authors = [ { name = "Vasilije Markovic" }, diff --git a/uv.lock b/uv.lock index 9df387449..b4a613337 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10, <3.14" resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin'", @@ -927,7 +927,7 @@ wheels = [ [[package]] name = "cognee" -version = "0.5.0" +version = "0.5.1.dev0" source = { editable = "." } dependencies = [ { name = "aiofiles" }, @@ -5216,7 +5216,7 @@ name = "nvidia-cudnn-cu12" version = "9.10.2.21" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, @@ -5227,7 +5227,7 @@ name = "nvidia-cufft-cu12" version = "11.3.3.83" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, @@ -5254,9 +5254,9 @@ name = "nvidia-cusolver-cu12" version = "11.7.3.90" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, @@ -5267,7 +5267,7 @@ name = "nvidia-cusparse-cu12" version = "12.5.8.93" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, @@ -5327,9 +5327,9 @@ name = "ocrmac" version = "1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click" }, - { name = "pillow" }, - { name = "pyobjc-framework-vision" }, + { name = "click", marker = "sys_platform == 'darwin'" }, + { name = "pillow", marker = "sys_platform == 'darwin'" }, + { name = "pyobjc-framework-vision", marker = "sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" } wheels = [ @@ -6937,7 +6937,7 @@ name = "pyobjc-framework-cocoa" version = "12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, + { name = "pyobjc-core", marker = "sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/02/a3/16ca9a15e77c061a9250afbae2eae26f2e1579eb8ca9462ae2d2c71e1169/pyobjc_framework_cocoa-12.1.tar.gz", hash = "sha256:5556c87db95711b985d5efdaaf01c917ddd41d148b1e52a0c66b1a2e2c5c1640", size = 2772191, upload-time = "2025-11-14T10:13:02.069Z" } wheels = [ @@ -6953,8 +6953,8 @@ name = "pyobjc-framework-coreml" version = "12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, + { name = "pyobjc-core", marker = "sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/30/2d/baa9ea02cbb1c200683cb7273b69b4bee5070e86f2060b77e6a27c2a9d7e/pyobjc_framework_coreml-12.1.tar.gz", hash = "sha256:0d1a4216891a18775c9e0170d908714c18e4f53f9dc79fb0f5263b2aa81609ba", size = 40465, upload-time = "2025-11-14T10:14:02.265Z" } wheels = [ @@ -6970,8 +6970,8 @@ name = "pyobjc-framework-quartz" version = "12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, + { name = "pyobjc-core", marker = "sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/94/18/cc59f3d4355c9456fc945eae7fe8797003c4da99212dd531ad1b0de8a0c6/pyobjc_framework_quartz-12.1.tar.gz", hash = "sha256:27f782f3513ac88ec9b6c82d9767eef95a5cf4175ce88a1e5a65875fee799608", size = 3159099, upload-time = "2025-11-14T10:21:24.31Z" } wheels = [ @@ -6987,10 +6987,10 @@ name = "pyobjc-framework-vision" version = "12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, - { name = "pyobjc-framework-coreml" }, - { name = "pyobjc-framework-quartz" }, + { name = "pyobjc-core", marker = "sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "sys_platform == 'darwin'" }, + { name = "pyobjc-framework-coreml", marker = "sys_platform == 'darwin'" }, + { name = "pyobjc-framework-quartz", marker = "sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c2/5a/08bb3e278f870443d226c141af14205ff41c0274da1e053b72b11dfc9fb2/pyobjc_framework_vision-12.1.tar.gz", hash = "sha256:a30959100e85dcede3a786c544e621ad6eb65ff6abf85721f805822b8c5fe9b0", size = 59538, upload-time = "2025-11-14T10:23:21.979Z" } wheels = [ From 4f07adee66b51411a09af7044e5aa5ab7de8c434 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:10:05 +0100 Subject: [PATCH 2/2] chore: fixes get_raw_data endpoint and adds s3 support (#1916) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description This PR fixes get_raw_data endpoint in get_dataset_router - Fixes local path access - Adds s3 access - Covers new fixed functionality with unit tests ## Acceptance Criteria ## Type of Change - [x] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. ## Summary by CodeRabbit * **New Features** * Streaming support for remote S3 data locations so large dataset files can be retrieved efficiently. * Improved handling of local and remote file paths for downloads. * **Improvements** * Standardized error responses for missing datasets or data files. * **Tests** * Added unit tests covering local file downloads and S3 streaming, including content and attachment header verification. ✏️ Tip: You can customize this high-level summary in your review settings. --- .../datasets/routers/get_datasets_router.py | 40 +++++- .../unit/api/test_get_raw_data_endpoint.py | 136 ++++++++++++++++++ 2 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 cognee/tests/unit/api/test_get_raw_data_endpoint.py diff --git a/cognee/api/v1/datasets/routers/get_datasets_router.py b/cognee/api/v1/datasets/routers/get_datasets_router.py index 040ed14bf..afd2b2cce 100644 --- a/cognee/api/v1/datasets/routers/get_datasets_router.py +++ b/cognee/api/v1/datasets/routers/get_datasets_router.py @@ -7,7 +7,9 @@ from fastapi import status from fastapi import APIRouter from fastapi.encoders import jsonable_encoder from fastapi import HTTPException, Query, Depends -from fastapi.responses import JSONResponse, FileResponse +from fastapi.responses import JSONResponse, FileResponse, StreamingResponse +from urllib.parse import urlparse +from pathlib import Path from cognee.api.DTO import InDTO, OutDTO from cognee.infrastructure.databases.relational import get_relational_engine @@ -476,6 +478,40 @@ def get_datasets_router() -> APIRouter: message=f"Data ({data_id}) not found in dataset ({dataset_id})." ) - return data.raw_data_location + raw_location = data.raw_data_location + + if raw_location.startswith("file://"): + from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path + + raw_location = get_data_file_path(raw_location) + + if raw_location.startswith("s3://"): + from cognee.infrastructure.files.utils.open_data_file import open_data_file + from cognee.infrastructure.utils.run_async import run_async + + parsed = urlparse(raw_location) + download_name = Path(parsed.path).name or data.name + media_type = data.mime_type or "application/octet-stream" + + async def file_iterator(chunk_size: int = 1024 * 1024): + async with open_data_file(raw_location, mode="rb") as file: + while True: + chunk = await run_async(file.read, chunk_size) + if not chunk: + break + yield chunk + + return StreamingResponse( + file_iterator(), + media_type=media_type, + headers={"Content-Disposition": f'attachment; filename="{download_name}"'}, + ) + + path = Path(raw_location) + + if not path.is_file(): + raise DataNotFoundError(message=f"Raw file not found on disk for data ({data_id}).") + + return FileResponse(path=path) return router diff --git a/cognee/tests/unit/api/test_get_raw_data_endpoint.py b/cognee/tests/unit/api/test_get_raw_data_endpoint.py new file mode 100644 index 000000000..392919755 --- /dev/null +++ b/cognee/tests/unit/api/test_get_raw_data_endpoint.py @@ -0,0 +1,136 @@ +import io +import uuid +from contextlib import asynccontextmanager +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from cognee.modules.users.methods import get_authenticated_user + + +@pytest.fixture(scope="session") +def test_client(): + from cognee.api.v1.datasets.routers.get_datasets_router import get_datasets_router + + app = FastAPI() + app.include_router(get_datasets_router(), prefix="/api/v1/datasets") + + with TestClient(app) as c: + yield c + + +@pytest.fixture +def client(test_client): + async def override_get_authenticated_user(): + return SimpleNamespace( + id=str(uuid.uuid4()), + email="default@example.com", + is_active=True, + tenant_id=str(uuid.uuid4()), + ) + + import importlib + + datasets_router_module = importlib.import_module( + "cognee.api.v1.datasets.routers.get_datasets_router" + ) + datasets_router_module.send_telemetry = lambda *args, **kwargs: None + + test_client.app.dependency_overrides[get_authenticated_user] = override_get_authenticated_user + yield test_client + test_client.app.dependency_overrides.pop(get_authenticated_user, None) + + +def _patch_raw_download_dependencies( + monkeypatch, *, dataset_id, data_id, raw_data_location, name, mime_type +): + """ + Patch the internal dataset/data lookups used by GET /datasets/{dataset_id}/data/{data_id}/raw. + Keeps the test focused on response behavior (FileResponse vs StreamingResponse). + """ + import importlib + + datasets_router_module = importlib.import_module( + "cognee.api.v1.datasets.routers.get_datasets_router" + ) + + monkeypatch.setattr( + datasets_router_module, + "get_authorized_existing_datasets", + AsyncMock(return_value=[SimpleNamespace(id=dataset_id)]), + ) + + import cognee.modules.data.methods as data_methods_module + + monkeypatch.setattr( + data_methods_module, + "get_dataset_data", + AsyncMock(return_value=[SimpleNamespace(id=data_id)]), + ) + monkeypatch.setattr( + data_methods_module, + "get_data", + AsyncMock( + return_value=SimpleNamespace( + id=data_id, + raw_data_location=raw_data_location, + name=name, + mime_type=mime_type, + ) + ), + ) + + +def test_get_raw_data_local_file_downloads_bytes(client, monkeypatch, tmp_path): + """Downloads bytes from a file:// raw_data_location.""" + dataset_id = uuid.uuid4() + data_id = uuid.uuid4() + + file_path = tmp_path / "example.txt" + content = b"hello from disk" + file_path.write_bytes(content) + + _patch_raw_download_dependencies( + monkeypatch, + dataset_id=dataset_id, + data_id=data_id, + raw_data_location=file_path.as_uri(), + name="example.txt", + mime_type="text/plain", + ) + + response = client.get(f"/api/v1/datasets/{dataset_id}/data/{data_id}/raw") + assert response.status_code == 200 + assert response.content == content + + +def test_get_raw_data_s3_streams_bytes_without_s3_dependency(client, monkeypatch): + """Streams bytes from an s3:// raw_data_location (mocked).""" + dataset_id = uuid.uuid4() + data_id = uuid.uuid4() + + _patch_raw_download_dependencies( + monkeypatch, + dataset_id=dataset_id, + data_id=data_id, + raw_data_location="s3://bucket/path/to/file.txt", + name="file.txt", + mime_type="text/plain", + ) + + import cognee.infrastructure.files.utils.open_data_file as open_data_file_module + + @asynccontextmanager + async def fake_open_data_file(_file_path: str, mode: str = "rb", **_kwargs): + assert mode == "rb" + yield io.BytesIO(b"hello from s3") + + monkeypatch.setattr(open_data_file_module, "open_data_file", fake_open_data_file) + + response = client.get(f"/api/v1/datasets/{dataset_id}/data/{data_id}/raw") + assert response.status_code == 200 + assert response.content == b"hello from s3" + assert response.headers.get("content-disposition") == 'attachment; filename="file.txt"'