From 82d48663bb5d722ec9310f5745998299793fabaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Pe=C3=B1a=20del=20R=C3=ADo?=
Date: Mon, 17 Nov 2025 23:27:31 +0100
Subject: [PATCH 1/8] Add custom label support to Data model (#1769)

---
 .../a1b2c3d4e5f6_add_label_column_to_data.py | 26 +++++++++++++++++++
 .../datasets/routers/get_datasets_router.py  |  1 +
 cognee/modules/data/models/Data.py           |  3 ++-
 cognee/tasks/ingestion/data_item.py          |  8 ++++++
 cognee/tasks/ingestion/ingest_data.py        | 14 ++++++++--
 uv.lock                                      | 12 +++++++--
 6 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
 create mode 100644 cognee/tasks/ingestion/data_item.py

diff --git a/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py b/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
new file mode 100644
index 000000000..8e7bc19b1
--- /dev/null
+++ b/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
@@ -0,0 +1,26 @@
+"""Add sync_operations table
+
+Revision ID: a1b2c3d4e5f6
+Revises: 211ab850ef3d
+Create Date: 2025-11-17 17:54:32.123456
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision: str = "a1b2c3d4e5f6"
+down_revision: Union[str, None] = "211ab850ef3d"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+def upgrade() -> None:
+    op.add_column(
+        "data",
+        sa.Column("label", sa.String(), nullable=True)),
+
+def downgrade() -> None:
+    op.drop_column("data", "label")
\ No newline at end of file
diff --git a/cognee/api/v1/datasets/routers/get_datasets_router.py b/cognee/api/v1/datasets/routers/get_datasets_router.py
index eff87b3af..84ba3126b 100644
--- a/cognee/api/v1/datasets/routers/get_datasets_router.py
+++ b/cognee/api/v1/datasets/routers/get_datasets_router.py
@@ -44,6 +44,7 @@ class DatasetDTO(OutDTO):
 class DataDTO(OutDTO):
     id: UUID
     name: str
+    label: Optional[str] = None
     created_at: datetime
     updated_at: Optional[datetime] = None
     extension: str
diff --git a/cognee/modules/data/models/Data.py b/cognee/modules/data/models/Data.py
index ef228f2e1..3cdead9d9 100644
--- a/cognee/modules/data/models/Data.py
+++ b/cognee/modules/data/models/Data.py
@@ -13,7 +13,7 @@ class Data(Base):
     __tablename__ = "data"
 
     id = Column(UUID, primary_key=True, default=uuid4)
-
+    label = Column(String, nullable=True)
     name = Column(String)
     extension = Column(String)
     mime_type = Column(String)
@@ -49,6 +49,7 @@ class Data(Base):
         return {
             "id": str(self.id),
             "name": self.name,
+            "label": self.label,
             "extension": self.extension,
             "mimeType": self.mime_type,
             "rawDataLocation": self.raw_data_location,
diff --git a/cognee/tasks/ingestion/data_item.py b/cognee/tasks/ingestion/data_item.py
new file mode 100644
index 000000000..23570bf77
--- /dev/null
+++ b/cognee/tasks/ingestion/data_item.py
@@ -0,0 +1,8 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+@dataclass
+class DataItem:
+    data: Any
+    label: Optional[str] = None
+
diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py
index 0572d0f1e..3f38dc6db 100644
--- a/cognee/tasks/ingestion/ingest_data.py
+++ b/cognee/tasks/ingestion/ingest_data.py
@@ -20,7 +20,7 @@ from cognee.modules.data.methods import (
 
 from .save_data_item_to_storage import save_data_item_to_storage
 from .data_item_to_text_file import data_item_to_text_file
-
+from .data_item import DataItem
 
 async def ingest_data(
     data: Any,
@@ -78,8 +78,16 @@ async def ingest_data(
         dataset_data_map = {str(data.id): True for data in dataset_data}
 
         for data_item in data:
+            # Support for DataItem (custom label + data wrapper)
+            current_label = None
+            underlying_data = data_item
+
+            if isinstance(data_item, DataItem):
+                underlying_data = data_item.data
+                current_label = data_item.label
+
             # Get file path of data item or create a file if it doesn't exist
-            original_file_path = await save_data_item_to_storage(data_item)
+            original_file_path = await save_data_item_to_storage(underlying_data)
 
             # Transform file path to be OS usable
             actual_file_path = get_data_file_path(original_file_path)
@@ -139,6 +147,7 @@ async def ingest_data(
             data_point.external_metadata = ext_metadata
             data_point.node_set = json.dumps(node_set) if node_set else None
             data_point.tenant_id = user.tenant_id if user.tenant_id else None
+            data_point.label = current_label
 
             # Check if data is already in dataset
             if str(data_point.id) in dataset_data_map:
@@ -169,6 +178,7 @@ async def ingest_data(
                 tenant_id=user.tenant_id if user.tenant_id else None,
                 pipeline_status={},
                 token_count=-1,
+                label = current_label
             )
 
             new_datapoints.append(data_point)
diff --git a/uv.lock b/uv.lock
index e2fc1df83..3ed54543d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.10, <3.14"
 resolution-markers = [
     "python_full_version >= '3.13' and platform_python_implementation != 'PyPy' and sys_platform == 'darwin'",
@@ -929,7 +929,7 @@ wheels = [
 
 [[package]]
 name = "cognee"
-version = "0.3.9"
+version = "0.4.0"
 source = { editable = "." }
 dependencies = [
     { name = "aiofiles" },
@@ -2529,6 +2529,8 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" },
     { url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" },
     { url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" },
     { url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" },
     { url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" },
     { url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" },
@@ -2538,6 +2540,8 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" },
     { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" },
     { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" },
+    { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" },
     { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" },
     { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" },
     { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" },
@@ -2547,6 +2551,8 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
     { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
     { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
+    { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
+    { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
     { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
     { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
     { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
@@ -2556,6 +2562,8 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
     { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
     { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
+    { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
     { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
 ]

From 8ea83e4a26a598f8bc65aa27b15fad68e91b0b2b Mon Sep 17 00:00:00 2001
From: apenade <166741079+apenade@users.noreply.github.com>
Date: Tue, 18 Nov 2025 10:44:49 +0100
Subject: [PATCH 2/8] Update alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py b/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
index 8e7bc19b1..bffe61b46 100644
--- a/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
+++ b/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
@@ -20,7 +20,8 @@ depends_on: Union[str, Sequence[str], None] = None
 def upgrade() -> None:
     op.add_column(
         "data",
-        sa.Column("label", sa.String(), nullable=True)),
+        sa.Column("label", sa.String(), nullable=True)
+    )
 
 def downgrade() -> None:
     op.drop_column("data", "label")
\ No newline at end of file

From a451fb8c5a22503dc0666af7f9383372344eeb7a Mon Sep 17 00:00:00 2001
From: apenade <166741079+apenade@users.noreply.github.com>
Date: Tue, 18 Nov 2025 10:45:31 +0100
Subject: [PATCH 3/8] Update cognee/tasks/ingestion/data_item.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 cognee/tasks/ingestion/data_item.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cognee/tasks/ingestion/data_item.py b/cognee/tasks/ingestion/data_item.py
index 23570bf77..23285d677 100644
--- a/cognee/tasks/ingestion/data_item.py
+++ b/cognee/tasks/ingestion/data_item.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 @dataclass
 class DataItem:

From a072773995734b8496b7e1a57a2a7abfbfe6faa7 Mon Sep 17 00:00:00 2001
From: apenade <166741079+apenade@users.noreply.github.com>
Date: Wed, 19 Nov 2025 16:02:27 +0100
Subject: [PATCH 4/8] Update alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py b/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
index bffe61b46..814467954 100644
--- a/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
+++ b/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
@@ -1,4 +1,4 @@
-"""Add sync_operations table
+"""Add label column to data table
 
 Revision ID: a1b2c3d4e5f6
 Revises: 211ab850ef3d

From f48df27fc85b0df53701f0bf813563dc20494f74 Mon Sep 17 00:00:00 2001
From: hiyan
Date: Thu, 11 Dec 2025 10:32:45 +0530
Subject: [PATCH 5/8] fix(db): url-encode postgres credentials to handle special
 characters

---
 .../databases/relational/create_relational_engine.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cognee/infrastructure/databases/relational/create_relational_engine.py b/cognee/infrastructure/databases/relational/create_relational_engine.py
index deaeaa2da..8813dfcb2 100644
--- a/cognee/infrastructure/databases/relational/create_relational_engine.py
+++ b/cognee/infrastructure/databases/relational/create_relational_engine.py
@@ -1,5 +1,6 @@
 from .sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
 from functools import lru_cache
+from urllib.parse import quote_plus
 
 
 @lru_cache
@@ -43,9 +44,10 @@ def create_relational_engine(
             # Test if asyncpg is available
             import asyncpg
 
-            connection_string = (
-                f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
-            )
+            encoded_username = quote_plus(db_username)
+            encoded_password = quote_plus(db_password)
+
+            connection_string = f"postgresql+asyncpg://{encoded_username}:{encoded_password}@{db_host}:{db_port}/{db_name}"
         except ImportError:
             raise ImportError(
                 "PostgreSQL dependencies are not installed. Please install with 'pip install cognee\"[postgres]\"' or 'pip install cognee\"[postgres-binary]\"' to use PostgreSQL functionality."

From b77961b0f178e985c664e22ccca8e4f40e76f456 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Tue, 16 Dec 2025 20:59:17 +0100
Subject: [PATCH 6/8] fix: Resolve issues with data label PR, add tests and
 upgrade migration

---
 .github/workflows/e2e_tests.yml               | 25 +++++++
 .../a1b2c3d4e5f6_add_label_column_to_data.py  | 23 +++++--
 cognee/api/v1/add/add.py                      |  3 +-
 .../ingestion/save_data_item_to_storage.py    |  5 ++
 cognee/tests/test_custom_data_label.py        | 68 +++++++++++++++++++
 5 files changed, 117 insertions(+), 7 deletions(-)
 create mode 100644 cognee/tests/test_custom_data_label.py

diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml
index 8cd62910c..5f5828da8 100644
--- a/.github/workflows/e2e_tests.yml
+++ b/.github/workflows/e2e_tests.yml
@@ -315,6 +315,31 @@ jobs:
           EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: uv run python ./cognee/tests/test_multi_tenancy.py
 
+  test-data-label:
+    name: Test adding of label for data in Cognee
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run custom data label test
+        env:
+          ENV: 'dev'
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+        run: uv run python ./cognee/tests/test_custom_data_label.py
+
   test-graph-edges:
     name: Test graph edge ingestion
     runs-on: ubuntu-22.04
diff --git a/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py b/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
index 814467954..c127e078b 100644
--- a/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
+++ b/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py
@@ -13,15 +13,26 @@ import sqlalchemy as sa
 
 # revision identifiers, used by Alembic.
 revision: str = "a1b2c3d4e5f6"
-down_revision: Union[str, None] = "211ab850ef3d"
+down_revision: Union[str, None] = "46a6ce2bd2b2"
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
+
+
+def _get_column(inspector, table, name, schema=None):
+    for col in inspector.get_columns(table, schema=schema):
+        if col["name"] == name:
+            return col
+    return None
+
+
 def upgrade() -> None:
-    op.add_column(
-        "data",
-        sa.Column("label", sa.String(), nullable=True)
-    )
+    conn = op.get_bind()
+    insp = sa.inspect(conn)
+
+    label_column = _get_column(insp, "data", "label")
+    if not label_column:
+        op.add_column("data", sa.Column("label", sa.String(), nullable=True))
+
 
 def downgrade() -> None:
-    op.drop_column("data", "label")
\ No newline at end of file
+    op.drop_column("data", "label")
diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py
index 90ea32ae7..3b355f284 100644
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@@ -10,13 +10,14 @@ from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
 )
 from cognee.modules.engine.operations.setup import setup
 from cognee.tasks.ingestion import ingest_data, resolve_data_directories
+from cognee.tasks.ingestion.data_item import DataItem
 from cognee.shared.logging_utils import get_logger
 
 logger = get_logger()
 
 
 async def add(
-    data: Union[BinaryIO, list[BinaryIO], str, list[str]],
+    data: Union[BinaryIO, list[BinaryIO], str, list[str], DataItem, list[DataItem]],
     dataset_name: str = "main_dataset",
     user: User = None,
     node_set: Optional[List[str]] = None,
diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py
index 05d21e617..85eef2736 100644
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -9,6 +9,7 @@ from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 from cognee.tasks.web_scraper.utils import fetch_page_content
+from cognee.tasks.ingestion.data_item import DataItem
 
 logger = get_logger()
 
@@ -95,5 +96,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
         # data is text, save it to data storage and return the file path
         return await save_data_to_file(data_item)
 
+    if isinstance(data_item, DataItem):
+        # If instance is DataItem use the underlying data
+        return await save_data_item_to_storage(data_item.data)
+
     # data is not a supported type
     raise IngestionError(message=f"Data type not supported: {type(data_item)}")
diff --git a/cognee/tests/test_custom_data_label.py b/cognee/tests/test_custom_data_label.py
new file mode 100644
index 000000000..0dab1cbd7
--- /dev/null
+++ b/cognee/tests/test_custom_data_label.py
@@ -0,0 +1,68 @@
+import asyncio
+import cognee
+from cognee.shared.logging_utils import setup_logging, ERROR
+from cognee.api.v1.search import SearchType
+
+
+async def main():
+    # Create a clean slate for cognee -- reset data and system state
+    print("Resetting cognee data...")
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    print("Data reset complete.\n")
+
+    # cognee knowledge graph will be created based on this text
+    text = """
+    Natural language processing (NLP) is an interdisciplinary
+    subfield of computer science and information retrieval.
+    """
+    from cognee.tasks.ingestion.data_item import DataItem
+
+    test_item = DataItem(text, "test_item")
+    # Add the text, and make it available for cognify
+    await cognee.add(test_item)
+
+    # Use LLMs and cognee to create knowledge graph
+    ret_val = await cognee.cognify()
+
+    query_text = "Tell me about NLP"
+    print(f"Searching cognee for insights with query: '{query_text}'")
+    # Query cognee for insights on the added text
+    search_results = await cognee.search(
+        query_type=SearchType.GRAPH_COMPLETION, query_text=query_text
+    )
+
+    print("Search results:")
+    # Display results
+    for result_text in search_results:
+        print(result_text)
+
+    from cognee.modules.data.methods.get_dataset_data import get_dataset_data
+
+    for pipeline in ret_val.values():
+        dataset_id = pipeline.dataset_id
+
+        dataset_data = await get_dataset_data(dataset_id=dataset_id)
+
+        from fastapi.encoders import jsonable_encoder
+
+        data = [
+            dict(
+                **jsonable_encoder(data),
+                dataset_id=dataset_id,
+            )
+            for data in dataset_data
+        ]
+
+        # Check if label is properly added and stored
+        assert data[0]["label"] == "test_item"
+
+
+if __name__ == "__main__":
+    logger = setup_logging(log_level=ERROR)
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        loop.run_until_complete(main())
+    finally:
+        loop.run_until_complete(loop.shutdown_asyncgens())

From cc872fc8de506e8ff0dc635fd5043db6a6f74fac Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Tue, 16 Dec 2025 21:04:15 +0100
Subject: [PATCH 7/8] refactor: format PR

---
 cognee/tasks/ingestion/data_item.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cognee/tasks/ingestion/data_item.py b/cognee/tasks/ingestion/data_item.py
index 23285d677..da213ed1c 100644
--- a/cognee/tasks/ingestion/data_item.py
+++ b/cognee/tasks/ingestion/data_item.py
@@ -1,8 +1,8 @@
 from dataclasses import dataclass
 from typing import Any, Optional
 
+
 @dataclass
 class DataItem:
     data: Any
     label: Optional[str] = None
-

From 6e5e79f434a755b0692bb6956b571128e9d9db4c Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Wed, 17 Dec 2025 21:07:23 +0100
Subject: [PATCH 8/8] fix: Resolve connection issue with postgres when special
 characters are present

---
 .../relational/create_relational_engine.py   | 14 ++++++++++----
 .../databases/vector/create_vector_engine.py | 11 +++++++++--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/cognee/infrastructure/databases/relational/create_relational_engine.py b/cognee/infrastructure/databases/relational/create_relational_engine.py
index 8813dfcb2..ea2b35c75 100644
--- a/cognee/infrastructure/databases/relational/create_relational_engine.py
+++ b/cognee/infrastructure/databases/relational/create_relational_engine.py
@@ -1,6 +1,6 @@
+from sqlalchemy import URL
 from .sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
 from functools import lru_cache
-from urllib.parse import quote_plus
 
 
 @lru_cache
@@ -44,10 +44,16 @@ def create_relational_engine(
             # Test if asyncpg is available
             import asyncpg
 
-            encoded_username = quote_plus(db_username)
-            encoded_password = quote_plus(db_password)
+            # Handle special characters in username and password like # or @
+            connection_string = URL.create(
+                "postgresql+asyncpg",
+                username=db_username,
+                password=db_password,
+                host=db_host,
+                port=int(db_port),
+                database=db_name,
+            )
 
-            connection_string = f"postgresql+asyncpg://{encoded_username}:{encoded_password}@{db_host}:{db_port}/{db_name}"
         except ImportError:
             raise ImportError(
                 "PostgreSQL dependencies are not installed. Please install with 'pip install cognee\"[postgres]\"' or 'pip install cognee\"[postgres-binary]\"' to use PostgreSQL functionality."
diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py
index d1cf855d7..47a2e2582 100644
--- a/cognee/infrastructure/databases/vector/create_vector_engine.py
+++ b/cognee/infrastructure/databases/vector/create_vector_engine.py
@@ -1,3 +1,5 @@
+from sqlalchemy import URL
+
 from .supported_databases import supported_databases
 from .embeddings import get_embedding_engine
 
@@ -61,8 +63,13 @@ def create_vector_engine(
         if not (db_host and db_port and db_name and db_username and db_password):
             raise EnvironmentError("Missing requred pgvector credentials!")
 
-        connection_string: str = (
-            f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
+        connection_string = URL.create(
+            "postgresql+asyncpg",
+            username=db_username,
+            password=db_password,
+            host=db_host,
+            port=int(db_port),
+            database=db_name,
+        )
 
         try:
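A note on the connection-string change in patches 5 and 8: SQLAlchemy's URL.create builds the URL from separate components, so credentials containing reserved characters such as @, #, or : no longer need the manual quote_plus escaping that patch 5 introduced. The following is a minimal standalone sketch of that behavior, not part of the patch series; the credential values are made up.

from sqlalchemy import URL
from sqlalchemy.ext.asyncio import create_async_engine

# Hypothetical credentials containing characters that would break a hand-built URL.
url = URL.create(
    "postgresql+asyncpg",
    username="cognee_user",
    password="p@ss:w#rd",
    host="localhost",
    port=5432,
    database="cognee_db",
)

# Reserved characters in the username and password are percent-encoded
# when the URL is rendered, and the URL object can be passed directly
# to the engine factory instead of a formatted string.
print(url.render_as_string(hide_password=False))
engine = create_async_engine(url)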
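To summarize the feature the rest of the series adds: after these patches, cognee.add accepts a DataItem wrapper alongside plain strings and file handles, the label is stored on the Data row by ingest_data, and it is exposed through the datasets API as DataDTO.label. A minimal usage sketch modeled on the test added in patch 6; the text and label values are illustrative.

import asyncio

import cognee
from cognee.tasks.ingestion.data_item import DataItem


async def main():
    # Wrap raw text with a custom label; unlabeled inputs keep working unchanged.
    item = DataItem(
        data="Natural language processing (NLP) is a subfield of computer science.",
        label="nlp_intro",
    )

    # The label travels through ingest_data onto the Data row,
    # so it comes back as "label" when dataset data is listed.
    await cognee.add(item, dataset_name="main_dataset")
    await cognee.cognify()


asyncio.run(main())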