From 813ee9483691db21efce1cddf88107f7f36b1b88 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Mon, 27 Oct 2025 08:12:37 +0100
Subject: [PATCH 01/54] Initial commit, still wip

---
 cognee/context_global_variables.py            | 27 ++++++++++++++-----
 .../databases/graph/get_graph_engine.py       |  1 +
 .../utils/get_or_create_dataset_database.py   | 21 ++++++++++++++-
 .../databases/vector/create_vector_engine.py  |  2 ++
 .../modules/users/models/DatasetDatabase.py   |  7 +++++
 5 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py
index d52de4b4e..9a4f49763 100644
--- a/cognee/context_global_variables.py
+++ b/cognee/context_global_variables.py
@@ -57,19 +57,34 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_

     # Set vector and graph database configuration based on dataset database information
     vector_config = {
-        "vector_db_url": os.path.join(
-            databases_directory_path, dataset_database.vector_database_name
-        ),
-        "vector_db_key": "",
-        "vector_db_provider": "lancedb",
+        "vector_db_provider": dataset_database.vector_database_provider,
+        "vector_db_url": dataset_database.vector_database_url,
+        # TODO: Maybe add key to dataset_database, and put it here??
+        "vector_db_key": ""
     }

+    # vector_config = {
+    #     "vector_db_url": os.path.join(
+    #         databases_directory_path, dataset_database.vector_database_name
+    #     ),
+    #     "vector_db_key": "",
+    #     "vector_db_provider": "lancedb",
+    # }
+
     graph_config = {
-        "graph_database_provider": "kuzu",
+        "graph_database_provider": dataset_database.graph_database_provider,
+        "graph_database_url": dataset_database.graph_database_url,
+        "graph_database_name": dataset_database.graph_database_name,
         "graph_file_path": os.path.join(
             databases_directory_path, dataset_database.graph_database_name
         ),
     }
+    # graph_config = {
+    #     "graph_database_provider": "kuzu",
+    #     "graph_file_path": os.path.join(
+    #         databases_directory_path, dataset_database.graph_database_name
+    #     ),
+    # }

     storage_config = {
         "data_root_directory": data_root_directory,

diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py
index 1ea61d29f..217f63070 100644
--- a/cognee/infrastructure/databases/graph/get_graph_engine.py
+++ b/cognee/infrastructure/databases/graph/get_graph_engine.py
@@ -69,6 +69,7 @@ def create_graph_engine(
             graph_database_url=graph_database_url,
             graph_database_username=graph_database_username,
             graph_database_password=graph_database_password,
+            graph_name=graph_database_name,
         )

     if graph_database_provider == "neo4j":

diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
index 29156025d..2b9b00569 100644
--- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
+++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
@@ -6,11 +6,20 @@ from sqlalchemy.exc import IntegrityError

 from cognee.modules.data.methods import create_dataset
 from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.infrastructure.databases.vector import get_vectordb_config
+from cognee.infrastructure.databases.graph.config import get_graph_config
 from cognee.modules.data.methods import get_unique_dataset_id
 from cognee.modules.users.models import DatasetDatabase
 from cognee.modules.users.models import User

+
+# TODO: Find a better place to define these
+default_vector_db_name = "lance.db"
+default_vector_db_provider = "lancedb"
+default_graph_db_provider = "kuzu"
+default_vector_db_url = None
+default_graph_db_url = None
+
 async def get_or_create_dataset_database(
     dataset: Union[str, UUID],
     user: User,
@@ -32,9 +41,12 @@
     dataset_id = await get_unique_dataset_id(dataset, user)

-    vector_db_name = f"{dataset_id}.lance.db"
+    vector_db_name = f"{dataset_id}.db"
     graph_db_name = f"{dataset_id}.pkl"

+    vector_config = get_vectordb_config()
+    graph_config = get_graph_config()
+
     async with db_engine.get_async_session() as session:
         # Create dataset if it doesn't exist
         if isinstance(dataset, str):
@@ -49,12 +61,19 @@
         if existing:
             return existing

+        # TODO: Set the vector and graph database stuff (name, provider, etc.) based on the whether or
+        # TODO: not we support multi user for that db. If not, set to default, which is lance and/or kuzu.
+
         # If there are no existing rows build a new row
         record = DatasetDatabase(
             owner_id=user.id,
             dataset_id=dataset_id,
             vector_database_name=vector_db_name,
             graph_database_name=graph_db_name,
+            vector_database_provider=vector_config.vector_db_provider,
+            graph_database_provider=graph_config.graph_database_provider,
+            vector_database_url=vector_config.vector_db_url,
+            graph_database_url=graph_config.graph_database_url,
         )

         try:

diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py
index 639bbb9f6..7e3fb367f 100644
--- a/cognee/infrastructure/databases/vector/create_vector_engine.py
+++ b/cognee/infrastructure/databases/vector/create_vector_engine.py
@@ -1,5 +1,6 @@
 from .supported_databases import supported_databases
 from .embeddings import get_embedding_engine
+from cognee.infrastructure.databases.graph.config import get_graph_config

 from functools import lru_cache

@@ -45,6 +46,7 @@ def create_vector_engine(
             url=vector_db_url,
             api_key=vector_db_key,
             embedding_engine=embedding_engine,
+            graph_name=get_graph_config().graph_database_name
         )

     if vector_db_provider == "pgvector":

diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py
index 0d71d8413..3d3899f4c 100644
--- a/cognee/modules/users/models/DatasetDatabase.py
+++ b/cognee/modules/users/models/DatasetDatabase.py
@@ -12,8 +12,15 @@ class DatasetDatabase(Base):
         UUID, ForeignKey("datasets.id", ondelete="CASCADE"), primary_key=True, index=True
     )

+    # TODO: Why is this unique? Isn't it fact that two or more datasets can have the same vector and graph store?
     vector_database_name = Column(String, unique=True, nullable=False)
     graph_database_name = Column(String, unique=True, nullable=False)
+    vector_database_provider = Column(String, unique=True, nullable=False)
+    graph_database_provider = Column(String, unique=True, nullable=False)
+
+    vector_database_url = Column(String, unique=True, nullable=True)
+    graph_database_url = Column(String, unique=True, nullable=True)
+
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

From bbcd8baf3a0b0b6ddd6cac94e12977c301ab0cd5 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Tue, 28 Oct 2025 17:56:32 +0100
Subject: [PATCH 02/54] feature: add multi-user for Falkor db

---
 .../infrastructure/databases/graph/config.py  |  4 +++
 .../databases/graph/get_graph_engine.py       |  1 +
 .../utils/get_or_create_dataset_database.py   | 35 +++++++++++++++----
 .../databases/vector/create_vector_engine.py  |  4 +--
 .../modules/users/models/DatasetDatabase.py   | 12 ++++---
 5 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/cognee/infrastructure/databases/graph/config.py b/cognee/infrastructure/databases/graph/config.py
index b7907313c..23687b359 100644
--- a/cognee/infrastructure/databases/graph/config.py
+++ b/cognee/infrastructure/databases/graph/config.py
@@ -26,6 +26,7 @@ class GraphConfig(BaseSettings):
     - graph_database_username
     - graph_database_password
     - graph_database_port
+    - graph_database_key
     - graph_file_path
     - graph_model
     - graph_topology
@@ -41,6 +42,7 @@ class GraphConfig(BaseSettings):
     graph_database_username: str = ""
     graph_database_password: str = ""
     graph_database_port: int = 123
+    graph_database_key: str = ""
     graph_file_path: str = ""
     graph_filename: str = ""
     graph_model: object = KnowledgeGraph
@@ -90,6 +92,7 @@ class GraphConfig(BaseSettings):
             "graph_database_username": self.graph_database_username,
             "graph_database_password": self.graph_database_password,
             "graph_database_port": self.graph_database_port,
+            "graph_database_key": self.graph_database_key,
             "graph_file_path": self.graph_file_path,
             "graph_model": self.graph_model,
             "graph_topology": self.graph_topology,
@@ -116,6 +119,7 @@ class GraphConfig(BaseSettings):
             "graph_database_username": self.graph_database_username,
             "graph_database_password": self.graph_database_password,
             "graph_database_port": self.graph_database_port,
+            "graph_database_key": self.graph_database_key,
             "graph_file_path": self.graph_file_path,
         }

diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py
index 217f63070..70c27aab3 100644
--- a/cognee/infrastructure/databases/graph/get_graph_engine.py
+++ b/cognee/infrastructure/databases/graph/get_graph_engine.py
@@ -33,6 +33,7 @@ def create_graph_engine(
     graph_database_username="",
     graph_database_password="",
     graph_database_port="",
+    graph_database_key="",
 ):
     """
     Create a graph engine based on the specified provider type.
diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
index 2b9b00569..0af94fd3a 100644
--- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
+++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
@@ -14,11 +14,14 @@ from cognee.modules.users.models import User

 # TODO: Find a better place to define these
-default_vector_db_name = "lance.db"
 default_vector_db_provider = "lancedb"
 default_graph_db_provider = "kuzu"
 default_vector_db_url = None
 default_graph_db_url = None
+default_vector_db_key = None
+default_graph_db_key = None
+vector_dbs_with_multi_user_support = ["lancedb", "falkor"]
+graph_dbs_with_multi_user_support = ["kuzu", "falkor"]

 async def get_or_create_dataset_database(
     dataset: Union[str, UUID],
     user: User,
@@ -61,8 +64,24 @@ async def get_or_create_dataset_database(
         if existing:
             return existing

-        # TODO: Set the vector and graph database stuff (name, provider, etc.) based on the whether or
-        # TODO: not we support multi user for that db. If not, set to default, which is lance and/or kuzu.
+        # Check if we support multi-user for this provider. If not, use default
+        if graph_config.graph_database_provider in graph_dbs_with_multi_user_support:
+            graph_provider = graph_config.graph_database_provider
+            graph_url = graph_config.graph_database_url
+            graph_key = graph_config.graph_database_key
+        else:
+            graph_provider = default_graph_db_provider
+            graph_url = default_graph_db_url
+            graph_key = default_graph_db_key
+
+        if vector_config.vector_db_provider in vector_dbs_with_multi_user_support:
+            vector_provider = vector_config.vector_db_provider
+            vector_url = vector_config.vector_db_url
+            vector_key = vector_config.vector_db_key
+        else:
+            vector_provider = default_vector_db_provider
+            vector_url = default_vector_db_url
+            vector_key = default_vector_db_key

         # If there are no existing rows build a new row
         record = DatasetDatabase(
@@ -70,10 +89,12 @@ async def get_or_create_dataset_database(
             dataset_id=dataset_id,
             vector_database_name=vector_db_name,
             graph_database_name=graph_db_name,
-            vector_database_provider=vector_config.vector_db_provider,
-            graph_database_provider=graph_config.graph_database_provider,
-            vector_database_url=vector_config.vector_db_url,
-            graph_database_url=graph_config.graph_database_url,
+            vector_database_provider=vector_provider,
+            graph_database_provider=graph_provider,
+            vector_database_url=vector_url,
+            graph_database_url=graph_url,
+            vector_database_key=vector_key,
+            graph_database_key=graph_key,
         )

         try:

diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py
index 7e3fb367f..35bbc110a 100644
--- a/cognee/infrastructure/databases/vector/create_vector_engine.py
+++ b/cognee/infrastructure/databases/vector/create_vector_engine.py
@@ -1,6 +1,6 @@
 from .supported_databases import supported_databases
 from .embeddings import get_embedding_engine
-from cognee.infrastructure.databases.graph.config import get_graph_config
+from cognee.infrastructure.databases.graph.config import get_graph_context_config

 from functools import lru_cache

@@ -46,7 +46,7 @@ def create_vector_engine(
             url=vector_db_url,
             api_key=vector_db_key,
             embedding_engine=embedding_engine,
-            graph_name=get_graph_config().graph_database_name
+            graph_name=get_graph_context_config()["graph_database_name"],
         )

     if vector_db_provider == "pgvector":

diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py
index 3d3899f4c..25d610ab9 100644
--- a/cognee/modules/users/models/DatasetDatabase.py
+++ b/cognee/modules/users/models/DatasetDatabase.py
@@ -12,15 +12,17 @@ class DatasetDatabase(Base):
         UUID, ForeignKey("datasets.id", ondelete="CASCADE"), primary_key=True, index=True
     )

-    # TODO: Why is this unique? Isn't it fact that two or more datasets can have the same vector and graph store?
     vector_database_name = Column(String, unique=True, nullable=False)
     graph_database_name = Column(String, unique=True, nullable=False)

-    vector_database_provider = Column(String, unique=True, nullable=False)
-    graph_database_provider = Column(String, unique=True, nullable=False)
+    vector_database_provider = Column(String, unique=False, nullable=False)
+    graph_database_provider = Column(String, unique=False, nullable=False)

-    vector_database_url = Column(String, unique=True, nullable=True)
-    graph_database_url = Column(String, unique=True, nullable=True)
+    vector_database_url = Column(String, unique=False, nullable=True)
+    graph_database_url = Column(String, unique=False, nullable=True)
+
+    vector_database_key = Column(String, unique=False, nullable=True)
+    graph_database_key = Column(String, unique=False, nullable=True)

     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

From 9c9395851c4084b0240ad328cab077e04c4bdcce Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Tue, 28 Oct 2025 18:01:32 +0100
Subject: [PATCH 03/54] chore: ruff formatting

---
 cognee/context_global_variables.py                     | 2 +-
 .../databases/utils/get_or_create_dataset_database.py  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py
index 9a4f49763..ee2e37030 100644
--- a/cognee/context_global_variables.py
+++ b/cognee/context_global_variables.py
@@ -60,7 +60,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_
         "vector_db_provider": dataset_database.vector_database_provider,
         "vector_db_url": dataset_database.vector_database_url,
         # TODO: Maybe add key to dataset_database, and put it here??
- "vector_db_key": "" + "vector_db_key": "", } # vector_config = { diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 0af94fd3a..1552a7bbc 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -23,6 +23,7 @@ default_graph_db_key = None vector_dbs_with_multi_user_support = ["lancedb", "falkor"] graph_dbs_with_multi_user_support = ["kuzu", "falkor"] + async def get_or_create_dataset_database( dataset: Union[str, UUID], user: User, From c3f0cb95da2d61dd9523079c1977a67dd031c7c6 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Tue, 28 Oct 2025 18:06:04 +0100 Subject: [PATCH 04/54] fix: delete unnecessary comments, add to config --- cognee/context_global_variables.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index ee2e37030..09a351f15 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -59,32 +59,19 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ vector_config = { "vector_db_provider": dataset_database.vector_database_provider, "vector_db_url": dataset_database.vector_database_url, - # TODO: Maybe add key to dataset_database, and put it here?? - "vector_db_key": "", + "vector_db_key": dataset_database.vector_database_key, + "vector_db_name": dataset_database.vector_database_name, } - # vector_config = { - # "vector_db_url": os.path.join( - # databases_directory_path, dataset_database.vector_database_name - # ), - # "vector_db_key": "", - # "vector_db_provider": "lancedb", - # } - graph_config = { "graph_database_provider": dataset_database.graph_database_provider, "graph_database_url": dataset_database.graph_database_url, "graph_database_name": dataset_database.graph_database_name, + "graph_database_key": dataset_database.graph_database_key, "graph_file_path": os.path.join( databases_directory_path, dataset_database.graph_database_name ), } - # graph_config = { - # "graph_database_provider": "kuzu", - # "graph_file_path": os.path.join( - # databases_directory_path, dataset_database.graph_database_name - # ), - # } storage_config = { "data_root_directory": data_root_directory, From 70f3ced15af7f4f778de94769fa5afa6405d1772 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 29 Oct 2025 16:30:13 +0100 Subject: [PATCH 05/54] fix: PR comment fixes --- .../databases/graph/get_graph_engine.py | 2 +- .../databases/utils/constants.py | 4 ++ .../utils/get_or_create_dataset_database.py | 70 +++++++++---------- .../infrastructure/databases/vector/config.py | 3 + .../databases/vector/create_vector_engine.py | 4 +- 5 files changed, 44 insertions(+), 39 deletions(-) create mode 100644 cognee/infrastructure/databases/utils/constants.py diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py index 70c27aab3..82e3cad6e 100644 --- a/cognee/infrastructure/databases/graph/get_graph_engine.py +++ b/cognee/infrastructure/databases/graph/get_graph_engine.py @@ -70,7 +70,7 @@ def create_graph_engine( graph_database_url=graph_database_url, graph_database_username=graph_database_username, graph_database_password=graph_database_password, - graph_name=graph_database_name, + database_name=graph_database_name, ) if 
graph_database_provider == "neo4j": diff --git a/cognee/infrastructure/databases/utils/constants.py b/cognee/infrastructure/databases/utils/constants.py new file mode 100644 index 000000000..fe6390a07 --- /dev/null +++ b/cognee/infrastructure/databases/utils/constants.py @@ -0,0 +1,4 @@ +VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"] +GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor"] + +HYBRID_DBS = ["falkor"] diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 1552a7bbc..deea46541 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -11,17 +11,11 @@ from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.modules.data.methods import get_unique_dataset_id from cognee.modules.users.models import DatasetDatabase from cognee.modules.users.models import User - - -# TODO: Find a better place to define these -default_vector_db_provider = "lancedb" -default_graph_db_provider = "kuzu" -default_vector_db_url = None -default_graph_db_url = None -default_vector_db_key = None -default_graph_db_key = None -vector_dbs_with_multi_user_support = ["lancedb", "falkor"] -graph_dbs_with_multi_user_support = ["kuzu", "falkor"] +from .constants import ( + GRAPH_DBS_WITH_MULTI_USER_SUPPORT, + VECTOR_DBS_WITH_MULTI_USER_SUPPORT, + HYBRID_DBS, +) async def get_or_create_dataset_database( @@ -45,12 +39,19 @@ async def get_or_create_dataset_database( dataset_id = await get_unique_dataset_id(dataset, user) - vector_db_name = f"{dataset_id}.db" - graph_db_name = f"{dataset_id}.pkl" - vector_config = get_vectordb_config() graph_config = get_graph_config() + graph_db_name = f"{dataset_id}.pkl" + + if graph_config.graph_database_provider in HYBRID_DBS: + vector_db_name = graph_db_name + else: + if vector_config.vector_database_provider == "lancedb": + vector_db_name = f"{dataset_id}.lance.db" + else: + vector_db_name = f"{dataset_id}.db" + async with db_engine.get_async_session() as session: # Create dataset if it doesn't exist if isinstance(dataset, str): @@ -66,23 +67,18 @@ async def get_or_create_dataset_database( return existing # Check if we support multi-user for this provider. If not, use default - if graph_config.graph_database_provider in graph_dbs_with_multi_user_support: - graph_provider = graph_config.graph_database_provider - graph_url = graph_config.graph_database_url - graph_key = graph_config.graph_database_key - else: - graph_provider = default_graph_db_provider - graph_url = default_graph_db_url - graph_key = default_graph_db_key - - if vector_config.vector_db_provider in vector_dbs_with_multi_user_support: - vector_provider = vector_config.vector_db_provider - vector_url = vector_config.vector_db_url - vector_key = vector_config.vector_db_key - else: - vector_provider = default_vector_db_provider - vector_url = default_vector_db_url - vector_key = default_vector_db_key + if graph_config.graph_database_provider not in GRAPH_DBS_WITH_MULTI_USER_SUPPORT: + raise EnvironmentError( + f"Multi-user is currently not supported for the graph database provider: {graph_config.graph_database_provider}. " + f"Supported providers are: {', '.join(GRAPH_DBS_WITH_MULTI_USER_SUPPORT)}. 
Either use one of these" + f"providers, or disable BACKEND_ACCESS_CONTROL" + ) + if vector_config.vector_db_provider not in VECTOR_DBS_WITH_MULTI_USER_SUPPORT: + raise EnvironmentError( + f"Multi-user is currently not supported for the vector database provider: {vector_config.vector_db_provider}. " + f"Supported providers are: {', '.join(VECTOR_DBS_WITH_MULTI_USER_SUPPORT)}. Either use one of these" + f"providers, or disable BACKEND_ACCESS_CONTROL" + ) # If there are no existing rows build a new row record = DatasetDatabase( @@ -90,12 +86,12 @@ async def get_or_create_dataset_database( dataset_id=dataset_id, vector_database_name=vector_db_name, graph_database_name=graph_db_name, - vector_database_provider=vector_provider, - graph_database_provider=graph_provider, - vector_database_url=vector_url, - graph_database_url=graph_url, - vector_database_key=vector_key, - graph_database_key=graph_key, + vector_database_provider=vector_config.vector_db_provider, + graph_database_provider=graph_config.graph_database_provider, + vector_database_url=vector_config.vector_db_url, + graph_database_url=graph_config.graph_database_url, + vector_database_key=vector_config.vector_db_key, + graph_database_key=graph_config.graph_database_key, ) try: diff --git a/cognee/infrastructure/databases/vector/config.py b/cognee/infrastructure/databases/vector/config.py index b6d3ae644..7d28f1668 100644 --- a/cognee/infrastructure/databases/vector/config.py +++ b/cognee/infrastructure/databases/vector/config.py @@ -18,12 +18,14 @@ class VectorConfig(BaseSettings): Instance variables: - vector_db_url: The URL of the vector database. - vector_db_port: The port for the vector database. + - vector_db_name: The name of the vector database. - vector_db_key: The key for accessing the vector database. - vector_db_provider: The provider for the vector database. """ vector_db_url: str = "" vector_db_port: int = 1234 + vector_db_name: str = "" vector_db_key: str = "" vector_db_provider: str = "lancedb" @@ -58,6 +60,7 @@ class VectorConfig(BaseSettings): return { "vector_db_url": self.vector_db_url, "vector_db_port": self.vector_db_port, + "vector_db_name": self.vector_db_name, "vector_db_key": self.vector_db_key, "vector_db_provider": self.vector_db_provider, } diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py index 35bbc110a..3fe926978 100644 --- a/cognee/infrastructure/databases/vector/create_vector_engine.py +++ b/cognee/infrastructure/databases/vector/create_vector_engine.py @@ -9,6 +9,7 @@ from functools import lru_cache def create_vector_engine( vector_db_provider: str, vector_db_url: str, + vector_db_name: str, vector_db_port: str = "", vector_db_key: str = "", ): @@ -28,6 +29,7 @@ def create_vector_engine( - vector_db_url (str): The URL for the vector database instance. - vector_db_port (str): The port for the vector database instance. Required for some providers. + - vector_db_name (str): The name of the vector database instance. - vector_db_key (str): The API key or access token for the vector database instance. - vector_db_provider (str): The name of the vector database provider to use (e.g., 'pgvector'). 
@@ -46,7 +48,7 @@ def create_vector_engine(
             url=vector_db_url,
             api_key=vector_db_key,
             embedding_engine=embedding_engine,
-            graph_name=get_graph_context_config()["graph_database_name"],
+            database_name=vector_db_name,
         )

     if vector_db_provider == "pgvector":

From 908d32912766331476e0159f96950a30bfda1ef0 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Thu, 30 Oct 2025 15:15:41 +0100
Subject: [PATCH 06/54] feat: add alembic migrations

---
 ..._expand_dataset_database_for_multi_user.py | 122 ++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py

diff --git a/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py
new file mode 100644
index 000000000..cd19d09c8
--- /dev/null
+++ b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py
@@ -0,0 +1,122 @@
+"""Expand dataset database for multi user
+
+Revision ID: 76625596c5c3
+Revises: 211ab850ef3d
+Create Date: 2025-10-30 12:55:20.239562
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "76625596c5c3"
+down_revision: Union[str, None] = "211ab850ef3d"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _get_column(inspector, table, name, schema=None):
+    for col in inspector.get_columns(table, schema=schema):
+        if col["name"] == name:
+            return col
+    return None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    insp = sa.inspect(conn)
+
+    data = sa.table(
+        "dataset_database",
+        sa.Column("dataset_id", sa.UUID, primary_key=True, index=True),  # Critical for SQLite
+        sa.Column("owner_id", sa.UUID, index=True),
+        sa.Column("vector_database_name", sa.String(), unique=True, nullable=False),
+        sa.Column("graph_database_name", sa.String(), unique=True, nullable=False),
+        sa.Column("vector_database_provider", sa.String(), unique=False, nullable=False),
+        sa.Column("graph_database_provider", sa.String(), unique=False, nullable=False),
+        sa.Column("vector_database_url", sa.String(), unique=False, nullable=True),
+        sa.Column("graph_database_url", sa.String(), unique=False, nullable=True),
+        sa.Column("vector_database_key", sa.String(), unique=False, nullable=True),
+        sa.Column("graph_database_key", sa.String(), unique=False, nullable=True),
+        sa.Column("created_at", sa.DateTime(timezone=True)),
+        sa.Column("updated_at", sa.DateTime(timezone=True)),
+    )
+
+    vector_database_provider_column = _get_column(
+        insp, "dataset_database", "vector_database_provider"
+    )
+    if not vector_database_provider_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("vector_database_provider", sa.String(), unique=False, nullable=False),
+        )
+        if op.get_context().dialect.name == "sqlite":
+            with op.batch_alter_table("dataset_database") as batch_op:
+                batch_op.execute(
+                    data.update().values(
+                        vector_database_provider="lancedb",
+                    )
+                )
+        else:
+            conn = op.get_bind()
+            conn.execute(data.update().values(vector_database_provider="lancedb"))
+
+    graph_database_provider_column = _get_column(
+        insp, "dataset_database", "graph_database_provider"
+    )
+    if not graph_database_provider_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("graph_database_provider", sa.String(), unique=False, nullable=False),
+        )
+        if op.get_context().dialect.name == "sqlite":
+            with op.batch_alter_table("dataset_database") as batch_op:
+                batch_op.execute(
+                    data.update().values(
+                        graph_database_provider="kuzu",
+                    )
+                )
+        else:
+            conn = op.get_bind()
+            conn.execute(data.update().values(graph_database_provider="kuzu"))
+
+    vector_database_url_column = _get_column(insp, "dataset_database", "vector_database_url")
+    if not vector_database_url_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("vector_database_url", sa.String(), unique=False, nullable=True),
+        )
+
+    graph_database_url_column = _get_column(insp, "dataset_database", "graph_database_url")
+    if not graph_database_url_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("graph_database_url", sa.String(), unique=False, nullable=True),
+        )
+
+    vector_database_key_column = _get_column(insp, "dataset_database", "vector_database_key")
+    if not vector_database_key_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("vector_database_key", sa.String(), unique=False, nullable=True),
+        )
+
+    graph_database_key_column = _get_column(insp, "dataset_database", "graph_database_key")
+    if not graph_database_key_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("graph_database_key", sa.String(), unique=False, nullable=True),
+        )
+
+
+def downgrade() -> None:
+    op.drop_column("dataset_database", "vector_database_provider")
+    op.drop_column("dataset_database", "graph_database_provider")
+    op.drop_column("dataset_database", "vector_database_url")
+    op.drop_column("dataset_database", "graph_database_url")
+    op.drop_column("dataset_database", "vector_database_key")
+    op.drop_column("dataset_database", "graph_database_key")

From ce925615fe843dac238aca74c4b81615fa6beb65 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Thu, 30 Oct 2025 15:32:27 +0100
Subject: [PATCH 07/54] fix: fix small naming error

---
 .../databases/utils/get_or_create_dataset_database.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
index deea46541..a4e50f665 100644
--- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
+++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
@@ -47,7 +47,7 @@ async def get_or_create_dataset_database(
     if graph_config.graph_database_provider in HYBRID_DBS:
         vector_db_name = graph_db_name
     else:
-        if vector_config.vector_database_provider == "lancedb":
+        if vector_config.vector_db_provider == "lancedb":
             vector_db_name = f"{dataset_id}.lance.db"
         else:
             vector_db_name = f"{dataset_id}.db"

From 28f28f06dd34eec88c0443d780010b451e389674 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Thu, 30 Oct 2025 16:04:33 +0100
Subject: [PATCH 08/54] fix: added vector db name to test configs

---
 cognee/tests/test_parallel_databases.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cognee/tests/test_parallel_databases.py b/cognee/tests/test_parallel_databases.py
index 9a590921a..51eb7d3cf 100755
--- a/cognee/tests/test_parallel_databases.py
+++ b/cognee/tests/test_parallel_databases.py
@@ -33,11 +33,13 @@ async def main():
         "vector_db_url": "cognee1.test",
         "vector_db_key": "",
         "vector_db_provider": "lancedb",
+        "vector_db_name": ""
     }
     task_2_config = {
         "vector_db_url": "cognee2.test",
         "vector_db_key": "",
         "vector_db_provider": "lancedb",
+        "vector_db_name": ""
     }

     task_1_graph_config = {

From 4b0b9bfc539ef448779eb52ce546444dc7780149 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Thu, 30 Oct 2025 16:06:15 +0100
Subject: [PATCH 09/54] chore: ruff format

---
 cognee/tests/test_parallel_databases.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cognee/tests/test_parallel_databases.py b/cognee/tests/test_parallel_databases.py
index 51eb7d3cf..3164206ed 100755
--- a/cognee/tests/test_parallel_databases.py
+++ b/cognee/tests/test_parallel_databases.py
@@ -33,13 +33,13 @@ async def main():
         "vector_db_url": "cognee1.test",
         "vector_db_key": "",
         "vector_db_provider": "lancedb",
-        "vector_db_name": ""
+        "vector_db_name": "",
     }
     task_2_config = {
         "vector_db_url": "cognee2.test",
         "vector_db_key": "",
         "vector_db_provider": "lancedb",
-        "vector_db_name": ""
+        "vector_db_name": "",
     }

     task_1_graph_config = {

From ed2d6871356ea94a5d95f9fcbeefc9c2ecd67348 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Tue, 11 Nov 2025 13:52:34 +0100
Subject: [PATCH 10/54] fix: changes based on PR comments

---
 cognee/context_global_variables.py            |  8 ++---
 .../databases/utils/constants.py              |  4 ---
 .../utils/get_or_create_dataset_database.py   | 36 +++++--------------
 3 files changed, 12 insertions(+), 36 deletions(-)
 delete mode 100644 cognee/infrastructure/databases/utils/constants.py

diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py
index 6ec467ed9..2d711a8b2 100644
--- a/cognee/context_global_variables.py
+++ b/cognee/context_global_variables.py
@@ -16,8 +16,8 @@
 vector_db_config = ContextVar("vector_db_config", default=None)
 graph_db_config = ContextVar("graph_db_config", default=None)
 session_user = ContextVar("session_user", default=None)

-vector_dbs_with_multi_user_support = ["lancedb"]
-graph_dbs_with_multi_user_support = ["kuzu"]
+VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb"]
+GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu"]


 async def set_session_user_context_variable(user):
@@ -28,8 +28,8 @@ def multi_user_support_possible():
     graph_db_config = get_graph_context_config()
     vector_db_config = get_vectordb_context_config()
     return (
-        graph_db_config["graph_database_provider"] in graph_dbs_with_multi_user_support
-        and vector_db_config["vector_db_provider"] in vector_dbs_with_multi_user_support
+        graph_db_config["graph_database_provider"] in GRAPH_DBS_WITH_MULTI_USER_SUPPORT
+        and vector_db_config["vector_db_provider"] in VECTOR_DBS_WITH_MULTI_USER_SUPPORT
     )

diff --git a/cognee/infrastructure/databases/utils/constants.py b/cognee/infrastructure/databases/utils/constants.py
deleted file mode 100644
index fe6390a07..000000000
--- a/cognee/infrastructure/databases/utils/constants.py
+++ /dev/null
@@ -1,4 +0,0 @@
-VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"]
-GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor"]
-
-HYBRID_DBS = ["falkor"]

diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
index a4e50f665..61d7840c0 100644
--- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
+++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
@@ -11,12 +11,6 @@ from cognee.infrastructure.databases.graph.config import get_graph_config
 from cognee.modules.data.methods import get_unique_dataset_id
 from cognee.modules.users.models import DatasetDatabase
 from cognee.modules.users.models import User
-from .constants import (
-    GRAPH_DBS_WITH_MULTI_USER_SUPPORT,
-    VECTOR_DBS_WITH_MULTI_USER_SUPPORT,
-    HYBRID_DBS,
-)
-
 async def get_or_create_dataset_database(
     dataset: Union[str, UUID],
     user: User,
@@ -45,12 +36,15 @@ async def get_or_create_dataset_database(
     dataset_id = await get_unique_dataset_id(dataset, user)

     vector_config = get_vectordb_config()
     graph_config = get_graph_config()

-    graph_db_name = f"{dataset_id}.pkl"
-
-    if graph_config.graph_database_provider in HYBRID_DBS:
-        vector_db_name = graph_db_name
+    if graph_config.graph_database_provider == "kuzu":
+        graph_db_name = f"{dataset_id}.pkl"
     else:
-        if vector_config.vector_db_provider == "lancedb":
-            vector_db_name = f"{dataset_id}.lance.db"
-        else:
-            vector_db_name = f"{dataset_id}.db"
+        graph_db_name = dataset_id
+
+    if vector_config.vector_db_provider == "lancedb":
+        vector_db_name = f"{dataset_id}.lance.db"
+    else:
+        vector_db_name = dataset_id

     async with db_engine.get_async_session() as session:
         # Create dataset if it doesn't exist
         if isinstance(dataset, str):
@@ -66,20 +60,6 @@ async def get_or_create_dataset_database(
             return existing

-        # Check if we support multi-user for this provider. If not, use default
-        if graph_config.graph_database_provider not in GRAPH_DBS_WITH_MULTI_USER_SUPPORT:
-            raise EnvironmentError(
-                f"Multi-user is currently not supported for the graph database provider: {graph_config.graph_database_provider}. "
-                f"Supported providers are: {', '.join(GRAPH_DBS_WITH_MULTI_USER_SUPPORT)}. Either use one of these"
-                f"providers, or disable BACKEND_ACCESS_CONTROL"
-            )
-        if vector_config.vector_db_provider not in VECTOR_DBS_WITH_MULTI_USER_SUPPORT:
-            raise EnvironmentError(
-                f"Multi-user is currently not supported for the vector database provider: {vector_config.vector_db_provider}. "
-                f"Supported providers are: {', '.join(VECTOR_DBS_WITH_MULTI_USER_SUPPORT)}. Either use one of these"
-                f"providers, or disable BACKEND_ACCESS_CONTROL"
-            )
-
         # If there are no existing rows build a new row
         record = DatasetDatabase(
             owner_id=user.id,
             dataset_id=dataset_id,

From 011a7fb60bd016047da2618bbb61ffa85eb52028 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Tue, 11 Nov 2025 13:53:19 +0100
Subject: [PATCH 11/54] fix: Resolve multi user migration

---
 ..._expand_dataset_database_for_multi_user.py | 54 ++++++------------
 1 file changed, 15 insertions(+), 39 deletions(-)

diff --git a/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py
index cd19d09c8..7e13898ae 100644
--- a/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py
+++ b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py
@@ -14,7 +14,7 @@ import sqlalchemy as sa

 # revision identifiers, used by Alembic.
revision: str = "76625596c5c3" -down_revision: Union[str, None] = "211ab850ef3d" +down_revision: Union[str, None] = "c946955da633" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -30,40 +30,20 @@ def upgrade() -> None: conn = op.get_bind() insp = sa.inspect(conn) - data = sa.table( - "dataset_database", - sa.Column("dataset_id", sa.UUID, primary_key=True, index=True), # Critical for SQLite - sa.Column("owner_id", sa.UUID, index=True), - sa.Column("vector_database_name", sa.String(), unique=True, nullable=False), - sa.Column("graph_database_name", sa.String(), unique=True, nullable=False), - sa.Column("vector_database_provider", sa.String(), unique=False, nullable=False), - sa.Column("graph_database_provider", sa.String(), unique=False, nullable=False), - sa.Column("vector_database_url", sa.String(), unique=False, nullable=True), - sa.Column("graph_database_url", sa.String(), unique=False, nullable=True), - sa.Column("vector_database_key", sa.String(), unique=False, nullable=True), - sa.Column("graph_database_key", sa.String(), unique=False, nullable=True), - sa.Column("created_at", sa.DateTime(timezone=True)), - sa.Column("updated_at", sa.DateTime(timezone=True)), - ) - vector_database_provider_column = _get_column( insp, "dataset_database", "vector_database_provider" ) if not vector_database_provider_column: op.add_column( "dataset_database", - sa.Column("vector_database_provider", sa.String(), unique=False, nullable=False), + sa.Column( + "vector_database_provider", + sa.String(), + unique=False, + nullable=False, + server_default="lancedb", + ), ) - if op.get_context().dialect.name == "sqlite": - with op.batch_alter_table("dataset_database") as batch_op: - batch_op.execute( - data.update().values( - vector_database_provider="lancedb", - ) - ) - else: - conn = op.get_bind() - conn.execute(data.update().values(vector_database_provider="lancedb")) graph_database_provider_column = _get_column( insp, "dataset_database", "graph_database_provider" @@ -71,18 +51,14 @@ def upgrade() -> None: if not graph_database_provider_column: op.add_column( "dataset_database", - sa.Column("graph_database_provider", sa.String(), unique=False, nullable=False), + sa.Column( + "graph_database_provider", + sa.String(), + unique=False, + nullable=False, + server_default="kuzu", + ), ) - if op.get_context().dialect.name == "sqlite": - with op.batch_alter_table("dataset_database") as batch_op: - batch_op.execute( - data.update().values( - graph_database_provider="kuzu", - ) - ) - else: - conn = op.get_bind() - conn.execute(data.update().values(graph_database_provider="kuzu")) vector_database_url_column = _get_column(insp, "dataset_database", "vector_database_url") if not vector_database_url_column: From bb8de7b336a23b6053de3459764d71d7e45b40be Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Tue, 11 Nov 2025 13:56:16 +0100 Subject: [PATCH 12/54] Apply suggestion from @dexters1 --- .../databases/utils/get_or_create_dataset_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 61d7840c0..0aa836174 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -35,7 +35,7 @@ async def get_or_create_dataset_database( vector_config = 
get_vectordb_config() graph_config = get_graph_config() - +Note: for hybrid databases both graph and vector DB name have to be the same if graph_config.graph_database_provider == "kuzu": graph_db_name = f"{dataset_id}.pkl" else: From 20d49eeb76ca81057e902fad67658d5db2dcb0a0 Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Tue, 11 Nov 2025 13:56:35 +0100 Subject: [PATCH 13/54] Apply suggestion from @dexters1 --- .../databases/utils/get_or_create_dataset_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 0aa836174..635734d89 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -35,7 +35,7 @@ async def get_or_create_dataset_database( vector_config = get_vectordb_config() graph_config = get_graph_config() -Note: for hybrid databases both graph and vector DB name have to be the same +# Note: for hybrid databases both graph and vector DB name have to be the same if graph_config.graph_database_provider == "kuzu": graph_db_name = f"{dataset_id}.pkl" else: From 41b844a31c3b938a09a07462e59b70b39593313f Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Tue, 11 Nov 2025 13:56:59 +0100 Subject: [PATCH 14/54] Apply suggestion from @dexters1 --- .../databases/utils/get_or_create_dataset_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 635734d89..1822221cb 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -35,7 +35,7 @@ async def get_or_create_dataset_database( vector_config = get_vectordb_config() graph_config = get_graph_config() -# Note: for hybrid databases both graph and vector DB name have to be the same + # Note: for hybrid databases both graph and vector DB name have to be the same if graph_config.graph_database_provider == "kuzu": graph_db_name = f"{dataset_id}.pkl" else: From 4f5771230e6177db1f87b448abdc851067317608 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Tue, 11 Nov 2025 14:22:42 +0100 Subject: [PATCH 15/54] fix: PR comment changes --- cognee/context_global_variables.py | 9 ++++++++- .../databases/utils/get_or_create_dataset_database.py | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 2d711a8b2..6a0f767ff 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -86,10 +86,17 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ base_config.system_root_directory, "databases", str(user.id) ) + if dataset_database.vector_database_provider == "lancedb": + vector_db_url = os.path.join( + databases_directory_path, dataset_database.vector_database_name + ) + else: + vector_db_url = dataset_database.vector_database_url + # Set vector and graph database configuration based on dataset database information vector_config = { "vector_db_provider": dataset_database.vector_database_provider, - "vector_db_url": dataset_database.vector_database_url, + "vector_db_url": 
vector_db_url, "vector_db_key": dataset_database.vector_database_key, "vector_db_name": dataset_database.vector_database_name, } diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 61d7840c0..b00616671 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -12,6 +12,7 @@ from cognee.modules.data.methods import get_unique_dataset_id from cognee.modules.users.models import DatasetDatabase from cognee.modules.users.models import User + async def get_or_create_dataset_database( dataset: Union[str, UUID], user: User, From ac6c3ef9deeef9ef8069ba1aecdce8791987bd1e Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Tue, 11 Nov 2025 15:07:59 +0100 Subject: [PATCH 16/54] fix: fix names, add falkor to constants --- cognee/context_global_variables.py | 4 ++-- .../databases/utils/get_or_create_dataset_database.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 6a0f767ff..d4cedc187 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -16,8 +16,8 @@ vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) -VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb"] -GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu"] +VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"] +GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor"] async def set_session_user_context_variable(user): diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 311f89ad7..a2e053b3d 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -40,12 +40,12 @@ async def get_or_create_dataset_database( if graph_config.graph_database_provider == "kuzu": graph_db_name = f"{dataset_id}.pkl" else: - graph_db_name = dataset_id + graph_db_name = f"{dataset_id}" if vector_config.vector_db_provider == "lancedb": vector_db_name = f"{dataset_id}.lance.db" else: - vector_db_name = dataset_id + vector_db_name = f"{dataset_id}" async with db_engine.get_async_session() as session: # Create dataset if it doesn't exist From 6a640238760192358220a431db734d19476bb434 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 11 Nov 2025 15:12:58 +0100 Subject: [PATCH 17/54] fix: Update vector db url properly --- cognee/context_global_variables.py | 12 ++--------- .../utils/get_or_create_dataset_database.py | 20 +++++++++++++++++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 6a0f767ff..c2e9e82a9 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -69,8 +69,6 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ """ - base_config = get_base_config() - if not backend_access_control_enabled(): return @@ -79,6 +77,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ # To ensure permissions are enforced properly all datasets will have their own databases dataset_database = await 
get_or_create_dataset_database(dataset, user) + base_config = get_base_config() data_root_directory = os.path.join( base_config.data_root_directory, str(user.tenant_id or user.id) ) @@ -86,17 +85,10 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ base_config.system_root_directory, "databases", str(user.id) ) - if dataset_database.vector_database_provider == "lancedb": - vector_db_url = os.path.join( - databases_directory_path, dataset_database.vector_database_name - ) - else: - vector_db_url = dataset_database.vector_database_url - # Set vector and graph database configuration based on dataset database information vector_config = { "vector_db_provider": dataset_database.vector_database_provider, - "vector_db_url": vector_db_url, + "vector_db_url": dataset_database.vector_database_url, "vector_db_key": dataset_database.vector_database_key, "vector_db_name": dataset_database.vector_database_name, } diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 311f89ad7..0df3502ba 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -1,10 +1,12 @@ +import os from uuid import UUID from typing import Union from sqlalchemy import select from sqlalchemy.exc import IntegrityError -from cognee.modules.data.methods import create_dataset +from cognee.base_config import get_base_config +from cognee.modules.data.methods import create_dataset from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.vector import get_vectordb_config from cognee.infrastructure.databases.graph.config import get_graph_config @@ -36,6 +38,7 @@ async def get_or_create_dataset_database( vector_config = get_vectordb_config() graph_config = get_graph_config() + # Note: for hybrid databases both graph and vector DB name have to be the same if graph_config.graph_database_provider == "kuzu": graph_db_name = f"{dataset_id}.pkl" @@ -47,6 +50,19 @@ async def get_or_create_dataset_database( else: vector_db_name = dataset_id + base_config = get_base_config() + databases_directory_path = os.path.join( + base_config.system_root_directory, "databases", str(user.id) + ) + + # Determine vector database URL + if vector_config.vector_db_provider == "lancedb": + vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name) + else: + vector_db_url = vector_config.vector_database_url + + # Determine graph database URL + async with db_engine.get_async_session() as session: # Create dataset if it doesn't exist if isinstance(dataset, str): @@ -69,7 +85,7 @@ async def get_or_create_dataset_database( graph_database_name=graph_db_name, vector_database_provider=vector_config.vector_db_provider, graph_database_provider=graph_config.graph_database_provider, - vector_database_url=vector_config.vector_db_url, + vector_database_url=vector_db_url, graph_database_url=graph_config.graph_database_url, vector_database_key=vector_config.vector_db_key, graph_database_key=graph_config.graph_database_key, From 432d4a15782d64ca785854c6aa3db109b17d6f62 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 11 Nov 2025 19:44:34 +0100 Subject: [PATCH 18/54] feat: Add initial multi tenant neo4j support --- cognee/context_global_variables.py | 4 +- .../utils/get_or_create_dataset_database.py | 119 ++++++++++++++---- 
 .../modules/users/models/DatasetDatabase.py   |   5 +-
 3 files changed, 100 insertions(+), 28 deletions(-)

diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py
index 62e06fc64..44ead95af 100644
--- a/cognee/context_global_variables.py
+++ b/cognee/context_global_variables.py
@@ -17,7 +17,7 @@
 graph_db_config = ContextVar("graph_db_config", default=None)
 session_user = ContextVar("session_user", default=None)

 VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"]
-GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor"]
+GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor", "neo4j"]


 async def set_session_user_context_variable(user):
@@ -101,6 +101,8 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_
         "graph_file_path": os.path.join(
             databases_directory_path, dataset_database.graph_database_name
         ),
+        "graph_database_username": dataset_database.graph_database_username,
+        "graph_database_password": dataset_database.graph_database_password,
     }

     storage_config = {

diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
index 3684bb100..0a2638dc5 100644
--- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
+++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
@@ -39,30 +39,6 @@ async def get_or_create_dataset_database(
     vector_config = get_vectordb_config()
     graph_config = get_graph_config()

-    # Note: for hybrid databases both graph and vector DB name have to be the same
-    if graph_config.graph_database_provider == "kuzu":
-        graph_db_name = f"{dataset_id}.pkl"
-    else:
-        graph_db_name = f"{dataset_id}"
-
-    if vector_config.vector_db_provider == "lancedb":
-        vector_db_name = f"{dataset_id}.lance.db"
-    else:
-        vector_db_name = f"{dataset_id}"
-
-    base_config = get_base_config()
-    databases_directory_path = os.path.join(
-        base_config.system_root_directory, "databases", str(user.id)
-    )
-
-    # Determine vector database URL
-    if vector_config.vector_db_provider == "lancedb":
-        vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name)
-    else:
-        vector_db_url = vector_config.vector_database_url
-
-    # Determine graph database URL
-
     async with db_engine.get_async_session() as session:
         # Create dataset if it doesn't exist
         if isinstance(dataset, str):
@@ -77,7 +53,96 @@ async def get_or_create_dataset_database(
         if existing:
             return existing

+        # Note: for hybrid databases both graph and vector DB name have to be the same
+        if graph_config.graph_database_provider == "kuzu":
+            graph_db_name = f"{dataset_id}.pkl"
+        else:
+            graph_db_name = f"{dataset_id}"
+
+        if vector_config.vector_db_provider == "lancedb":
+            vector_db_name = f"{dataset_id}.lance.db"
+        else:
+            vector_db_name = f"{dataset_id}"
+
+        base_config = get_base_config()
+        databases_directory_path = os.path.join(
+            base_config.system_root_directory, "databases", str(user.id)
+        )
+
+        # Determine vector database URL
+        if vector_config.vector_db_provider == "lancedb":
+            vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name)
+        else:
+            vector_db_url = vector_config.vector_database_url
+
+        # Determine graph database URL
+        if graph_config.graph_database_provider == "neo4j":
+            # Auto deploy instance to Aura DB
+            # OAuth2 token endpoint
+
+            # Your client credentials
+            client_id = os.environ.get("NEO4J_CLIENT_ID", None)
+            client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None)
+            tenant_id = os.environ.get("NEO4J_TENANT_ID", None)
+
+            # Make the request with HTTP Basic Auth
+            import requests
+
+            def get_aura_token(client_id: str, client_secret: str) -> dict:
+                url = "https://api.neo4j.io/oauth/token"
+                data = {
+                    "grant_type": "client_credentials"
+                }  # sent as application/x-www-form-urlencoded
+
+                resp = requests.post(url, data=data, auth=(client_id, client_secret))
+                resp.raise_for_status()  # raises if the request failed
+                return resp.json()
+
+            resp = get_aura_token(client_id, client_secret)
+
+            url = "https://api.neo4j.io/v1/instances"
+
+            headers = {
+                "accept": "application/json",
+                "Authorization": f"Bearer {resp['access_token']}",
+                "Content-Type": "application/json",
+            }
+
+            payload = {
+                "version": "5",
+                "region": "europe-west1",
+                "memory": "1GB",
+                "name": graph_db_name[0:29],
+                "type": "professional-db",
+                "tenant_id": tenant_id,
+                "cloud_provider": "gcp",
+            }
+
+            response = requests.post(url, headers=headers, json=payload)
+
+            # Wait for instance to be provisioned
+            # TODO: Find better way to check when instance is ready
+            import asyncio
+
+            await asyncio.sleep(180)
+
+            print(response.status_code)
+            print(response.text)
+            # TODO: Find better name to name Neo4j instance within 30 character limit
+            print(graph_db_name[0:29])
+            graph_db_name = "neo4j"
+            graph_db_url = response.json()["data"]["connection_url"]
+            graph_db_key = resp["access_token"]
+            graph_db_username = response.json()["data"]["username"]
+            graph_db_password = response.json()["data"]["password"]
+        else:
+            graph_db_url = graph_config.graph_database_url
+            graph_db_key = graph_config.graph_database_key
+            graph_db_username = graph_config.graph_database_username
+            graph_db_password = graph_config.graph_database_password
+
         # If there are no existing rows build a new row
+        # TODO: Update Dataset Database migrations, also make sure database_name is not unique anymore
         record = DatasetDatabase(
             owner_id=user.id,
             dataset_id=dataset_id,
@@ -86,9 +151,11 @@ async def get_or_create_dataset_database(
             vector_database_provider=vector_config.vector_db_provider,
             graph_database_provider=graph_config.graph_database_provider,
             vector_database_url=vector_db_url,
-            graph_database_url=graph_config.graph_database_url,
+            graph_database_url=graph_db_url,
             vector_database_key=vector_config.vector_db_key,
-            graph_database_key=graph_config.graph_database_key,
+            graph_database_key=graph_db_key,
+            graph_database_username=graph_db_username,
+            graph_database_password=graph_db_password,
         )

         try:

diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py
index 25d610ab9..5d2e4fcd5 100644
--- a/cognee/modules/users/models/DatasetDatabase.py
+++ b/cognee/modules/users/models/DatasetDatabase.py
@@ -13,7 +13,7 @@ class DatasetDatabase(Base):
     )

     vector_database_name = Column(String, unique=True, nullable=False)
-    graph_database_name = Column(String, unique=True, nullable=False)
+    graph_database_name = Column(String, unique=False, nullable=False)

     vector_database_provider = Column(String, unique=False, nullable=False)
     graph_database_provider = Column(String, unique=False, nullable=False)
@@ -24,5 +24,8 @@ class DatasetDatabase(Base):
     vector_database_key = Column(String, unique=False, nullable=True)
     graph_database_key = Column(String, unique=False, nullable=True)

+    graph_database_username = Column(String, unique=False, nullable=True)
+    graph_database_password = Column(String, unique=False, nullable=True)
+
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) From a0a14e7ccccde894798603947db4f18c06dfc154 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 11 Nov 2025 20:05:47 +0100 Subject: [PATCH 19/54] refactor: Update dataset database class --- cognee/modules/users/models/DatasetDatabase.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index f4b7c2aed..4bbfffe4c 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ b/cognee/modules/users/models/DatasetDatabase.py @@ -12,7 +12,7 @@ class DatasetDatabase(Base): UUID, ForeignKey("datasets.id", ondelete="CASCADE"), primary_key=True, index=True ) - vector_database_name = Column(String, unique=True, nullable=False) + vector_database_name = Column(String, unique=False, nullable=False) graph_database_name = Column(String, unique=False, nullable=False) vector_database_provider = Column(String, unique=False, nullable=False) @@ -27,14 +27,5 @@ class DatasetDatabase(Base): graph_database_username = Column(String, unique=False, nullable=True) graph_database_password = Column(String, unique=False, nullable=True) - vector_database_provider = Column(String, unique=False, nullable=False) - graph_database_provider = Column(String, unique=False, nullable=False) - - vector_database_url = Column(String, unique=False, nullable=True) - graph_database_url = Column(String, unique=False, nullable=True) - - vector_database_key = Column(String, unique=False, nullable=True) - graph_database_key = Column(String, unique=False, nullable=True) - created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) From b017fcc8d0030ee2fef9929d020c1bb3d8d15f12 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 12 Nov 2025 17:58:27 +0100 Subject: [PATCH 20/54] refactor: Make neo4j auto scaling more readable --- .../utils/get_or_create_dataset_database.py | 302 ++++++++++-------- 1 file changed, 166 insertions(+), 136 deletions(-) diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 27c0d62a3..84742748d 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -1,6 +1,8 @@ import os +import asyncio +import requests from uuid import UUID -from typing import Union +from typing import Union, Optional from sqlalchemy import select from sqlalchemy.exc import IntegrityError @@ -15,6 +17,157 @@ from cognee.modules.users.models import DatasetDatabase from cognee.modules.users.models import User +async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict: + vector_config = get_vectordb_config() + + base_config = get_base_config() + databases_directory_path = os.path.join( + base_config.system_root_directory, "databases", str(user.id) + ) + + # Determine vector configuration + if vector_config.vector_db_provider == "lancedb": + vector_db_name = f"{dataset_id}.lance.db" + vector_db_url = os.path.join(databases_directory_path, vector_db_name) + else: + # Note: for hybrid databases both graph and vector DB name have to be the same + vector_db_name = vector_config.vector_db_name + vector_db_url = vector_config.vector_database_url + + return { + "vector_database_name": vector_db_name, 
+ "vector_database_url": vector_db_url, + "vector_database_provider": vector_config.vector_db_provider, + "vector_database_key": vector_config.vector_db_key, + } + + +async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict: + graph_config = get_graph_config() + + # Determine graph database URL + if graph_config.graph_database_provider == "neo4j": + graph_db_name = f"{dataset_id}" + # Auto deploy instance to Aura DB + # OAuth2 token endpoint + + # Your client credentials + client_id = os.environ.get("NEO4J_CLIENT_ID", None) + client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None) + tenant_id = os.environ.get("NEO4J_TENANT_ID", None) + + # Make the request with HTTP Basic Auth + def get_aura_token(client_id: str, client_secret: str) -> dict: + url = "https://api.neo4j.io/oauth/token" + data = {"grant_type": "client_credentials"} # sent as application/x-www-form-urlencoded + + resp = requests.post(url, data=data, auth=(client_id, client_secret)) + resp.raise_for_status() # raises if the request failed + return resp.json() + + resp = get_aura_token(client_id, client_secret) + + url = "https://api.neo4j.io/v1/instances" + + headers = { + "accept": "application/json", + "Authorization": f"Bearer {resp['access_token']}", + "Content-Type": "application/json", + } + + payload = { + "version": "5", + "region": "europe-west1", + "memory": "1GB", + "name": graph_db_name[0:29], + "type": "professional-db", + "tenant_id": tenant_id, + "cloud_provider": "gcp", + } + + response = requests.post(url, headers=headers, json=payload) + + print(response.status_code) + print(response.text) + # TODO: Find better name to name Neo4j instance within 30 character limit + print(graph_db_name[0:29]) + graph_db_name = "neo4j" + graph_db_url = response.json()["data"]["connection_url"] + graph_db_key = resp["access_token"] + graph_db_username = response.json()["data"]["username"] + graph_db_password = response.json()["data"]["password"] + + async def _wait_for_neo4j_instance_provisioning(instance_id: str, headers: dict): + # Poll until the instance is running + status_url = f"https://api.neo4j.io/v1/instances/{instance_id}" + status = "" + for attempt in range(30): # Try for up to ~5 minutes + status_resp = requests.get(status_url, headers=headers) + status = status_resp.json()["data"]["status"] + if status.lower() == "running": + return + await asyncio.sleep(10) + raise TimeoutError( + f"Neo4j instance '{graph_db_name}' did not become ready within 5 minutes. 
Status: {status}" + ) + + instance_id = response.json()["data"]["id"] + await _wait_for_neo4j_instance_provisioning(instance_id, headers) + + elif graph_config.graph_database_provider == "kuzu": + # TODO: Add graph file path info for kuzu (also in DatasetDatabase model) + graph_db_name = f"{dataset_id}.pkl" + graph_db_url = graph_config.graph_database_url + graph_db_key = graph_config.graph_database_key + graph_db_username = graph_config.graph_database_username + graph_db_password = graph_config.graph_database_password + elif graph_config.graph_database_provider == "falkor": + # Note: for hybrid databases both graph and vector DB name have to be the same + graph_db_name = f"{dataset_id}" + graph_db_url = graph_config.graph_database_url + graph_db_key = graph_config.graph_database_key + graph_db_username = graph_config.graph_database_username + graph_db_password = graph_config.graph_database_password + else: + raise EnvironmentError( + f"Unsupported graph database provider for backend access control: {graph_config.graph_database_provider}" + ) + + return { + "graph_database_name": graph_db_name, + "graph_database_url": graph_db_url, + "graph_database_provider": graph_config.graph_database_provider, + "graph_database_key": graph_db_key, + "graph_database_username": graph_db_username, + "graph_database_password": graph_db_password, + } + + +async def _existing_dataset_database( + dataset_id: UUID, + user: User, +) -> Optional[DatasetDatabase]: + """ + Check if a DatasetDatabase row already exists for the given owner + dataset. + Return None if it doesn't exist, return the row if it does. + Args: + dataset_id: + user: + + Returns: + DatasetDatabase or None + """ + db_engine = get_relational_engine() + + async with db_engine.get_async_session() as session: + stmt = select(DatasetDatabase).where( + DatasetDatabase.owner_id == user.id, + DatasetDatabase.dataset_id == dataset_id, + ) + existing: DatasetDatabase = await session.scalar(stmt) + return existing + + async def get_or_create_dataset_database( dataset: Union[str, UUID], user: User, @@ -36,150 +189,27 @@ async def get_or_create_dataset_database( dataset_id = await get_unique_dataset_id(dataset, user) - vector_config = get_vectordb_config() - graph_config = get_graph_config() + # If dataset is given as name make sure the dataset is created first + if isinstance(dataset, str): + async with db_engine.get_async_session() as session: + await create_dataset(dataset, user, session) - # Note: for hybrid databases both graph and vector DB name have to be the same - if graph_config.graph_database_provider == "kuzu": - graph_db_name = f"{dataset_id}.pkl" - else: - graph_db_name = f"{dataset_id}" + # If dataset database already exists return it + existing_dataset_database = await _existing_dataset_database(dataset_id, user) + if existing_dataset_database: + return existing_dataset_database - if vector_config.vector_db_provider == "lancedb": - vector_db_name = f"{dataset_id}.lance.db" - else: - vector_db_name = f"{dataset_id}" - - base_config = get_base_config() - databases_directory_path = os.path.join( - base_config.system_root_directory, "databases", str(user.id) - ) - - # Determine vector database URL - if vector_config.vector_db_provider == "lancedb": - vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name) - else: - vector_db_url = vector_config.vector_database_url - - # Determine graph database URL + graph_config_dict = await _get_graph_db_info(dataset_id, user) + vector_config_dict = await 
_get_vector_db_info(dataset_id, user) async with db_engine.get_async_session() as session: - # Create dataset if it doesn't exist - if isinstance(dataset, str): - dataset = await create_dataset(dataset, user, session) - - # Try to fetch an existing row first - stmt = select(DatasetDatabase).where( - DatasetDatabase.owner_id == user.id, - DatasetDatabase.dataset_id == dataset_id, - ) - existing: DatasetDatabase = await session.scalar(stmt) - if existing: - return existing - - # Note: for hybrid databases both graph and vector DB name have to be the same - if graph_config.graph_database_provider == "kuzu": - graph_db_name = f"{dataset_id}.pkl" - else: - graph_db_name = f"{dataset_id}" - - if vector_config.vector_db_provider == "lancedb": - vector_db_name = f"{dataset_id}.lance.db" - else: - vector_db_name = f"{dataset_id}" - - base_config = get_base_config() - databases_directory_path = os.path.join( - base_config.system_root_directory, "databases", str(user.id) - ) - - # Determine vector database URL - if vector_config.vector_db_provider == "lancedb": - vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name) - else: - vector_db_url = vector_config.vector_database_url - - # Determine graph database URL - if graph_config.graph_database_provider == "neo4j": - # Auto deploy instance to Aura DB - # OAuth2 token endpoint - - # Your client credentials - client_id = os.environ.get("NEO4J_CLIENT_ID", None) - client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None) - tenant_id = os.environ.get("NEO4J_TENANT_ID", None) - - # Make the request with HTTP Basic Auth - import requests - - def get_aura_token(client_id: str, client_secret: str) -> dict: - url = "https://api.neo4j.io/oauth/token" - data = { - "grant_type": "client_credentials" - } # sent as application/x-www-form-urlencoded - - resp = requests.post(url, data=data, auth=(client_id, client_secret)) - resp.raise_for_status() # raises if the request failed - return resp.json() - - resp = get_aura_token(client_id, client_secret) - - url = "https://api.neo4j.io/v1/instances" - - headers = { - "accept": "application/json", - "Authorization": f"Bearer {resp['access_token']}", - "Content-Type": "application/json", - } - - payload = { - "version": "5", - "region": "europe-west1", - "memory": "1GB", - "name": graph_db_name[0:29], - "type": "professional-db", - "tenant_id": tenant_id, - "cloud_provider": "gcp", - } - - response = requests.post(url, headers=headers, json=payload) - - # Wait for instance to be provisioned - # TODO: Find better way to check when instance is ready - import asyncio - - await asyncio.sleep(180) - - print(response.status_code) - print(response.text) - # TODO: Find better name to name Neo4j instance within 30 character limit - print(graph_db_name[0:29]) - graph_db_name = "neo4j" - graph_db_url = response.json()["data"]["connection_url"] - graph_db_key = resp["access_token"] - graph_db_username = response.json()["data"]["username"] - graph_db_password = response.json()["data"]["password"] - else: - graph_db_url = graph_config.graph_database_url - graph_db_key = graph_config.graph_database_key - graph_db_username = graph_config.graph_database_username - graph_db_password = graph_config.graph_database_password - # If there are no existing rows build a new row # TODO: Update Dataset Database migrations, also make sure database_name is not unique anymore record = DatasetDatabase( owner_id=user.id, dataset_id=dataset_id, - vector_database_name=vector_db_name, - graph_database_name=graph_db_name, - 
vector_database_provider=vector_config.vector_db_provider, - graph_database_provider=graph_config.graph_database_provider, - vector_database_url=vector_db_url, - graph_database_url=graph_db_url, - vector_database_key=vector_config.vector_db_key, - graph_database_key=graph_db_key, - graph_database_username=graph_db_username, - graph_database_password=graph_db_password, + **graph_config_dict, # Unpack graph db config + **vector_config_dict, # Unpack vector db config ) try: From 0176cd5a6890f7f2e8271ca6baee566fc987fd99 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 12 Nov 2025 18:01:44 +0100 Subject: [PATCH 21/54] refactor: Add todo point --- .../databases/utils/get_or_create_dataset_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 84742748d..ab56df787 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -137,7 +137,7 @@ async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict: "graph_database_name": graph_db_name, "graph_database_url": graph_db_url, "graph_database_provider": graph_config.graph_database_provider, - "graph_database_key": graph_db_key, + "graph_database_key": graph_db_key, # TODO: Hashing of keys/passwords in relational DB "graph_database_username": graph_db_username, "graph_database_password": graph_db_password, } From 6bb642d6b828a39c97689ce2b5199dfc1b3f1a81 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 12 Nov 2025 21:24:40 +0100 Subject: [PATCH 22/54] refactor: Start adding multi-user functions to db interfaces --- .../utils/get_or_create_dataset_database.py | 12 +++---- .../vector/lancedb/LanceDBAdapter.py | 22 ++++++++++++ .../databases/vector/vector_db_interface.py | 34 +++++++++++++++++++ 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index ab56df787..a292d2f5b 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -20,15 +20,13 @@ from cognee.modules.users.models import User async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict: vector_config = get_vectordb_config() - base_config = get_base_config() - databases_directory_path = os.path.join( - base_config.system_root_directory, "databases", str(user.id) - ) - # Determine vector configuration if vector_config.vector_db_provider == "lancedb": - vector_db_name = f"{dataset_id}.lance.db" - vector_db_url = os.path.join(databases_directory_path, vector_db_name) + # TODO: Have the create_database method be called from interface adapter automatically for all providers instead of specifically here + from cognee.infrastructure.databases.vector.lancedb.LanceDBAdapter import LanceDBAdapter + + return await LanceDBAdapter.create_database(dataset_id, user) + else: # Note: for hybrid databases both graph and vector DB name have to be the same vector_db_name = vector_config.vector_db_name diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index 30631ac4c..f2d8fcc09 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ 
b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -1,10 +1,15 @@ import asyncio from os import path +import os +from uuid import UUID import lancedb from pydantic import BaseModel from lancedb.pydantic import LanceModel, Vector from typing import Generic, List, Optional, TypeVar, Union, get_args, get_origin, get_type_hints +from cognee.base_config import get_base_config +from cognee.infrastructure.databases.vector import get_vectordb_config +from cognee.modules.users.models import User from cognee.infrastructure.databases.exceptions import MissingQueryParameterError from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine.utils import parse_id @@ -357,3 +362,20 @@ class LanceDBAdapter(VectorDBInterface): }, exclude_fields=["metadata"] + related_models_fields, ) + + @classmethod + async def create_database(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + vector_config = get_vectordb_config() + base_config = get_base_config() + databases_directory_path = os.path.join( + base_config.system_root_directory, "databases", str(user.id) + ) + + vector_db_name = f"{dataset_id}.lance.db" + + return { + "vector_database_name": vector_db_name, + "vector_database_url": os.path.join(databases_directory_path, vector_db_name), + "vector_database_provider": vector_config.vector_db_provider, + "vector_database_key": vector_config.vector_db_key, + } diff --git a/cognee/infrastructure/databases/vector/vector_db_interface.py b/cognee/infrastructure/databases/vector/vector_db_interface.py index 3a3df62eb..b89818275 100644 --- a/cognee/infrastructure/databases/vector/vector_db_interface.py +++ b/cognee/infrastructure/databases/vector/vector_db_interface.py @@ -2,6 +2,8 @@ from typing import List, Protocol, Optional, Union, Any from abc import abstractmethod from cognee.infrastructure.engine import DataPoint from .models.PayloadSchema import PayloadSchema +from uuid import UUID +from cognee.modules.users.models import User class VectorDBInterface(Protocol): @@ -217,3 +219,35 @@ class VectorDBInterface(Protocol): - Any: The schema object suitable for this vector database """ return model_type + + @classmethod + async def create_database(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + """ + Return a dictionary with connection info for a vector database for the given dataset and user. + Function should auto handle deploying of the actual database if needed. + Needed for Cognee multi-tenant/multi-user and backend access control support. + + Dictionary returned from this function will be used to create a DatasetDatabase row in the relational database. + From which internal mapping of dataset -> database connection info will be done. + + Each dataset needs to map to a unique vector database instance when backend access control is enabled. + + Args: + dataset_id: UUID of the dataset if needed by the database creation logic + user: User object if needed by the database creation logic + Returns: + dict: Connection info for the created vector database instance. + """ + pass + + async def delete_database(self, dataset_id: UUID, user: User) -> None: + """ + Delete the vector database instance for the given dataset and user. + Function should auto handle deleting of the actual database. + Needed for maintaining a database for Cognee multi-tenant/multi-user and backend access control. 
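As a minimal sketch (not part of this patch), a file-based adapter such as LanceDB could satisfy this by looking up the stored connection info and removing the on-disk database; the DatasetDatabase lookup and the use of shutil.rmtree below are illustrative assumptions, not the adapter's actual implementation:

    import shutil
    from sqlalchemy import select
    from cognee.infrastructure.databases.relational import get_relational_engine
    from cognee.modules.users.models import DatasetDatabase

    async def delete_database(self, dataset_id, user):
        db_engine = get_relational_engine()
        async with db_engine.get_async_session() as session:
            # Fetch the connection info recorded when the dataset database was created
            record = await session.scalar(
                select(DatasetDatabase).where(
                    DatasetDatabase.owner_id == user.id,
                    DatasetDatabase.dataset_id == dataset_id,
                )
            )
            if record and record.vector_database_provider == "lancedb":
                # LanceDB keeps the per-dataset database as a local directory at this URL
                shutil.rmtree(record.vector_database_url, ignore_errors=True)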
+ + Args: + dataset_id: UUID of the dataset + user: User object + """ + pass From 3acb581bd03e507885556b6709d3465671b2497c Mon Sep 17 00:00:00 2001 From: martin0731 Date: Thu, 13 Nov 2025 08:31:15 -0500 Subject: [PATCH 23/54] Removed check_permissions_on_dataset.py and related references --- cognee/api/v1/cognify/cognify.py | 25 +++++++----------- .../task_getters/get_cascade_graph_tasks.py | 2 -- .../get_default_tasks_by_indices.py | 8 +++--- cognee/tasks/documents/__init__.py | 1 - .../documents/check_permissions_on_dataset.py | 26 ------------------- examples/python/simple_example.py | 11 +++----- notebooks/cognee_demo.ipynb | 4 +-- 7 files changed, 19 insertions(+), 58 deletions(-) delete mode 100644 cognee/tasks/documents/check_permissions_on_dataset.py diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 0fa345176..4efec365a 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -19,7 +19,6 @@ from cognee.modules.ontology.get_default_ontology_resolver import ( from cognee.modules.users.models import User from cognee.tasks.documents import ( - check_permissions_on_dataset, classify_documents, extract_chunks_from_documents, ) @@ -78,12 +77,11 @@ async def cognify( Processing Pipeline: 1. **Document Classification**: Identifies document types and structures - 2. **Permission Validation**: Ensures user has processing rights - 3. **Text Chunking**: Breaks content into semantically meaningful segments - 4. **Entity Extraction**: Identifies key concepts, people, places, organizations - 5. **Relationship Detection**: Discovers connections between entities - 6. **Graph Construction**: Builds semantic knowledge graph with embeddings - 7. **Content Summarization**: Creates hierarchical summaries for navigation + 2. **Text Chunking**: Breaks content into semantically meaningful segments + 3. **Entity Extraction**: Identifies key concepts, people, places, organizations + 4. **Relationship Detection**: Discovers connections between entities + 5. **Graph Construction**: Builds semantic knowledge graph with embeddings + 6. **Content Summarization**: Creates hierarchical summaries for navigation Graph Model Customization: The `graph_model` parameter allows custom knowledge structures: @@ -274,7 +272,6 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's default_tasks = [ Task(classify_documents), - Task(check_permissions_on_dataset, user=user, permissions=["write"]), Task( extract_chunks_from_documents, max_chunk_size=chunk_size or get_max_chunk_tokens(), @@ -305,14 +302,13 @@ async def get_temporal_tasks( The pipeline includes: 1. Document classification. - 2. Dataset permission checks (requires "write" access). - 3. Document chunking with a specified or default chunk size. - 4. Event and timestamp extraction from chunks. - 5. Knowledge graph extraction from events. - 6. Batched insertion of data points. + 2. Document chunking with a specified or default chunk size. + 3. Event and timestamp extraction from chunks. + 4. Knowledge graph extraction from events. + 5. Batched insertion of data points. Args: - user (User, optional): The user requesting task execution, used for permission checks. + user (User, optional): The user requesting task execution. chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker. chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default. 
chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify @@ -325,7 +321,6 @@ async def get_temporal_tasks( temporal_tasks = [ Task(classify_documents), - Task(check_permissions_on_dataset, user=user, permissions=["write"]), Task( extract_chunks_from_documents, max_chunk_size=chunk_size or get_max_chunk_tokens(), diff --git a/cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py b/cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py index edac15015..1fbc31c02 100644 --- a/cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +++ b/cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py @@ -8,7 +8,6 @@ from cognee.modules.users.models import User from cognee.shared.data_models import KnowledgeGraph from cognee.shared.utils import send_telemetry from cognee.tasks.documents import ( - check_permissions_on_dataset, classify_documents, extract_chunks_from_documents, ) @@ -31,7 +30,6 @@ async def get_cascade_graph_tasks( cognee_config = get_cognify_config() default_tasks = [ Task(classify_documents), - Task(check_permissions_on_dataset, user=user, permissions=["write"]), Task( extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens() ), # Extract text chunks based on the document type. diff --git a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py index fb10c7eed..6a39a67cf 100644 --- a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +++ b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py @@ -30,8 +30,8 @@ async def get_no_summary_tasks( ontology_file_path=None, ) -> List[Task]: """Returns default tasks without summarization tasks.""" - # Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks) - base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker) + # Get base tasks (0=classify, 1=extract_chunks) + base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker) ontology_adapter = RDFLibOntologyResolver(ontology_file=ontology_file_path) @@ -51,8 +51,8 @@ async def get_just_chunks_tasks( chunk_size: int = None, chunker=TextChunker, user=None ) -> List[Task]: """Returns default tasks with only chunk extraction and data points addition.""" - # Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks) - base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker) + # Get base tasks (0=classify, 1=extract_chunks) + base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker) add_data_points_task = Task(add_data_points, task_config={"batch_size": 10}) diff --git a/cognee/tasks/documents/__init__.py b/cognee/tasks/documents/__init__.py index f4582fbe0..043625f35 100644 --- a/cognee/tasks/documents/__init__.py +++ b/cognee/tasks/documents/__init__.py @@ -1,3 +1,2 @@ from .classify_documents import classify_documents from .extract_chunks_from_documents import extract_chunks_from_documents -from .check_permissions_on_dataset import check_permissions_on_dataset diff --git a/cognee/tasks/documents/check_permissions_on_dataset.py b/cognee/tasks/documents/check_permissions_on_dataset.py deleted file mode 100644 index 01a03de5f..000000000 --- a/cognee/tasks/documents/check_permissions_on_dataset.py +++ /dev/null @@ -1,26 +0,0 @@ -from cognee.modules.data.processing.document_types import Document -from 
cognee.modules.users.permissions.methods import check_permission_on_dataset -from typing import List - - -async def check_permissions_on_dataset( - documents: List[Document], context: dict, user, permissions -) -> List[Document]: - """ - Validates a user's permissions on a list of documents. - - Notes: - - This function assumes that `check_permission_on_documents` raises an exception if the permission check fails. - - It is designed to validate multiple permissions in a sequential manner for the same set of documents. - - Ensure that the `Document` and `user` objects conform to the expected structure and interfaces. - """ - - for permission in permissions: - await check_permission_on_dataset( - user, - permission, - # TODO: pass dataset through argument instead of context - context["dataset"].id, - ) - - return documents diff --git a/examples/python/simple_example.py b/examples/python/simple_example.py index c13e48f85..347ace365 100644 --- a/examples/python/simple_example.py +++ b/examples/python/simple_example.py @@ -32,16 +32,13 @@ async def main(): print("Cognify process steps:") print("1. Classifying the document: Determining the type and category of the input text.") print( - "2. Checking permissions: Ensuring the user has the necessary rights to process the text." + "2. Extracting text chunks: Breaking down the text into sentences or phrases for analysis." ) print( - "3. Extracting text chunks: Breaking down the text into sentences or phrases for analysis." + "3. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph." ) - print("4. Adding data points: Storing the extracted chunks for processing.") - print( - "5. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph." - ) - print("6. Summarizing text: Creating concise summaries of the content for quick insights.\n") + print("4. Summarizing text: Creating concise summaries of the content for quick insights.") + print("5. 
Adding data points: Storing the extracted chunks for processing.\n") # Use LLMs and cognee to create knowledge graph await cognee.cognify() diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb index 09c4c89be..fe6ae50ae 100644 --- a/notebooks/cognee_demo.ipynb +++ b/notebooks/cognee_demo.ipynb @@ -591,7 +591,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "7c431fdef4921ae0", "metadata": { "ExecuteTime": { @@ -609,7 +609,6 @@ "from cognee.modules.pipelines import run_tasks\n", "from cognee.modules.users.models import User\n", "from cognee.tasks.documents import (\n", - " check_permissions_on_dataset,\n", " classify_documents,\n", " extract_chunks_from_documents,\n", ")\n", @@ -627,7 +626,6 @@ "\n", " tasks = [\n", " Task(classify_documents),\n", - " Task(check_permissions_on_dataset, user=user, permissions=[\"write\"]),\n", " Task(\n", " extract_chunks_from_documents, max_chunk_size=get_max_chunk_tokens()\n", " ), # Extract text chunks based on the document type.\n", From 68d81a912519e8e19eda6a2657c14853d72234f2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 20 Nov 2025 18:37:15 +0100 Subject: [PATCH 24/54] refactor: Update multi-user database dataset creation mechanism --- .../databases/graph/graph_db_interface.py | 34 +++++++ .../databases/graph/neo4j_driver/adapter.py | 91 +++++++++++++++++++ .../utils/get_or_create_dataset_database.py | 72 +-------------- .../vector/lancedb/LanceDBAdapter.py | 2 +- .../databases/vector/vector_db_interface.py | 15 +-- 5 files changed, 139 insertions(+), 75 deletions(-) diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index 67df1a27c..6d323764b 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -6,6 +6,7 @@ from typing import Optional, Dict, Any, List, Tuple, Type, Union from uuid import NAMESPACE_OID, UUID, uuid5 from cognee.shared.logging_utils import get_logger from cognee.infrastructure.engine import DataPoint +from cognee.modules.users.models.User import User from cognee.modules.data.models.graph_relationship_ledger import GraphRelationshipLedger from cognee.infrastructure.databases.relational.get_relational_engine import get_relational_engine @@ -398,3 +399,36 @@ class GraphDBInterface(ABC): - node_id (Union[str, UUID]): Unique identifier of the node for which to retrieve connections. """ raise NotImplementedError + + @classmethod + async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + """ + Return a dictionary with connection info for a graph database for the given dataset. + Function can auto handle deploying of the actual database if needed, but is not necessary. + Only providing connection info is sufficient, this info will be mapped when trying to connect to the provided dataset in the future. + Needed for Cognee multi-tenant/multi-user and backend access control support. + + Dictionary returned from this function will be used to create a DatasetDatabase row in the relational database. + From which internal mapping of dataset -> database connection info will be done. + + Each dataset needs to map to a unique graph database when backend access control is enabled to facilitate a separation of concern for data. 
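For illustration, the returned mapping is expected to mirror the graph columns of the DatasetDatabase model; the values below are placeholders rather than real connection details:

    {
        "graph_database_name": "neo4j",
        "graph_database_provider": "neo4j",
        "graph_database_url": "neo4j+s://<instance-id>.databases.neo4j.io",
        "graph_database_key": "<api-token-or-None>",
        "graph_database_username": "<username>",
        "graph_database_password": "<password>",
    }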
+ + Args: + dataset_id: UUID of the dataset if needed by the database creation logic + user: User object if needed by the database creation logic + Returns: + dict: Connection info for the created graph database instance. + """ + pass + + async def delete_dataset(self, dataset_id: UUID, user: User) -> None: + """ + Delete the graph database for the given dataset. + Function should auto handle deleting of the actual database or send a request to the proper service to delete the database. + Needed for maintaining a database for Cognee multi-tenant/multi-user and backend access control. + + Args: + dataset_id: UUID of the dataset + user: User object + """ + pass diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 6216e107e..dfcf36499 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -1,7 +1,9 @@ """Neo4j Adapter for Graph Database""" +import os import json import asyncio +import requests from uuid import UUID from textwrap import dedent from neo4j import AsyncSession @@ -12,6 +14,7 @@ from typing import Optional, Any, List, Dict, Type, Tuple from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int +from cognee.modules.users.models import User from cognee.tasks.temporal_graph.models import Timestamp from cognee.shared.logging_utils import get_logger, ERROR from cognee.infrastructure.databases.graph.graph_db_interface import ( @@ -1470,3 +1473,91 @@ class Neo4jAdapter(GraphDBInterface): time_ids_list = [item["id"] for item in time_nodes if "id" in item] return ", ".join(f"'{uid}'" for uid in time_ids_list) + + @classmethod + async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + """ + Create a new Neo4j Aura instance for the dataset. Return connection info that will be mapped to the dataset. 
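The inverse operation is not implemented in this patch; assuming the Aura API also allows deleting an instance by id (verify against the Aura API documentation), a rough counterpart could look like:

    def delete_aura_instance(instance_id: str, access_token: str) -> None:
        # Hypothetical teardown helper; the DELETE endpoint is an assumption, not used by this patch
        url = f"https://api.neo4j.io/v1/instances/{instance_id}"
        headers = {"accept": "application/json", "Authorization": f"Bearer {access_token}"}
        resp = requests.delete(url, headers=headers)
        resp.raise_for_status()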
+ + Args: + dataset_id: Dataset UUID + user: User object who owns the dataset and is making the request + + Returns: + dict: Connection details for the created Neo4j instance + + """ + graph_db_name = f"{dataset_id}" + + # Client credentials + client_id = os.environ.get("NEO4J_CLIENT_ID", None) + client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None) + tenant_id = os.environ.get("NEO4J_TENANT_ID", None) + + # Make the request with HTTP Basic Auth + def get_aura_token(client_id: str, client_secret: str) -> dict: + url = "https://api.neo4j.io/oauth/token" + data = {"grant_type": "client_credentials"} # sent as application/x-www-form-urlencoded + + resp = requests.post(url, data=data, auth=(client_id, client_secret)) + resp.raise_for_status() # raises if the request failed + return resp.json() + + resp = get_aura_token(client_id, client_secret) + + url = "https://api.neo4j.io/v1/instances" + + headers = { + "accept": "application/json", + "Authorization": f"Bearer {resp['access_token']}", + "Content-Type": "application/json", + } + + # TODO: Maybe we can allow **kwargs parameter forwarding for cases like these + # Too allow different configurations between datasets + payload = { + "version": "5", + "region": "europe-west1", + "memory": "1GB", + "name": graph_db_name[0:29], + "type": "professional-db", + "tenant_id": tenant_id, + "cloud_provider": "gcp", + } + + response = requests.post(url, headers=headers, json=payload) + + print(response.status_code) + print(response.text) + # TODO: Find better name to name Neo4j instance within 30 character limit + print(graph_db_name[0:29]) + graph_db_name = "neo4j" + graph_db_url = response.json()["data"]["connection_url"] + graph_db_key = resp["access_token"] + graph_db_username = response.json()["data"]["username"] + graph_db_password = response.json()["data"]["password"] + + async def _wait_for_neo4j_instance_provisioning(instance_id: str, headers: dict): + # Poll until the instance is running + status_url = f"https://api.neo4j.io/v1/instances/{instance_id}" + status = "" + for attempt in range(30): # Try for up to ~5 minutes + status_resp = requests.get(status_url, headers=headers) + status = status_resp.json()["data"]["status"] + if status.lower() == "running": + return + await asyncio.sleep(10) + raise TimeoutError( + f"Neo4j instance '{graph_db_name}' did not become ready within 5 minutes. 
Status: {status}" + ) + + instance_id = response.json()["data"]["id"] + await _wait_for_neo4j_instance_provisioning(instance_id, headers) + return { + "graph_database_name": graph_db_name, + "graph_database_url": graph_db_url, + "graph_database_provider": "neo4j", + "graph_database_key": graph_db_key, # TODO: Hashing of keys/passwords in relational DB + "graph_database_username": graph_db_username, + "graph_database_password": graph_db_password, + } diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index a292d2f5b..b60640d4c 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -25,7 +25,7 @@ async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict: # TODO: Have the create_database method be called from interface adapter automatically for all providers instead of specifically here from cognee.infrastructure.databases.vector.lancedb.LanceDBAdapter import LanceDBAdapter - return await LanceDBAdapter.create_database(dataset_id, user) + return await LanceDBAdapter.create_dataset(dataset_id, user) else: # Note: for hybrid databases both graph and vector DB name have to be the same @@ -42,75 +42,11 @@ async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict: async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict: graph_config = get_graph_config() - # Determine graph database URL if graph_config.graph_database_provider == "neo4j": - graph_db_name = f"{dataset_id}" - # Auto deploy instance to Aura DB - # OAuth2 token endpoint + from cognee.infrastructure.databases.graph.neo4j_driver.adapter import Neo4jAdapter - # Your client credentials - client_id = os.environ.get("NEO4J_CLIENT_ID", None) - client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None) - tenant_id = os.environ.get("NEO4J_TENANT_ID", None) - - # Make the request with HTTP Basic Auth - def get_aura_token(client_id: str, client_secret: str) -> dict: - url = "https://api.neo4j.io/oauth/token" - data = {"grant_type": "client_credentials"} # sent as application/x-www-form-urlencoded - - resp = requests.post(url, data=data, auth=(client_id, client_secret)) - resp.raise_for_status() # raises if the request failed - return resp.json() - - resp = get_aura_token(client_id, client_secret) - - url = "https://api.neo4j.io/v1/instances" - - headers = { - "accept": "application/json", - "Authorization": f"Bearer {resp['access_token']}", - "Content-Type": "application/json", - } - - payload = { - "version": "5", - "region": "europe-west1", - "memory": "1GB", - "name": graph_db_name[0:29], - "type": "professional-db", - "tenant_id": tenant_id, - "cloud_provider": "gcp", - } - - response = requests.post(url, headers=headers, json=payload) - - print(response.status_code) - print(response.text) - # TODO: Find better name to name Neo4j instance within 30 character limit - print(graph_db_name[0:29]) - graph_db_name = "neo4j" - graph_db_url = response.json()["data"]["connection_url"] - graph_db_key = resp["access_token"] - graph_db_username = response.json()["data"]["username"] - graph_db_password = response.json()["data"]["password"] - - async def _wait_for_neo4j_instance_provisioning(instance_id: str, headers: dict): - # Poll until the instance is running - status_url = f"https://api.neo4j.io/v1/instances/{instance_id}" - status = "" - for attempt in range(30): # Try for up to ~5 minutes - status_resp = 
requests.get(status_url, headers=headers) - status = status_resp.json()["data"]["status"] - if status.lower() == "running": - return - await asyncio.sleep(10) - raise TimeoutError( - f"Neo4j instance '{graph_db_name}' did not become ready within 5 minutes. Status: {status}" - ) - - instance_id = response.json()["data"]["id"] - await _wait_for_neo4j_instance_provisioning(instance_id, headers) + return await Neo4jAdapter.create_dataset(dataset_id, user) elif graph_config.graph_database_provider == "kuzu": # TODO: Add graph file path info for kuzu (also in DatasetDatabase model) @@ -176,6 +112,8 @@ async def get_or_create_dataset_database( • If the row already exists, it is fetched and returned. • Otherwise a new one is created atomically and returned. + DatasetDatabase row contains connection and provider info for vector and graph databases. + Parameters ---------- user : User diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index f2d8fcc09..a93fbc818 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -364,7 +364,7 @@ class LanceDBAdapter(VectorDBInterface): ) @classmethod - async def create_database(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: vector_config = get_vectordb_config() base_config = get_base_config() databases_directory_path = os.path.join( diff --git a/cognee/infrastructure/databases/vector/vector_db_interface.py b/cognee/infrastructure/databases/vector/vector_db_interface.py index b89818275..12ace1a6c 100644 --- a/cognee/infrastructure/databases/vector/vector_db_interface.py +++ b/cognee/infrastructure/databases/vector/vector_db_interface.py @@ -221,16 +221,17 @@ class VectorDBInterface(Protocol): return model_type @classmethod - async def create_database(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: """ - Return a dictionary with connection info for a vector database for the given dataset and user. - Function should auto handle deploying of the actual database if needed. + Return a dictionary with connection info for a vector database for the given dataset. + Function can auto handle deploying of the actual database if needed, but is not necessary. + Only providing connection info is sufficient, this info will be mapped when trying to connect to the provided dataset in the future. Needed for Cognee multi-tenant/multi-user and backend access control support. Dictionary returned from this function will be used to create a DatasetDatabase row in the relational database. From which internal mapping of dataset -> database connection info will be done. - Each dataset needs to map to a unique vector database instance when backend access control is enabled. + Each dataset needs to map to a unique vector database when backend access control is enabled to facilitate a separation of concern for data. Args: dataset_id: UUID of the dataset if needed by the database creation logic @@ -240,10 +241,10 @@ class VectorDBInterface(Protocol): """ pass - async def delete_database(self, dataset_id: UUID, user: User) -> None: + async def delete_dataset(self, dataset_id: UUID, user: User) -> None: """ - Delete the vector database instance for the given dataset and user. 
- Function should auto handle deleting of the actual database. + Delete the vector database for the given dataset. + Function should auto handle deleting of the actual database or send a request to the proper service to delete the database. Needed for maintaining a database for Cognee multi-tenant/multi-user and backend access control. Args: From 080081071337cdf4c12ef7837942b0b0a335722e Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 20 Nov 2025 18:46:02 +0100 Subject: [PATCH 25/54] refactor: remove print statements --- .../databases/graph/neo4j_driver/adapter.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index dfcf36499..43e5ea654 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -1519,7 +1519,9 @@ class Neo4jAdapter(GraphDBInterface): "version": "5", "region": "europe-west1", "memory": "1GB", - "name": graph_db_name[0:29], + "name": graph_db_name[ + 0:29 + ], # TODO: Find better name to name Neo4j instance within 30 character limit "type": "professional-db", "tenant_id": tenant_id, "cloud_provider": "gcp", @@ -1527,10 +1529,6 @@ class Neo4jAdapter(GraphDBInterface): response = requests.post(url, headers=headers, json=payload) - print(response.status_code) - print(response.text) - # TODO: Find better name to name Neo4j instance within 30 character limit - print(graph_db_name[0:29]) graph_db_name = "neo4j" graph_db_url = response.json()["data"]["connection_url"] graph_db_key = resp["access_token"] From 64a3ee96c45c14ce4c4ad3d16d0edbea204e2d26 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 24 Nov 2025 20:31:28 +0100 Subject: [PATCH 26/54] refactor: Create new abstraction for dataset database mapping and handling --- cognee/context_global_variables.py | 1 + .../dataset_database_handler/__init__.py | 3 + .../dataset_database_handler_interface.py | 43 +++++++ .../supported_dataset_database_handlers.py | 15 +++ .../use_dataset_database_handler.py | 5 + .../infrastructure/databases/graph/config.py | 3 + .../databases/graph/get_graph_engine.py | 1 + .../databases/graph/graph_db_interface.py | 34 ----- .../graph/kuzu/KuzuDatasetDatabaseHandler.py | 57 +++++++++ .../Neo4jAuraDatasetDatabaseHandler.py | 118 ++++++++++++++++++ .../databases/graph/neo4j_driver/adapter.py | 89 ------------- .../utils/get_or_create_dataset_database.py | 58 ++------- .../infrastructure/databases/vector/config.py | 2 + .../databases/vector/create_vector_engine.py | 1 + .../vector/lancedb/LanceDBAdapter.py | 17 --- .../lancedb/LanceDBDatasetDatabaseHandler.py | 41 ++++++ 16 files changed, 300 insertions(+), 188 deletions(-) create mode 100644 cognee/infrastructure/databases/dataset_database_handler/__init__.py create mode 100644 cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py create mode 100644 cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py create mode 100644 cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py create mode 100644 cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py create mode 100644 cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py create mode 100644 cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py diff --git 
a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 44ead95af..2b6ffa058 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -27,6 +27,7 @@ async def set_session_user_context_variable(user): def multi_user_support_possible(): graph_db_config = get_graph_context_config() vector_db_config = get_vectordb_context_config() + # TODO: Make sure dataset database handler and provider match, remove multi_user support check, add error if no dataset database handler exists for provider return ( graph_db_config["graph_database_provider"] in GRAPH_DBS_WITH_MULTI_USER_SUPPORT and vector_db_config["vector_db_provider"] in VECTOR_DBS_WITH_MULTI_USER_SUPPORT diff --git a/cognee/infrastructure/databases/dataset_database_handler/__init__.py b/cognee/infrastructure/databases/dataset_database_handler/__init__.py new file mode 100644 index 000000000..a74017113 --- /dev/null +++ b/cognee/infrastructure/databases/dataset_database_handler/__init__.py @@ -0,0 +1,3 @@ +from .dataset_database_handler_interface import DatasetDatabaseHandlerInterface +from .supported_dataset_database_handlers import supported_dataset_database_handlers +from .use_dataset_database_handler import use_dataset_database_handler diff --git a/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py b/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py new file mode 100644 index 000000000..6dadee6cf --- /dev/null +++ b/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py @@ -0,0 +1,43 @@ +from typing import Optional +from uuid import UUID +from abc import ABC, abstractmethod + +from cognee.modules.users.models.User import User + + +class DatasetDatabaseHandlerInterface(ABC): + @classmethod + @abstractmethod + async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + """ + Return a dictionary with connection info for a graph or vector database for the given dataset. + Function can auto handle deploying of the actual database if needed, but is not necessary. + Only providing connection info is sufficient, this info will be mapped when trying to connect to the provided dataset in the future. + Needed for Cognee multi-tenant/multi-user and backend access control support. + + Dictionary returned from this function will be used to create a DatasetDatabase row in the relational database. + From which internal mapping of dataset -> database connection info will be done. + + Each dataset needs to map to a unique graph or vector database when backend access control is enabled to facilitate a separation of concern for data. + + Args: + dataset_id: UUID of the dataset if needed by the database creation logic + user: User object if needed by the database creation logic + Returns: + dict: Connection info for the created graph or vector database instance. + """ + pass + + @classmethod + @abstractmethod + async def delete_dataset(cls, dataset_id: UUID, user: User) -> None: + """ + Delete the graph or vector database for the given dataset. + Function should auto handle deleting of the actual database or send a request to the proper service to delete/mark the database as not needed for the given dataset. + Needed for maintaining a database for Cognee multi-tenant/multi-user and backend access control. 
+ + Args: + dataset_id: UUID of the dataset + user: User object + """ + pass diff --git a/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py b/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py new file mode 100644 index 000000000..9cc7d9f93 --- /dev/null +++ b/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py @@ -0,0 +1,15 @@ +from cognee.infrastructure.databases.graph.neo4j_driver.Neo4jAuraDatasetDatabaseHandler import ( + Neo4jAuraDatasetDatabaseHandler, +) +from cognee.infrastructure.databases.vector.lancedb.LanceDBDatasetDatabaseHandler import ( + LanceDBDatasetDatabaseHandler, +) +from cognee.infrastructure.databases.graph.kuzu.KuzuDatasetDatabaseHandler import ( + KuzuDatasetDatabaseHandler, +) + +supported_dataset_database_handlers = { + "neo4j_aura": Neo4jAuraDatasetDatabaseHandler, + "lancedb": LanceDBDatasetDatabaseHandler, + "kuzu": KuzuDatasetDatabaseHandler, +} diff --git a/cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py b/cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py new file mode 100644 index 000000000..a583de354 --- /dev/null +++ b/cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py @@ -0,0 +1,5 @@ +from .supported_dataset_database_handlers import supported_dataset_database_handlers + + +def use_dataset_database_handler(dataset_database_handler_name, dataset_database_handler): + supported_dataset_database_handlers[dataset_database_handler_name] = dataset_database_handler diff --git a/cognee/infrastructure/databases/graph/config.py b/cognee/infrastructure/databases/graph/config.py index 23687b359..bcf97ebfa 100644 --- a/cognee/infrastructure/databases/graph/config.py +++ b/cognee/infrastructure/databases/graph/config.py @@ -47,6 +47,7 @@ class GraphConfig(BaseSettings): graph_filename: str = "" graph_model: object = KnowledgeGraph graph_topology: object = KnowledgeGraph + graph_dataset_database_handler: str = "kuzu" model_config = SettingsConfigDict(env_file=".env", extra="allow", populate_by_name=True) # Model validator updates graph_filename and path dynamically after class creation based on current database provider @@ -97,6 +98,7 @@ class GraphConfig(BaseSettings): "graph_model": self.graph_model, "graph_topology": self.graph_topology, "model_config": self.model_config, + "graph_dataset_database_handler": self.graph_dataset_database_handler, } def to_hashable_dict(self) -> dict: @@ -121,6 +123,7 @@ class GraphConfig(BaseSettings): "graph_database_port": self.graph_database_port, "graph_database_key": self.graph_database_key, "graph_file_path": self.graph_file_path, + "graph_dataset_database_handler": self.graph_dataset_database_handler, } diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py index 82e3cad6e..c37af2102 100644 --- a/cognee/infrastructure/databases/graph/get_graph_engine.py +++ b/cognee/infrastructure/databases/graph/get_graph_engine.py @@ -34,6 +34,7 @@ def create_graph_engine( graph_database_password="", graph_database_port="", graph_database_key="", + graph_dataset_database_handler="", ): """ Create a graph engine based on the specified provider type. 
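A hedged sketch of how a project could plug its own backend into the registry introduced in this patch; the handler name, connection values, and config wiring are assumptions for illustration only:

    from cognee.infrastructure.databases.dataset_database_handler import (
        DatasetDatabaseHandlerInterface,
        use_dataset_database_handler,
    )

    class MyGraphDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
        @classmethod
        async def create_dataset(cls, dataset_id, user):
            # Connection info returned here is stored in the DatasetDatabase row for this dataset
            return {
                "graph_database_name": f"{dataset_id}",
                "graph_database_provider": "neo4j",
                "graph_database_url": "bolt://my-graph-host:7687",
                "graph_database_key": None,
                "graph_database_username": "neo4j",
                "graph_database_password": "<password>",
            }

        @classmethod
        async def delete_dataset(cls, dataset_id, user):
            # Tear down or release the per-dataset database here
            pass

    # Register under a name so it can be selected, e.g. via graph_dataset_database_handler
    use_dataset_database_handler("my_graph_backend", MyGraphDatasetDatabaseHandler)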
diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index 6d323764b..67df1a27c 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -6,7 +6,6 @@ from typing import Optional, Dict, Any, List, Tuple, Type, Union from uuid import NAMESPACE_OID, UUID, uuid5 from cognee.shared.logging_utils import get_logger from cognee.infrastructure.engine import DataPoint -from cognee.modules.users.models.User import User from cognee.modules.data.models.graph_relationship_ledger import GraphRelationshipLedger from cognee.infrastructure.databases.relational.get_relational_engine import get_relational_engine @@ -399,36 +398,3 @@ class GraphDBInterface(ABC): - node_id (Union[str, UUID]): Unique identifier of the node for which to retrieve connections. """ raise NotImplementedError - - @classmethod - async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: - """ - Return a dictionary with connection info for a graph database for the given dataset. - Function can auto handle deploying of the actual database if needed, but is not necessary. - Only providing connection info is sufficient, this info will be mapped when trying to connect to the provided dataset in the future. - Needed for Cognee multi-tenant/multi-user and backend access control support. - - Dictionary returned from this function will be used to create a DatasetDatabase row in the relational database. - From which internal mapping of dataset -> database connection info will be done. - - Each dataset needs to map to a unique graph database when backend access control is enabled to facilitate a separation of concern for data. - - Args: - dataset_id: UUID of the dataset if needed by the database creation logic - user: User object if needed by the database creation logic - Returns: - dict: Connection info for the created graph database instance. - """ - pass - - async def delete_dataset(self, dataset_id: UUID, user: User) -> None: - """ - Delete the graph database for the given dataset. - Function should auto handle deleting of the actual database or send a request to the proper service to delete the database. - Needed for maintaining a database for Cognee multi-tenant/multi-user and backend access control. - - Args: - dataset_id: UUID of the dataset - user: User object - """ - pass diff --git a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py new file mode 100644 index 000000000..8859422f9 --- /dev/null +++ b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py @@ -0,0 +1,57 @@ +import os +import asyncio +import requests +from uuid import UUID +from typing import Optional + +from cognee.modules.users.models import User + +from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface + + +class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): + """ + Handler for interacting with Kuzu Dataset databases. + """ + + @classmethod + async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + """ + Create a new Kuzu instance for the dataset. Return connection info that will be mapped to the dataset. 
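For context, the name returned by this handler is later joined with the per-user databases directory to resolve the on-disk Kuzu file when the dataset is opened, roughly:

    databases_directory_path = os.path.join(
        base_config.system_root_directory, "databases", str(user.id)
    )
    graph_file_path = os.path.join(databases_directory_path, f"{dataset_id}.pkl")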
+ + Args: + dataset_id: Dataset UUID + user: User object who owns the dataset and is making the request + + Returns: + dict: Connection details for the created Kuzu instance + + """ + from cognee.infrastructure.databases.graph.config import get_graph_config + + graph_config = get_graph_config() + + if graph_config.graph_database_provider != "kuzu": + raise ValueError( + "KuzuDatasetDatabaseHandler can only be used with Kuzu graph database provider." + ) + + # TODO: Add graph file path info for kuzu (also in DatasetDatabase model) + graph_db_name = f"{dataset_id}.pkl" + graph_db_url = graph_config.graph_database_url + graph_db_key = graph_config.graph_database_key + graph_db_username = graph_config.graph_database_username + graph_db_password = graph_config.graph_database_password + + return { + "graph_database_name": graph_db_name, + "graph_database_url": graph_db_url, + "graph_database_provider": graph_config.graph_database_provider, + "graph_database_key": graph_db_key, + "graph_database_username": graph_db_username, + "graph_database_password": graph_db_password, + } + + @classmethod + async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]): + pass diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py new file mode 100644 index 000000000..cc38abed0 --- /dev/null +++ b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py @@ -0,0 +1,118 @@ +import os +import asyncio +import requests +from uuid import UUID +from typing import Optional + +from cognee.infrastructure.databases.graph import get_graph_config +from cognee.modules.users.models import User + +from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface + + +class Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): + """ + Handler for interacting with Neo4j Aura Dataset databases. + """ + + @classmethod + async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + """ + Create a new Neo4j Aura instance for the dataset. Return connection info that will be mapped to the dataset. + + Args: + dataset_id: Dataset UUID + user: User object who owns the dataset and is making the request + + Returns: + dict: Connection details for the created Neo4j instance + + """ + graph_config = get_graph_config() + + if graph_config.graph_database_provider != "neo4j": + raise ValueError( + "Neo4jAuraDatasetDatabaseHandler can only be used with Neo4j graph database provider." + ) + + graph_db_name = f"{dataset_id}" + + # Client credentials + client_id = os.environ.get("NEO4J_CLIENT_ID", None) + client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None) + tenant_id = os.environ.get("NEO4J_TENANT_ID", None) + + if client_id is None or client_secret is None or tenant_id is None: + raise ValueError( + "NEO4J_CLIENT_ID, NEO4J_CLIENT_SECRET, and NEO4J_TENANT_ID environment variables must be set to use Neo4j Aura DatasetDatabase Handling." 
+ ) + + # Make the request with HTTP Basic Auth + def get_aura_token(client_id: str, client_secret: str) -> dict: + url = "https://api.neo4j.io/oauth/token" + data = {"grant_type": "client_credentials"} # sent as application/x-www-form-urlencoded + + resp = requests.post(url, data=data, auth=(client_id, client_secret)) + resp.raise_for_status() # raises if the request failed + return resp.json() + + resp = get_aura_token(client_id, client_secret) + + url = "https://api.neo4j.io/v1/instances" + + headers = { + "accept": "application/json", + "Authorization": f"Bearer {resp['access_token']}", + "Content-Type": "application/json", + } + + # TODO: Maybe we can allow **kwargs parameter forwarding for cases like these + # Too allow different configurations between datasets + payload = { + "version": "5", + "region": "europe-west1", + "memory": "1GB", + "name": graph_db_name[ + 0:29 + ], # TODO: Find better name to name Neo4j instance within 30 character limit + "type": "professional-db", + "tenant_id": tenant_id, + "cloud_provider": "gcp", + } + + response = requests.post(url, headers=headers, json=payload) + + graph_db_name = "neo4j" # Has to be 'neo4j' for Aura + graph_db_url = response.json()["data"]["connection_url"] + graph_db_key = resp["access_token"] + graph_db_username = response.json()["data"]["username"] + graph_db_password = response.json()["data"]["password"] + + async def _wait_for_neo4j_instance_provisioning(instance_id: str, headers: dict): + # Poll until the instance is running + status_url = f"https://api.neo4j.io/v1/instances/{instance_id}" + status = "" + for attempt in range(30): # Try for up to ~5 minutes + status_resp = requests.get(status_url, headers=headers) + status = status_resp.json()["data"]["status"] + if status.lower() == "running": + return + await asyncio.sleep(10) + raise TimeoutError( + f"Neo4j instance '{graph_db_name}' did not become ready within 5 minutes. 
Status: {status}" + ) + + instance_id = response.json()["data"]["id"] + await _wait_for_neo4j_instance_provisioning(instance_id, headers) + return { + "graph_database_name": graph_db_name, + "graph_database_url": graph_db_url, + "graph_database_provider": "neo4j", + "graph_database_key": graph_db_key, # TODO: Hashing of keys/passwords in relational DB + "graph_database_username": graph_db_username, + "graph_database_password": graph_db_password, + } + + @classmethod + async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]): + pass diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 43e5ea654..6216e107e 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -1,9 +1,7 @@ """Neo4j Adapter for Graph Database""" -import os import json import asyncio -import requests from uuid import UUID from textwrap import dedent from neo4j import AsyncSession @@ -14,7 +12,6 @@ from typing import Optional, Any, List, Dict, Type, Tuple from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int -from cognee.modules.users.models import User from cognee.tasks.temporal_graph.models import Timestamp from cognee.shared.logging_utils import get_logger, ERROR from cognee.infrastructure.databases.graph.graph_db_interface import ( @@ -1473,89 +1470,3 @@ class Neo4jAdapter(GraphDBInterface): time_ids_list = [item["id"] for item in time_nodes if "id" in item] return ", ".join(f"'{uid}'" for uid in time_ids_list) - - @classmethod - async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: - """ - Create a new Neo4j Aura instance for the dataset. Return connection info that will be mapped to the dataset. 
- - Args: - dataset_id: Dataset UUID - user: User object who owns the dataset and is making the request - - Returns: - dict: Connection details for the created Neo4j instance - - """ - graph_db_name = f"{dataset_id}" - - # Client credentials - client_id = os.environ.get("NEO4J_CLIENT_ID", None) - client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None) - tenant_id = os.environ.get("NEO4J_TENANT_ID", None) - - # Make the request with HTTP Basic Auth - def get_aura_token(client_id: str, client_secret: str) -> dict: - url = "https://api.neo4j.io/oauth/token" - data = {"grant_type": "client_credentials"} # sent as application/x-www-form-urlencoded - - resp = requests.post(url, data=data, auth=(client_id, client_secret)) - resp.raise_for_status() # raises if the request failed - return resp.json() - - resp = get_aura_token(client_id, client_secret) - - url = "https://api.neo4j.io/v1/instances" - - headers = { - "accept": "application/json", - "Authorization": f"Bearer {resp['access_token']}", - "Content-Type": "application/json", - } - - # TODO: Maybe we can allow **kwargs parameter forwarding for cases like these - # Too allow different configurations between datasets - payload = { - "version": "5", - "region": "europe-west1", - "memory": "1GB", - "name": graph_db_name[ - 0:29 - ], # TODO: Find better name to name Neo4j instance within 30 character limit - "type": "professional-db", - "tenant_id": tenant_id, - "cloud_provider": "gcp", - } - - response = requests.post(url, headers=headers, json=payload) - - graph_db_name = "neo4j" - graph_db_url = response.json()["data"]["connection_url"] - graph_db_key = resp["access_token"] - graph_db_username = response.json()["data"]["username"] - graph_db_password = response.json()["data"]["password"] - - async def _wait_for_neo4j_instance_provisioning(instance_id: str, headers: dict): - # Poll until the instance is running - status_url = f"https://api.neo4j.io/v1/instances/{instance_id}" - status = "" - for attempt in range(30): # Try for up to ~5 minutes - status_resp = requests.get(status_url, headers=headers) - status = status_resp.json()["data"]["status"] - if status.lower() == "running": - return - await asyncio.sleep(10) - raise TimeoutError( - f"Neo4j instance '{graph_db_name}' did not become ready within 5 minutes. 
Status: {status}" - ) - - instance_id = response.json()["data"]["id"] - await _wait_for_neo4j_instance_provisioning(instance_id, headers) - return { - "graph_database_name": graph_db_name, - "graph_database_url": graph_db_url, - "graph_database_provider": "neo4j", - "graph_database_key": graph_db_key, # TODO: Hashing of keys/passwords in relational DB - "graph_database_username": graph_db_username, - "graph_database_password": graph_db_password, - } diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index b60640d4c..f4bacca7e 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -20,61 +20,23 @@ from cognee.modules.users.models import User async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict: vector_config = get_vectordb_config() - # Determine vector configuration - if vector_config.vector_db_provider == "lancedb": - # TODO: Have the create_database method be called from interface adapter automatically for all providers instead of specifically here - from cognee.infrastructure.databases.vector.lancedb.LanceDBAdapter import LanceDBAdapter + from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( + supported_dataset_database_handlers, + ) - return await LanceDBAdapter.create_dataset(dataset_id, user) - - else: - # Note: for hybrid databases both graph and vector DB name have to be the same - vector_db_name = vector_config.vector_db_name - vector_db_url = vector_config.vector_database_url - - return { - "vector_database_name": vector_db_name, - "vector_database_url": vector_db_url, - "vector_database_provider": vector_config.vector_db_provider, - "vector_database_key": vector_config.vector_db_key, - } + handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler] + return await handler.create_dataset(dataset_id, user) async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict: graph_config = get_graph_config() - # Determine graph database URL - if graph_config.graph_database_provider == "neo4j": - from cognee.infrastructure.databases.graph.neo4j_driver.adapter import Neo4jAdapter - return await Neo4jAdapter.create_dataset(dataset_id, user) + from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( + supported_dataset_database_handlers, + ) - elif graph_config.graph_database_provider == "kuzu": - # TODO: Add graph file path info for kuzu (also in DatasetDatabase model) - graph_db_name = f"{dataset_id}.pkl" - graph_db_url = graph_config.graph_database_url - graph_db_key = graph_config.graph_database_key - graph_db_username = graph_config.graph_database_username - graph_db_password = graph_config.graph_database_password - elif graph_config.graph_database_provider == "falkor": - # Note: for hybrid databases both graph and vector DB name have to be the same - graph_db_name = f"{dataset_id}" - graph_db_url = graph_config.graph_database_url - graph_db_key = graph_config.graph_database_key - graph_db_username = graph_config.graph_database_username - graph_db_password = graph_config.graph_database_password - else: - raise EnvironmentError( - f"Unsupported graph database provider for backend access control: {graph_config.graph_database_provider}" - ) - - return { - "graph_database_name": graph_db_name, - "graph_database_url": 
graph_db_url, - "graph_database_provider": graph_config.graph_database_provider, - "graph_database_key": graph_db_key, # TODO: Hashing of keys/passwords in relational DB - "graph_database_username": graph_db_username, - "graph_database_password": graph_db_password, - } + handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler] + return await handler.create_dataset(dataset_id, user) async def _existing_dataset_database( diff --git a/cognee/infrastructure/databases/vector/config.py b/cognee/infrastructure/databases/vector/config.py index 7d28f1668..86b2a0fce 100644 --- a/cognee/infrastructure/databases/vector/config.py +++ b/cognee/infrastructure/databases/vector/config.py @@ -28,6 +28,7 @@ class VectorConfig(BaseSettings): vector_db_name: str = "" vector_db_key: str = "" vector_db_provider: str = "lancedb" + vector_dataset_database_handler: str = "lancedb" model_config = SettingsConfigDict(env_file=".env", extra="allow") @@ -63,6 +64,7 @@ class VectorConfig(BaseSettings): "vector_db_name": self.vector_db_name, "vector_db_key": self.vector_db_key, "vector_db_provider": self.vector_db_provider, + "vector_dataset_database_handler": self.vector_dataset_database_handler, } diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py index b182f084b..02e01e288 100644 --- a/cognee/infrastructure/databases/vector/create_vector_engine.py +++ b/cognee/infrastructure/databases/vector/create_vector_engine.py @@ -12,6 +12,7 @@ def create_vector_engine( vector_db_name: str, vector_db_port: str = "", vector_db_key: str = "", + vector_dataset_database_handler: str = "", ): """ Create a vector database engine based on the specified provider. diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index a93fbc818..b52f78517 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -362,20 +362,3 @@ class LanceDBAdapter(VectorDBInterface): }, exclude_fields=["metadata"] + related_models_fields, ) - - @classmethod - async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: - vector_config = get_vectordb_config() - base_config = get_base_config() - databases_directory_path = os.path.join( - base_config.system_root_directory, "databases", str(user.id) - ) - - vector_db_name = f"{dataset_id}.lance.db" - - return { - "vector_database_name": vector_db_name, - "vector_database_url": os.path.join(databases_directory_path, vector_db_name), - "vector_database_provider": vector_config.vector_db_provider, - "vector_database_key": vector_config.vector_db_key, - } diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py new file mode 100644 index 000000000..8a80dddcf --- /dev/null +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py @@ -0,0 +1,41 @@ +import os +from uuid import UUID +from typing import Optional + +from cognee.modules.users.models import User +from cognee.base_config import get_base_config +from cognee.infrastructure.databases.vector import get_vectordb_config +from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface + + +class LanceDBDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): + 
""" + Handler for interacting with LanceDB Dataset databases. + """ + + @classmethod + async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> dict: + vector_config = get_vectordb_config() + base_config = get_base_config() + + if vector_config.vector_db_provider != "lancedb": + raise ValueError( + "LanceDBDatasetDatabaseHandler can only be used with LanceDB vector database provider." + ) + + databases_directory_path = os.path.join( + base_config.system_root_directory, "databases", str(user.id) + ) + + vector_db_name = f"{dataset_id}.lance.db" + + return { + "vector_database_name": vector_db_name, + "vector_database_url": os.path.join(databases_directory_path, vector_db_name), + "vector_database_provider": vector_config.vector_db_provider, + "vector_database_key": vector_config.vector_db_key, + } + + @classmethod + async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]): + pass From 593f17fcdcbc7f064c5bf25371bb371fc629eeed Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 25 Nov 2025 15:41:01 +0100 Subject: [PATCH 27/54] refactor: Add better handling of configuration for dataset to database handler --- .env.template | 4 ++ cognee/context_global_variables.py | 65 ++++++++++++++----- .../supported_dataset_database_handlers.py | 9 ++- .../use_dataset_database_handler.py | 9 ++- .../utils/get_or_create_dataset_database.py | 8 +-- cognee/shared/logging_utils.py | 4 ++ 6 files changed, 71 insertions(+), 28 deletions(-) diff --git a/.env.template b/.env.template index ae2cb1338..d178965e8 100644 --- a/.env.template +++ b/.env.template @@ -93,6 +93,8 @@ DB_NAME=cognee_db # Default (local file-based) GRAPH_DATABASE_PROVIDER="kuzu" +# Handler for multi-user access control mode, it handles how should the mapping/creation of separate DBs be handled per Cognee dataset +GRAPH_DATASET_DATABASE_HANDLER="kuzu" # -- To switch to Remote Kuzu uncomment and fill these: ------------------------------------------------------------- #GRAPH_DATABASE_PROVIDER="kuzu" @@ -117,6 +119,8 @@ VECTOR_DB_PROVIDER="lancedb" # Not needed if a cloud vector database is not used VECTOR_DB_URL= VECTOR_DB_KEY= +# Handler for multi-user access control mode, it handles how should the mapping/creation of separate DBs be handled per Cognee dataset +VECTOR_DATASET_DATABASE_HANDLER="lancedb" ################################################################################ # 🧩 Ontology resolver settings diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 2b6ffa058..0e7e16178 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -4,8 +4,8 @@ from typing import Union from uuid import UUID from cognee.base_config import get_base_config -from cognee.infrastructure.databases.vector.config import get_vectordb_context_config -from cognee.infrastructure.databases.graph.config import get_graph_context_config +from cognee.infrastructure.databases.vector.config import get_vectordb_config +from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.infrastructure.databases.utils import get_or_create_dataset_database from cognee.infrastructure.files.storage.config import file_storage_config from cognee.modules.users.methods import get_user @@ -16,23 +16,59 @@ vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) -VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"] 
-GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor", "neo4j"] - async def set_session_user_context_variable(user): session_user.set(user) def multi_user_support_possible(): - graph_db_config = get_graph_context_config() - vector_db_config = get_vectordb_context_config() - # TODO: Make sure dataset database handler and provider match, remove multi_user support check, add error if no dataset database handler exists for provider - return ( - graph_db_config["graph_database_provider"] in GRAPH_DBS_WITH_MULTI_USER_SUPPORT - and vector_db_config["vector_db_provider"] in VECTOR_DBS_WITH_MULTI_USER_SUPPORT + graph_db_config = get_graph_config() + vector_db_config = get_vectordb_config() + + graph_handler = graph_db_config.graph_dataset_database_handler + vector_handler = vector_db_config.vector_dataset_database_handler + from cognee.infrastructure.databases.dataset_database_handler import ( + supported_dataset_database_handlers, ) + if graph_handler not in supported_dataset_database_handlers: + raise EnvironmentError( + "Unsupported graph dataset to database handler configured. Cannot add support for multi-user access control mode. Please use a supported graph dataset to database handler or set the environment variables ENABLE_BACKEND_ACCESS_CONTROL to false to switch off multi-user access control mode.\n" + f"Selected graph dataset to database handler: {graph_handler}\n" + f"Supported dataset to database handlers: {list(supported_dataset_database_handlers.keys())}\n" + ) + + if vector_handler not in supported_dataset_database_handlers: + raise EnvironmentError( + "Unsupported vector dataset to database handler configured. Cannot add support for multi-user access control mode. Please use a supported vector dataset to database handler or set the environment variables ENABLE_BACKEND_ACCESS_CONTROL to false to switch off multi-user access control mode.\n" + f"Selected vector dataset to database handler: {vector_handler}\n" + f"Supported dataset to database handlers: {list(supported_dataset_database_handlers.keys())}\n" + ) + + if ( + supported_dataset_database_handlers[graph_handler]["handler_provider"] + != graph_db_config.graph_database_provider + ): + raise EnvironmentError( + "The selected graph dataset to database handler does not work with the configured graph database provider. Cannot add support for multi-user access control mode. Please use a supported graph dataset to database handler or set the environment variables ENABLE_BACKEND_ACCESS_CONTROL to false to switch off multi-user access control mode.\n" + f"Selected graph database provider: {graph_db_config.graph_database_provider}\n" + f"Selected graph dataset to database handler: {graph_handler}\n" + f"Supported dataset to database handlers: {list(supported_dataset_database_handlers.keys())}\n" + ) + + if ( + supported_dataset_database_handlers[vector_handler]["handler_provider"] + != vector_db_config.vector_db_provider + ): + raise EnvironmentError( + "The selected vector dataset to database handler does not work with the configured vector database provider. Cannot add support for multi-user access control mode. 
Please use a supported vector dataset to database handler or set the environment variables ENABLE_BACKEND_ACCESS_CONTROL to false to switch off multi-user access control mode.\n" + f"Selected vector database provider: {vector_db_config.vector_db_provider}\n" + f"Selected vector dataset to database handler: {vector_handler}\n" + f"Supported dataset to database handlers: {list(supported_dataset_database_handlers.keys())}\n" + ) + + return True + def backend_access_control_enabled(): backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None) @@ -42,12 +78,7 @@ def backend_access_control_enabled(): return multi_user_support_possible() elif backend_access_control.lower() == "true": # If enabled, ensure that the current graph and vector DBs can support it - multi_user_support = multi_user_support_possible() - if not multi_user_support: - raise EnvironmentError( - "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control." - ) - return True + return multi_user_support_possible() return False diff --git a/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py b/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py index 9cc7d9f93..adaa45e33 100644 --- a/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +++ b/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py @@ -9,7 +9,10 @@ from cognee.infrastructure.databases.graph.kuzu.KuzuDatasetDatabaseHandler impor ) supported_dataset_database_handlers = { - "neo4j_aura": Neo4jAuraDatasetDatabaseHandler, - "lancedb": LanceDBDatasetDatabaseHandler, - "kuzu": KuzuDatasetDatabaseHandler, + "neo4j_aura": { + "handler_instance": Neo4jAuraDatasetDatabaseHandler, + "handler_provider": "neo4j", + }, + "lancedb": {"handler_instance": LanceDBDatasetDatabaseHandler, "handler_provider": "lancedb"}, + "kuzu": {"handler_instance": KuzuDatasetDatabaseHandler, "handler_provider": "kuzu"}, } diff --git a/cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py b/cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py index a583de354..bca2128ee 100644 --- a/cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +++ b/cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py @@ -1,5 +1,10 @@ from .supported_dataset_database_handlers import supported_dataset_database_handlers -def use_dataset_database_handler(dataset_database_handler_name, dataset_database_handler): - supported_dataset_database_handlers[dataset_database_handler_name] = dataset_database_handler +def use_dataset_database_handler( + dataset_database_handler_name, dataset_database_handler, dataset_database_provider +): + supported_dataset_database_handlers[dataset_database_handler_name] = { + "handler_instance": dataset_database_handler, + "handler_provider": dataset_database_provider, + } diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index f4bacca7e..665355e30 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -1,13 +1,9 @@ -import os 
-import asyncio -import requests from uuid import UUID from typing import Union, Optional from sqlalchemy import select from sqlalchemy.exc import IntegrityError -from cognee.base_config import get_base_config from cognee.modules.data.methods import create_dataset from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.vector import get_vectordb_config @@ -25,7 +21,7 @@ async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict: ) handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler] - return await handler.create_dataset(dataset_id, user) + return await handler["handler_instance"].create_dataset(dataset_id, user) async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict: @@ -36,7 +32,7 @@ async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict: ) handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler] - return await handler.create_dataset(dataset_id, user) + return await handler["handler_instance"].create_dataset(dataset_id, user) async def _existing_dataset_database( diff --git a/cognee/shared/logging_utils.py b/cognee/shared/logging_utils.py index e8efde72c..70a0bd37e 100644 --- a/cognee/shared/logging_utils.py +++ b/cognee/shared/logging_utils.py @@ -534,6 +534,10 @@ def setup_logging(log_level=None, name=None): # Get a configured logger and log system information logger = structlog.get_logger(name if name else __name__) + logger.warning( + "From version 0.5.0 onwards, Cognee will run with multi-user access control mode set to on by default. Data isolation between different users and datasets will be enforced and data created before multi-user access control mode was turned on won't be accessible by default. To disable multi-user access control mode and regain access to old data set the environment variable ENABLE_BACKEND_ACCESS_CONTROL to false before starting Cognee. For more information, please refer to the Cognee documentation." 
+ ) + if logs_dir is not None: logger.info(f"Log file created at: {log_file_path}", log_file=log_file_path) From 2e02aafbaed6caa38c86b343b6fa4a1cef51683a Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 25 Nov 2025 15:55:36 +0100 Subject: [PATCH 28/54] refactor: Remove unused imports --- .../databases/vector/lancedb/LanceDBAdapter.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index b52f78517..30631ac4c 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -1,15 +1,10 @@ import asyncio from os import path -import os -from uuid import UUID import lancedb from pydantic import BaseModel from lancedb.pydantic import LanceModel, Vector from typing import Generic, List, Optional, TypeVar, Union, get_args, get_origin, get_type_hints -from cognee.base_config import get_base_config -from cognee.infrastructure.databases.vector import get_vectordb_config -from cognee.modules.users.models import User from cognee.infrastructure.databases.exceptions import MissingQueryParameterError from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine.utils import parse_id From 5f3b7764068ea7c31013b42d8133f2375930d4d0 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 25 Nov 2025 16:38:34 +0100 Subject: [PATCH 29/54] chore: add todo for enhancing db connections --- cognee/modules/users/models/DatasetDatabase.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index 4bbfffe4c..75e650bcd 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ b/cognee/modules/users/models/DatasetDatabase.py @@ -24,6 +24,9 @@ class DatasetDatabase(Base): vector_database_key = Column(String, unique=False, nullable=True) graph_database_key = Column(String, unique=False, nullable=True) + # TODO: Instead of specifying and forwawrding all these individual fields, consider using a JSON field to store + # configuration details for different database types. This would make it more flexible to add new database types + # without changing the database schema. 
graph_database_username = Column(String, unique=False, nullable=True) graph_database_password = Column(String, unique=False, nullable=True) From 69777ef0a5d80b3a2a10d91d59a9e4f051d019ca Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 25 Nov 2025 17:53:21 +0100 Subject: [PATCH 30/54] feat: Add ability to handle custom connection resolution to avoid storing security critical data in rel dbx --- cognee/context_global_variables.py | 11 ++++- .../dataset_database_handler_interface.py | 40 +++++++++++++++++- .../graph/kuzu/KuzuDatasetDatabaseHandler.py | 6 ++- .../Neo4jAuraDatasetDatabaseHandler.py | 8 ++-- .../databases/utils/__init__.py | 1 + ...esolve_dataset_database_connection_info.py | 42 +++++++++++++++++++ .../modules/users/models/DatasetDatabase.py | 6 +-- 7 files changed, 103 insertions(+), 11 deletions(-) create mode 100644 cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 0e7e16178..58fff2dff 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -7,6 +7,7 @@ from cognee.base_config import get_base_config from cognee.infrastructure.databases.vector.config import get_vectordb_config from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.infrastructure.databases.utils import get_or_create_dataset_database +from cognee.infrastructure.databases.utils import resolve_dataset_database_connection_info from cognee.infrastructure.files.storage.config import file_storage_config from cognee.modules.users.methods import get_user @@ -108,6 +109,8 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ # To ensure permissions are enforced properly all datasets will have their own databases dataset_database = await get_or_create_dataset_database(dataset, user) + # Ensure that all connection info is resolved properly + dataset_database = await resolve_dataset_database_connection_info(dataset_database) base_config = get_base_config() data_root_directory = os.path.join( @@ -133,8 +136,12 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ "graph_file_path": os.path.join( databases_directory_path, dataset_database.graph_database_name ), - "graph_database_username": dataset_database.graph_database_username, - "graph_database_password": dataset_database.graph_database_password, + "graph_database_username": dataset_database.graph_database_connection_info.get( + "graph_database_username", "" + ), + "graph_database_password": dataset_database.graph_database_connection_info.get( + "graph_database_password", "" + ), } storage_config = { diff --git a/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py b/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py index 6dadee6cf..01ee46c48 100644 --- a/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +++ b/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py @@ -3,6 +3,7 @@ from uuid import UUID from abc import ABC, abstractmethod from cognee.modules.users.models.User import User +from cognee.modules.users.models.DatasetDatabase import DatasetDatabase class DatasetDatabaseHandlerInterface(ABC): @@ -10,7 +11,7 @@ class DatasetDatabaseHandlerInterface(ABC): @abstractmethod async def create_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) 
-> dict: """ - Return a dictionary with connection info for a graph or vector database for the given dataset. + Return a dictionary with database connection/resolution info for a graph or vector database for the given dataset. Function can auto handle deploying of the actual database if needed, but is not necessary. Only providing connection info is sufficient, this info will be mapped when trying to connect to the provided dataset in the future. Needed for Cognee multi-tenant/multi-user and backend access control support. @@ -18,6 +19,10 @@ class DatasetDatabaseHandlerInterface(ABC): Dictionary returned from this function will be used to create a DatasetDatabase row in the relational database. From which internal mapping of dataset -> database connection info will be done. + The returned dictionary is stored verbatim in the relational database and is later passed to + resolve_dataset_connection_info() at connection time. For safe credential handling, prefer + returning only references to secrets or role identifiers, not plaintext credentials. + Each dataset needs to map to a unique graph or vector database when backend access control is enabled to facilitate a separation of concern for data. Args: @@ -28,6 +33,39 @@ class DatasetDatabaseHandlerInterface(ABC): """ pass + @classmethod + async def resolve_dataset_connection_info( + cls, dataset_database: DatasetDatabase + ) -> DatasetDatabase: + """ + Resolve runtime connection details for a dataset’s backing graph/vector database. + Function is intended to be overwritten to implement custom logic for resolving connection info. + + This method is invoked right before the application opens a connection for a given dataset. + It receives the DatasetDatabase row that was persisted when create_dataset() ran and must + return a modified instance of DatasetDatabase with concrete connection parameters that the client/driver can use. + Do not update these new DatasetDatabase values in the relational database to avoid storing secure credentials. + + In case of separate graph and vector database handlers, each handler should implement its own logic for resolving + connection info and only change parameters related to its appropriate database, the resolution function will then + be called one after another with the updated DatasetDatabase value from the previous function as the input. + + Typical behavior: + - If the DatasetDatabase row already contains raw connection fields (e.g., host/port/db/user/password + or api_url/api_key), return them as-is. + - If the row stores only references (e.g., secret IDs, vault paths, cloud resource ARNs/IDs, IAM + roles, SSO tokens), resolve those references by calling the appropriate secret manager or provider + API to obtain short-lived credentials and assemble the final connection DatasetDatabase object. + - Do not persist any resolved or decrypted secrets back to the relational database. Return them only + to the caller. 
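+
+        Illustrative sketch of an override (the secret store client and the "secret_ref"
+        key are hypothetical, not part of Cognee):
+
+            @classmethod
+            async def resolve_dataset_connection_info(cls, dataset_database):
+                secret_ref = dataset_database.graph_database_connection_info.get("secret_ref")
+                username, password = my_secret_store.read(secret_ref)  # hypothetical call
+                dataset_database.graph_database_connection_info = {
+                    "graph_database_username": username,
+                    "graph_database_password": password,
+                }
+                return dataset_database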
+ + Args: + dataset_database: DatasetDatabase row from the relational database + Returns: + DatasetDatabase: Updated instance with resolved connection info + """ + return dataset_database + @classmethod @abstractmethod async def delete_dataset(cls, dataset_id: UUID, user: User) -> None: diff --git a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py index 8859422f9..a2b2da8f4 100644 --- a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py @@ -48,8 +48,10 @@ class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): "graph_database_url": graph_db_url, "graph_database_provider": graph_config.graph_database_provider, "graph_database_key": graph_db_key, - "graph_database_username": graph_db_username, - "graph_database_password": graph_db_password, + "graph_database_connection_info": { + "graph_database_username": graph_db_username, + "graph_database_password": graph_db_password, + }, } @classmethod diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py index cc38abed0..d1e5eee6f 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py @@ -108,9 +108,11 @@ class Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): "graph_database_name": graph_db_name, "graph_database_url": graph_db_url, "graph_database_provider": "neo4j", - "graph_database_key": graph_db_key, # TODO: Hashing of keys/passwords in relational DB - "graph_database_username": graph_db_username, - "graph_database_password": graph_db_password, + "graph_database_key": graph_db_key, + "graph_database_connection_info": { # TODO: Hashing of keys/passwords in relational DB + "graph_database_username": graph_db_username, + "graph_database_password": graph_db_password, + }, } @classmethod diff --git a/cognee/infrastructure/databases/utils/__init__.py b/cognee/infrastructure/databases/utils/__init__.py index 1dfa15640..f31d1e0dc 100644 --- a/cognee/infrastructure/databases/utils/__init__.py +++ b/cognee/infrastructure/databases/utils/__init__.py @@ -1 +1,2 @@ from .get_or_create_dataset_database import get_or_create_dataset_database +from .resolve_dataset_database_connection_info import resolve_dataset_database_connection_info diff --git a/cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py b/cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py new file mode 100644 index 000000000..4d8c19403 --- /dev/null +++ b/cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py @@ -0,0 +1,42 @@ +from cognee.infrastructure.databases.vector import get_vectordb_config +from cognee.infrastructure.databases.graph.config import get_graph_config +from cognee.modules.users.models.DatasetDatabase import DatasetDatabase + + +async def _get_vector_db_connection_info(dataset_database: DatasetDatabase) -> DatasetDatabase: + vector_config = get_vectordb_config() + + from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( + supported_dataset_database_handlers, + ) + + handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler] 
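+    # Each registry entry is a dict with "handler_instance" (the handler class) and
+    # "handler_provider" (the database provider it supports); resolve_dataset_connection_info
+    # returns the row unchanged by default and can be overridden to swap stored references
+    # for real credentials at connection time.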
+ return await handler["handler_instance"].resolve_dataset_connection_info(dataset_database) + + +async def _get_graph_db_connection_info(dataset_database: DatasetDatabase) -> DatasetDatabase: + graph_config = get_graph_config() + + from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( + supported_dataset_database_handlers, + ) + + handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler] + return await handler["handler_instance"].resolve_dataset_connection_info(dataset_database) + + +async def resolve_dataset_database_connection_info( + dataset_database: DatasetDatabase, +) -> DatasetDatabase: + """ + Resolve the connection info for the given DatasetDatabase instance. + Resolve both vector and graph database connection info and return the updated DatasetDatabase instance. + + Args: + dataset_database: DatasetDatabase instance + Returns: + DatasetDatabase instance with resolved connection info + """ + dataset_database = await _get_vector_db_connection_info(dataset_database) + dataset_database = await _get_graph_db_connection_info(dataset_database) + return dataset_database diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index 75e650bcd..b864fb951 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ b/cognee/modules/users/models/DatasetDatabase.py @@ -1,6 +1,6 @@ from datetime import datetime, timezone -from sqlalchemy import Column, DateTime, String, UUID, ForeignKey +from sqlalchemy import Column, DateTime, String, UUID, ForeignKey, JSON from cognee.infrastructure.databases.relational import Base @@ -27,8 +27,8 @@ class DatasetDatabase(Base): # TODO: Instead of specifying and forwawrding all these individual fields, consider using a JSON field to store # configuration details for different database types. This would make it more flexible to add new database types # without changing the database schema. - graph_database_username = Column(String, unique=False, nullable=True) - graph_database_password = Column(String, unique=False, nullable=True) + graph_database_connection_info = Column(JSON, unique=False, nullable=True) + vector_database_connection_info = Column(JSON, unique=False, nullable=True) created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) From cf9edf2663e87cfa9a77972015f1f39beb4a462f Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 25 Nov 2025 18:03:35 +0100 Subject: [PATCH 31/54] chore: Add migration for new dataset database model field --- ...d2b2_expand_dataset_database_with_json_.py | 66 +++++++++++++++++++ .../modules/users/models/DatasetDatabase.py | 4 +- 2 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py diff --git a/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py b/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py new file mode 100644 index 000000000..becd29226 --- /dev/null +++ b/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py @@ -0,0 +1,66 @@ +"""Expand dataset database with json connection field + +Revision ID: 46a6ce2bd2b2 +Revises: 76625596c5c3 +Create Date: 2025-11-25 17:56:28.938931 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = "46a6ce2bd2b2" +down_revision: Union[str, None] = "76625596c5c3" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + vector_database_connection_info_column = _get_column( + insp, "dataset_database", "vector_database_connection_info" + ) + if not vector_database_connection_info_column: + op.add_column( + "dataset_database", + sa.Column( + "vector_database_connection_info", + sa.JSON(), + unique=False, + nullable=False, + default={}, + ), + ) + + graph_database_connection_info_column = _get_column( + insp, "dataset_database", "graph_database_connection_info" + ) + if not graph_database_connection_info_column: + op.add_column( + "dataset_database", + sa.Column( + "graph_database_connection_info", + sa.JSON(), + unique=False, + nullable=False, + default={}, + ), + ) + + +def downgrade() -> None: + op.drop_column("dataset_database", "vector_database_connection_info") + op.drop_column("dataset_database", "graph_database_connection_info") diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index b864fb951..fee323d2f 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ b/cognee/modules/users/models/DatasetDatabase.py @@ -27,8 +27,8 @@ class DatasetDatabase(Base): # TODO: Instead of specifying and forwawrding all these individual fields, consider using a JSON field to store # configuration details for different database types. This would make it more flexible to add new database types # without changing the database schema. 
- graph_database_connection_info = Column(JSON, unique=False, nullable=True) - vector_database_connection_info = Column(JSON, unique=False, nullable=True) + graph_database_connection_info = Column(JSON, unique=False, nullable=False, default={}) + vector_database_connection_info = Column(JSON, unique=False, nullable=False, default={}) created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) From 1ff6a72fc7cf1ba9b8906ab7821162df5c924a18 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 26 Nov 2025 16:45:18 +0100 Subject: [PATCH 32/54] refactor: set default value to empty dictionary --- .../46a6ce2bd2b2_expand_dataset_database_with_json_.py | 4 ++-- cognee/modules/users/models/DatasetDatabase.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py b/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py index becd29226..a9e804f88 100644 --- a/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py +++ b/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py @@ -41,7 +41,7 @@ def upgrade() -> None: sa.JSON(), unique=False, nullable=False, - default={}, + server_default=sa.text("'{}'"), ), ) @@ -56,7 +56,7 @@ def upgrade() -> None: sa.JSON(), unique=False, nullable=False, - default={}, + server_default=sa.text("'{}'"), ), ) diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index fee323d2f..577828921 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ b/cognee/modules/users/models/DatasetDatabase.py @@ -1,6 +1,6 @@ from datetime import datetime, timezone -from sqlalchemy import Column, DateTime, String, UUID, ForeignKey, JSON +from sqlalchemy import Column, DateTime, String, UUID, ForeignKey, JSON, text from cognee.infrastructure.databases.relational import Base @@ -27,8 +27,12 @@ class DatasetDatabase(Base): # TODO: Instead of specifying and forwawrding all these individual fields, consider using a JSON field to store # configuration details for different database types. This would make it more flexible to add new database types # without changing the database schema. 
- graph_database_connection_info = Column(JSON, unique=False, nullable=False, default={}) - vector_database_connection_info = Column(JSON, unique=False, nullable=False, default={}) + graph_database_connection_info = Column( + JSON, unique=False, nullable=False, server_default=text("'{}'") + ) + vector_database_connection_info = Column( + JSON, unique=False, nullable=False, server_default=text("'{}'") + ) created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) From ddf802ff54e7f245545dab49a8be202bbb14f36c Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 27 Nov 2025 18:38:00 +0100 Subject: [PATCH 33/54] chore: Add migration of unique constraint for SQLite --- ...d2b2_expand_dataset_database_with_json_.py | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py b/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py index a9e804f88..e15a98b7c 100644 --- a/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py +++ b/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py @@ -18,6 +18,10 @@ down_revision: Union[str, None] = "76625596c5c3" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +graph_constraint_name = "dataset_database_graph_database_name_key" +vector_constraint_name = "dataset_database_vector_database_name_key" +TABLE_NAME = "dataset_database" + def _get_column(inspector, table, name, schema=None): for col in inspector.get_columns(table, schema=schema): @@ -26,10 +30,154 @@ def _get_column(inspector, table, name, schema=None): return None +def _recreate_table_without_unique_constraint_sqlite(op, insp): + """ + SQLite cannot drop unique constraints on individual columns. We must: + 1. Create a new table without the unique constraints. + 2. Copy data from the old table. + 3. Drop the old table. + 4. Rename the new table. 
+ """ + conn = op.get_bind() + + # Create new table definition (without unique constraints) + op.create_table( + f"{TABLE_NAME}_new", + sa.Column("owner_id", sa.UUID()), + sa.Column("dataset_id", sa.UUID(), primary_key=True, nullable=False), + sa.Column("vector_database_name", sa.String(), nullable=False), + sa.Column("graph_database_name", sa.String(), nullable=False), + sa.Column("vector_database_provider", sa.String(), nullable=False), + sa.Column("graph_database_provider", sa.String(), nullable=False), + sa.Column("vector_database_url", sa.String()), + sa.Column("graph_database_url", sa.String()), + sa.Column("vector_database_key", sa.String()), + sa.Column("graph_database_key", sa.String()), + sa.Column( + "graph_database_connection_info", + sa.JSON(), + nullable=False, + server_default=sa.text("'{}'"), + ), + sa.Column( + "vector_database_connection_info", + sa.JSON(), + nullable=False, + server_default=sa.text("'{}'"), + ), + sa.Column("created_at", sa.DateTime()), + sa.Column("updated_at", sa.DateTime()), + sa.ForeignKeyConstraint(["dataset_id"], ["datasets.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["owner_id"], ["principals.id"], ondelete="CASCADE"), + ) + + # Copy data into new table + conn.execute( + sa.text(f""" + INSERT INTO {TABLE_NAME}_new + SELECT + owner_id, + dataset_id, + vector_database_name, + graph_database_name, + vector_database_provider, + graph_database_provider, + vector_database_url, + graph_database_url, + vector_database_key, + graph_database_key, + COALESCE(graph_database_connection_info, '{{}}'), + COALESCE(vector_database_connection_info, '{{}}'), + created_at, + updated_at + FROM {TABLE_NAME} + """) + ) + + # Drop old table + op.drop_table(TABLE_NAME) + + # Rename new table + op.rename_table(f"{TABLE_NAME}_new", TABLE_NAME) + + +def _recreate_table_with_unique_constraint_sqlite(op, insp): + """ + SQLite cannot drop unique constraints on individual columns. We must: + 1. Create a new table without the unique constraints. + 2. Copy data from the old table. + 3. Drop the old table. + 4. Rename the new table. 
+ """ + conn = op.get_bind() + + # Create new table definition (without unique constraints) + op.create_table( + f"{TABLE_NAME}_new", + sa.Column("owner_id", sa.UUID()), + sa.Column("dataset_id", sa.UUID(), primary_key=True, nullable=False), + sa.Column("vector_database_name", sa.String(), nullable=False, unique=True), + sa.Column("graph_database_name", sa.String(), nullable=False, unique=True), + sa.Column("vector_database_provider", sa.String(), nullable=False), + sa.Column("graph_database_provider", sa.String(), nullable=False), + sa.Column("vector_database_url", sa.String()), + sa.Column("graph_database_url", sa.String()), + sa.Column("vector_database_key", sa.String()), + sa.Column("graph_database_key", sa.String()), + sa.Column( + "graph_database_connection_info", + sa.JSON(), + nullable=False, + server_default=sa.text("'{}'"), + ), + sa.Column( + "vector_database_connection_info", + sa.JSON(), + nullable=False, + server_default=sa.text("'{}'"), + ), + sa.Column("created_at", sa.DateTime()), + sa.Column("updated_at", sa.DateTime()), + sa.ForeignKeyConstraint(["dataset_id"], ["datasets.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["owner_id"], ["principals.id"], ondelete="CASCADE"), + ) + + # Copy data into new table + conn.execute( + sa.text(f""" + INSERT INTO {TABLE_NAME}_new + SELECT + owner_id, + dataset_id, + vector_database_name, + graph_database_name, + vector_database_provider, + graph_database_provider, + vector_database_url, + graph_database_url, + vector_database_key, + graph_database_key, + COALESCE(graph_database_connection_info, '{{}}'), + COALESCE(vector_database_connection_info, '{{}}'), + created_at, + updated_at + FROM {TABLE_NAME} + """) + ) + + # Drop old table + op.drop_table(TABLE_NAME) + + # Rename new table + op.rename_table(f"{TABLE_NAME}_new", TABLE_NAME) + + def upgrade() -> None: conn = op.get_bind() insp = sa.inspect(conn) + unique_constraints = insp.get_unique_constraints(TABLE_NAME) + vector_database_connection_info_column = _get_column( insp, "dataset_database", "vector_database_connection_info" ) @@ -60,7 +208,60 @@ def upgrade() -> None: ), ) + with op.batch_alter_table("dataset_database", schema=None) as batch_op: + # Drop the unique constraint to make unique=False + graph_constraint_to_drop = None + for uc in unique_constraints: + # Check if the constraint covers ONLY the target column + if uc["name"] == graph_constraint_name: + graph_constraint_to_drop = uc["name"] + break + + vector_constraint_to_drop = None + for uc in unique_constraints: + # Check if the constraint covers ONLY the target column + if uc["name"] == vector_constraint_name: + vector_constraint_to_drop = uc["name"] + break + + if ( + vector_constraint_to_drop + and graph_constraint_to_drop + and op.get_context().dialect.name == "postgresql" + ): + # PostgreSQL + batch_op.drop_constraint(graph_constraint_name, type_="unique") + batch_op.drop_constraint(vector_constraint_name, type_="unique") + + if op.get_context().dialect.name == "sqlite": + conn = op.get_bind() + # Fun fact: SQLite has hidden auto indexes for unique constraints that can't be dropped or accessed directly + # So we need to check for them and drop them by recreating the table (altering column also won't work) + result = conn.execute(sa.text("PRAGMA index_list('dataset_database')")) + rows = result.fetchall() + unique_auto_indexes = [row for row in rows if row[3] == "u"] + for row in unique_auto_indexes: + result = conn.execute(sa.text(f"PRAGMA index_info('{row[1]}')")) + index_info = result.fetchall() + if 
index_info[0][2] == "vector_database_name": + # In case a unique index exists on vector_database_name, drop it and the graph_database_name one + _recreate_table_without_unique_constraint_sqlite(op, insp) + def downgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + if op.get_context().dialect.name == "sqlite": + _recreate_table_with_unique_constraint_sqlite(op, insp) + elif op.get_context().dialect.name == "postgresql": + with op.batch_alter_table("dataset_database", schema=None) as batch_op: + # Re-add the unique constraint to return to unique=True + batch_op.create_unique_constraint(graph_constraint_name, ["graph_database_name"]) + + with op.batch_alter_table("dataset_database", schema=None) as batch_op: + # Re-add the unique constraint to return to unique=True + batch_op.create_unique_constraint(vector_constraint_name, ["vector_database_name"]) + op.drop_column("dataset_database", "vector_database_connection_info") op.drop_column("dataset_database", "graph_database_connection_info") From ed9b77444828110bbf6ee8a11fb18e652beae9f8 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 28 Nov 2025 13:11:45 +0100 Subject: [PATCH 34/54] chore: disable backend access control for deduplication test --- .github/workflows/e2e_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 3dea2548c..db595b0bf 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -147,6 +147,7 @@ jobs: - name: Run Deduplication Example env: ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Test needs OpenAI endpoint to handle multimedia OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} From 7e0be8f167e285780128cad811f7ef7d9df54cfd Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 28 Nov 2025 13:48:01 +0100 Subject: [PATCH 35/54] chore: disable backend access control for tests not supporting mode --- .github/workflows/examples_tests.yml | 1 + .github/workflows/graph_db_tests.yml | 1 + .github/workflows/vector_db_tests.yml | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index f7cc278cb..a9332cf25 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -72,6 +72,7 @@ jobs: - name: Run Descriptive Graph Metrics Example env: ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} diff --git a/.github/workflows/graph_db_tests.yml b/.github/workflows/graph_db_tests.yml index b07f6232f..e9fd7f4c2 100644 --- a/.github/workflows/graph_db_tests.yml +++ b/.github/workflows/graph_db_tests.yml @@ -78,6 +78,7 @@ jobs: - name: Run default Neo4j env: ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} diff --git a/.github/workflows/vector_db_tests.yml b/.github/workflows/vector_db_tests.yml index 06b58c962..65b70abe5 100644 --- a/.github/workflows/vector_db_tests.yml +++ b/.github/workflows/vector_db_tests.yml @@ -92,6 +92,7 @@ jobs: - name: Run PGVector Tests env: ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -127,4 +128,4 @@ jobs: 
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/test_lancedb.py \ No newline at end of file + run: uv run python ./cognee/tests/test_lancedb.py From a0c5867977a4fb993886a82dfcc56daa4ce24b70 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 28 Nov 2025 14:56:33 +0100 Subject: [PATCH 36/54] chore: disable backend access control --- .github/workflows/weighted_edges_tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 2b4a043bf..1c43187ad 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -94,6 +94,7 @@ jobs: - name: Run Weighted Edges Tests env: + ENABLE_BACKEND_ACCESS_CONTROL: 'false' GRAPH_DATABASE_PROVIDER: ${{ matrix.graph_db_provider }} GRAPH_DATABASE_URL: ${{ matrix.graph_db_provider == 'neo4j' && steps.neo4j.outputs.neo4j-url || '' }} GRAPH_DATABASE_USERNAME: ${{ matrix.graph_db_provider == 'neo4j' && steps.neo4j.outputs.neo4j-username || '' }} @@ -165,5 +166,3 @@ jobs: uses: astral-sh/ruff-action@v2 with: args: "format --check cognee/modules/graph/utils/get_graph_from_model.py cognee/tests/unit/interfaces/graph/test_weighted_edges.py examples/python/weighted_edges_example.py" - - \ No newline at end of file From d81d63390f84b2d381ae5583938257425b9d9e71 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 28 Nov 2025 16:33:46 +0100 Subject: [PATCH 37/54] test: Add test for dataset database handler creation --- .github/workflows/e2e_tests.yml | 25 ++++ .../users/methods/get_authenticated_user.py | 4 +- cognee/tests/test_dataset_database_handler.py | 135 ++++++++++++++++++ 3 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 cognee/tests/test_dataset_database_handler.py diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index db595b0bf..31398afd3 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -212,6 +212,31 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_parallel_databases.py + test-dataset-database-handler: + name: Test dataset database handlers in Cognee + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run dataset databases handler test + env: + ENV: 'dev' + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/test_dataset_database_handler.py + test-permissions: name: Test permissions with different situations in Cognee runs-on: ubuntu-22.04 diff --git a/cognee/modules/users/methods/get_authenticated_user.py b/cognee/modules/users/methods/get_authenticated_user.py index d6d701737..7dc721d7e 100644 --- a/cognee/modules/users/methods/get_authenticated_user.py +++ b/cognee/modules/users/methods/get_authenticated_user.py @@ -12,8 +12,8 @@ logger = get_logger("get_authenticated_user") # Check environment 
variable to determine authentication requirement REQUIRE_AUTHENTICATION = ( - os.getenv("REQUIRE_AUTHENTICATION", "false").lower() == "true" - or backend_access_control_enabled() + os.getenv("REQUIRE_AUTHENTICATION", "true").lower() == "true" + or os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", "true").lower() == "true" ) fastapi_users = get_fastapi_users() diff --git a/cognee/tests/test_dataset_database_handler.py b/cognee/tests/test_dataset_database_handler.py new file mode 100644 index 000000000..a42ab0a17 --- /dev/null +++ b/cognee/tests/test_dataset_database_handler.py @@ -0,0 +1,135 @@ +import asyncio +import os + +# Set custom dataset database handler environment variable +os.environ["VECTOR_DATASET_DATABASE_HANDLER"] = "custom_lancedb_handler" +os.environ["GRAPH_DATASET_DATABASE_HANDLER"] = "custom_kuzu_handler" + +import cognee +from cognee.modules.users.methods import get_default_user +from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface +from cognee.shared.logging_utils import setup_logging, ERROR +from cognee.api.v1.search import SearchType + + +class LanceDBTestDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): + @classmethod + async def create_dataset(cls, dataset_id, user): + import pathlib + + cognee_directory_path = str( + pathlib.Path( + os.path.join( + pathlib.Path(__file__).parent, ".cognee_system/test_dataset_database_handler" + ) + ).resolve() + ) + databases_directory_path = os.path.join(cognee_directory_path, "databases", str(user.id)) + os.makedirs(databases_directory_path, exist_ok=True) + + vector_db_name = "test.lance.db" + + return { + "vector_database_name": vector_db_name, + "vector_database_url": os.path.join(databases_directory_path, vector_db_name), + "vector_database_provider": "lancedb", + } + + +class KuzuTestDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): + @classmethod + async def create_dataset(cls, dataset_id, user): + databases_directory_path = os.path.join("databases", str(user.id)) + os.makedirs(databases_directory_path, exist_ok=True) + + graph_db_name = "test.kuzu" + return { + "graph_database_name": graph_db_name, + "graph_database_url": os.path.join(databases_directory_path, graph_db_name), + "graph_database_provider": "kuzu", + } + + +async def main(): + import pathlib + + data_directory_path = str( + pathlib.Path( + os.path.join( + pathlib.Path(__file__).parent, ".data_storage/test_dataset_database_handler" + ) + ).resolve() + ) + cognee.config.data_root_directory(data_directory_path) + cognee_directory_path = str( + pathlib.Path( + os.path.join( + pathlib.Path(__file__).parent, ".cognee_system/test_dataset_database_handler" + ) + ).resolve() + ) + cognee.config.system_root_directory(cognee_directory_path) + + # Create a clean slate for cognee -- reset data and system state + print("Resetting cognee data...") + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + print("Data reset complete.\n") + + # Add custom dataset database handler + from cognee.infrastructure.databases.dataset_database_handler.use_dataset_database_handler import ( + use_dataset_database_handler, + ) + + use_dataset_database_handler( + "custom_lancedb_handler", LanceDBTestDatasetDatabaseHandler, "lancedb" + ) + use_dataset_database_handler("custom_kuzu_handler", KuzuTestDatasetDatabaseHandler, "kuzu") + + # cognee knowledge graph will be created based on this text + text = """ + Natural language processing (NLP) is an interdisciplinary + subfield of computer science and 
information retrieval. + """ + + print("Adding text to cognee:") + print(text.strip()) + + # Add the text, and make it available for cognify + await cognee.add(text) + print("Text added successfully.\n") + + # Use LLMs and cognee to create knowledge graph + await cognee.cognify() + print("Cognify process complete.\n") + + query_text = "Tell me about NLP" + print(f"Searching cognee for insights with query: '{query_text}'") + # Query cognee for insights on the added text + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, query_text=query_text + ) + + print("Search results:") + # Display results + for result_text in search_results: + print(result_text) + + default_user = await get_default_user() + # Assert that the custom database files were created based on the custom dataset database handlers + assert os.path.exists( + os.path.join(cognee_directory_path, "databases", str(default_user.id), "test.kuzu") + ), "Graph database file not found." + assert os.path.exists( + os.path.join(cognee_directory_path, "databases", str(default_user.id), "test.lance.db") + ), "Vector database file not found." + + +if __name__ == "__main__": + logger = setup_logging(log_level=ERROR) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(main()) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) From 2e493cea4cd827bd2149875ab50805f8a664c6d2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 1 Dec 2025 15:07:01 +0100 Subject: [PATCH 38/54] chore: Disable multi user mode for tests that can't run it --- .github/workflows/distributed_test.yml | 1 + .github/workflows/temporal_graph_tests.yml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/distributed_test.yml b/.github/workflows/distributed_test.yml index 57bbb7459..3c9debfdf 100644 --- a/.github/workflows/distributed_test.yml +++ b/.github/workflows/distributed_test.yml @@ -47,6 +47,7 @@ jobs: - name: Run Distributed Cognee (Modal) env: ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} diff --git a/.github/workflows/temporal_graph_tests.yml b/.github/workflows/temporal_graph_tests.yml index 8917e432a..60e6fe7ef 100644 --- a/.github/workflows/temporal_graph_tests.yml +++ b/.github/workflows/temporal_graph_tests.yml @@ -72,6 +72,7 @@ jobs: - name: Run Temporal Graph with Neo4j (lancedb + sqlite) env: ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.OPENAI_MODEL }} LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -123,6 +124,7 @@ jobs: - name: Run Temporal Graph with Kuzu (postgres + pgvector) env: ENV: dev + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.OPENAI_MODEL }} LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -189,6 +191,7 @@ jobs: - name: Run Temporal Graph with Neo4j (postgres + pgvector) env: ENV: dev + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.OPENAI_MODEL }} LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} From 5cfc7b17618f4aa335ae0aba18fc5eebcf6f6a00 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 1 Dec 2025 15:58:19 +0100 Subject: [PATCH 39/54] chore: Disable backend access control when not supported --- .github/workflows/db_examples_tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/db_examples_tests.yml 
b/.github/workflows/db_examples_tests.yml index c58bc48ef..5062982d8 100644 --- a/.github/workflows/db_examples_tests.yml +++ b/.github/workflows/db_examples_tests.yml @@ -61,6 +61,7 @@ jobs: - name: Run Neo4j Example env: ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -142,6 +143,7 @@ jobs: - name: Run PGVector Example env: ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} From dbcb35a6dafab18e8c2b61e120228c0e5fc2dae4 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 2 Dec 2025 13:09:45 +0100 Subject: [PATCH 40/54] chore: remove unused imports, add optional for delete dataset statement --- .../dataset_database_handler_interface.py | 2 +- .../databases/graph/kuzu/KuzuDatasetDatabaseHandler.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py b/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py index 01ee46c48..1811feb12 100644 --- a/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +++ b/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py @@ -68,7 +68,7 @@ class DatasetDatabaseHandlerInterface(ABC): @classmethod @abstractmethod - async def delete_dataset(cls, dataset_id: UUID, user: User) -> None: + async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> None: """ Delete the graph or vector database for the given dataset. Function should auto handle deleting of the actual database or send a request to the proper service to delete/mark the database as not needed for the given dataset. diff --git a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py index a2b2da8f4..242249e00 100644 --- a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py @@ -1,6 +1,3 @@ -import os -import asyncio -import requests from uuid import UUID from typing import Optional From 92448767fe9fd13cea9428f24185e08654162502 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 2 Dec 2025 14:29:51 +0100 Subject: [PATCH 41/54] refactor: remove done TODOs --- .../databases/graph/kuzu/KuzuDatasetDatabaseHandler.py | 1 - .../databases/utils/get_or_create_dataset_database.py | 1 - cognee/modules/users/models/DatasetDatabase.py | 1 - 3 files changed, 3 deletions(-) diff --git a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py index 242249e00..9eb19c2a2 100644 --- a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py @@ -33,7 +33,6 @@ class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): "KuzuDatasetDatabaseHandler can only be used with Kuzu graph database provider." 
) - # TODO: Add graph file path info for kuzu (also in DatasetDatabase model) graph_db_name = f"{dataset_id}.pkl" graph_db_url = graph_config.graph_database_url graph_db_key = graph_config.graph_database_key diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 665355e30..3d03a699e 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -98,7 +98,6 @@ async def get_or_create_dataset_database( async with db_engine.get_async_session() as session: # If there are no existing rows build a new row - # TODO: Update Dataset Database migrations, also make sure database_name is not unique anymore record = DatasetDatabase( owner_id=user.id, dataset_id=dataset_id, diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index 577828921..15964f032 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ b/cognee/modules/users/models/DatasetDatabase.py @@ -24,7 +24,6 @@ class DatasetDatabase(Base): vector_database_key = Column(String, unique=False, nullable=True) graph_database_key = Column(String, unique=False, nullable=True) - # TODO: Instead of specifying and forwawrding all these individual fields, consider using a JSON field to store # configuration details for different database types. This would make it more flexible to add new database types # without changing the database schema. graph_database_connection_info = Column( From 1282905888ae4203326344b8bbb09a79dc1b7423 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 2 Dec 2025 16:34:16 +0100 Subject: [PATCH 42/54] feat: add password encryption for Neo4j --- .../Neo4jAuraDatasetDatabaseHandler.py | 49 ++++++++++++++++--- cognee/modules/data/deletion/prune_system.py | 6 ++- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py index d1e5eee6f..550c60d94 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py @@ -1,12 +1,14 @@ import os import asyncio import requests +import base64 +import hashlib from uuid import UUID from typing import Optional +from cryptography.fernet import Fernet from cognee.infrastructure.databases.graph import get_graph_config -from cognee.modules.users.models import User - +from cognee.modules.users.models import User, DatasetDatabase from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface @@ -37,10 +39,15 @@ class Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): graph_db_name = f"{dataset_id}" - # Client credentials + # Client credentials and encryption client_id = os.environ.get("NEO4J_CLIENT_ID", None) client_secret = os.environ.get("NEO4J_CLIENT_SECRET", None) tenant_id = os.environ.get("NEO4J_TENANT_ID", None) + encryption_env_key = os.environ.get("NEO4J_ENCRYPTION_KEY", "test_key") + encryption_key = base64.urlsafe_b64encode( + hashlib.sha256(encryption_env_key.encode()).digest() + ) + cipher = Fernet(encryption_key) if client_id is None or client_secret is None or tenant_id is None: raise ValueError( @@ -93,7 +100,9 @@ class 
Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): status_url = f"https://api.neo4j.io/v1/instances/{instance_id}" status = "" for attempt in range(30): # Try for up to ~5 minutes - status_resp = requests.get(status_url, headers=headers) + status_resp = requests.get( + status_url, headers=headers + ) # TODO: Use async requests with httpx status = status_resp.json()["data"]["status"] if status.lower() == "running": return @@ -104,17 +113,45 @@ class Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): instance_id = response.json()["data"]["id"] await _wait_for_neo4j_instance_provisioning(instance_id, headers) + + encrypted_db_password_bytes = cipher.encrypt(graph_db_password.encode()) + encrypted_db_password_string = encrypted_db_password_bytes.decode() + return { "graph_database_name": graph_db_name, "graph_database_url": graph_db_url, "graph_database_provider": "neo4j", "graph_database_key": graph_db_key, - "graph_database_connection_info": { # TODO: Hashing of keys/passwords in relational DB + "graph_database_connection_info": { "graph_database_username": graph_db_username, - "graph_database_password": graph_db_password, + "graph_database_password": encrypted_db_password_string, }, } + @classmethod + async def resolve_dataset_connection_info( + cls, dataset_database: DatasetDatabase + ) -> DatasetDatabase: + """ + Resolve and decrypt connection info for the Neo4j dataset database. + + Args: + dataset_database: DatasetDatabase instance containing encrypted connection info. + """ + encryption_env_key = os.environ.get("NEO4J_ENCRYPTION_KEY", "test_key") + encryption_key = base64.urlsafe_b64encode( + hashlib.sha256(encryption_env_key.encode()).digest() + ) + cipher = Fernet(encryption_key) + graph_db_password = cipher.decrypt( + dataset_database.graph_database_connection_info["graph_database_password"].encode() + ).decode() + + dataset_database.graph_database_connection_info["graph_database_password"] = ( + graph_db_password + ) + return dataset_database + @classmethod async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]): pass diff --git a/cognee/modules/data/deletion/prune_system.py b/cognee/modules/data/deletion/prune_system.py index a1b60988f..b568e6ba7 100644 --- a/cognee/modules/data/deletion/prune_system.py +++ b/cognee/modules/data/deletion/prune_system.py @@ -1,3 +1,4 @@ +from cognee.context_global_variables import backend_access_control_enabled from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine @@ -5,11 +6,12 @@ from cognee.shared.cache import delete_cache async def prune_system(graph=True, vector=True, metadata=True, cache=True): - if graph: + # TODO: prune_system should work with multi-user access control mode enabled + if graph and not backend_access_control_enabled(): graph_engine = await get_graph_engine() await graph_engine.delete_graph() - if vector: + if vector and not backend_access_control_enabled(): vector_engine = get_vector_engine() await vector_engine.prune() From 5698c609f53b674b423c3e61671f370299a45012 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 3 Dec 2025 11:47:10 +0100 Subject: [PATCH 43/54] test: Update tests with regards to auto scaling changes --- .github/workflows/e2e_tests.yml | 1 + cognee/tests/test_dataset_database_handler.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git 
a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 31398afd3..a1cb54c83 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -104,6 +104,7 @@ jobs: - name: Run default basic pipeline with telemetry on env: ENV: 'local' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} diff --git a/cognee/tests/test_dataset_database_handler.py b/cognee/tests/test_dataset_database_handler.py index a42ab0a17..be1b249d2 100644 --- a/cognee/tests/test_dataset_database_handler.py +++ b/cognee/tests/test_dataset_database_handler.py @@ -70,12 +70,6 @@ async def main(): ) cognee.config.system_root_directory(cognee_directory_path) - # Create a clean slate for cognee -- reset data and system state - print("Resetting cognee data...") - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - print("Data reset complete.\n") - # Add custom dataset database handler from cognee.infrastructure.databases.dataset_database_handler.use_dataset_database_handler import ( use_dataset_database_handler, @@ -86,6 +80,12 @@ async def main(): ) use_dataset_database_handler("custom_kuzu_handler", KuzuTestDatasetDatabaseHandler, "kuzu") + # Create a clean slate for cognee -- reset data and system state + print("Resetting cognee data...") + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + print("Data reset complete.\n") + # cognee knowledge graph will be created based on this text text = """ Natural language processing (NLP) is an interdisciplinary From f4078d1247a5007665176c56a6eb236246b86358 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 3 Dec 2025 13:10:18 +0100 Subject: [PATCH 44/54] feat: Add ability to delete lance and kuzu datasets, add prune to work with multi user mode --- .github/workflows/e2e_tests.yml | 1 - .../dataset_database_handler_interface.py | 5 +- .../graph/kuzu/KuzuDatasetDatabaseHandler.py | 32 +++++++++++-- .../Neo4jAuraDatasetDatabaseHandler.py | 2 +- .../lancedb/LanceDBDatasetDatabaseHandler.py | 15 +++++- cognee/modules/data/deletion/prune_system.py | 46 +++++++++++++++++++ 6 files changed, 91 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index a1cb54c83..31398afd3 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -104,7 +104,6 @@ jobs: - name: Run default basic pipeline with telemetry on env: ENV: 'local' - ENABLE_BACKEND_ACCESS_CONTROL: 'false' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} diff --git a/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py b/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py index 1811feb12..a0b68e497 100644 --- a/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +++ b/cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py @@ -68,14 +68,13 @@ class DatasetDatabaseHandlerInterface(ABC): @classmethod @abstractmethod - async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]) -> None: + async def delete_dataset(cls, dataset_database: DatasetDatabase) -> None: """ Delete the graph or vector database for the given dataset. 
Function should auto handle deleting of the actual database or send a request to the proper service to delete/mark the database as not needed for the given dataset. Needed for maintaining a database for Cognee multi-tenant/multi-user and backend access control. Args: - dataset_id: UUID of the dataset - user: User object + dataset_database: DatasetDatabase row containing connection/resolution info for the graph or vector database to delete. """ pass diff --git a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py index 9eb19c2a2..6eb3ed0f4 100644 --- a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py @@ -1,8 +1,12 @@ +import os from uuid import UUID from typing import Optional +from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_engine +from cognee.base_config import get_base_config from cognee.modules.users.models import User - +from cognee.modules.users.models import DatasetDatabase +from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface @@ -51,5 +55,27 @@ class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): } @classmethod - async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]): - pass + async def delete_dataset(cls, dataset_database: DatasetDatabase): + graph_config = get_graph_config() + base_config = get_base_config() + databases_directory_path = os.path.join( + base_config.system_root_directory, "databases", str(dataset_database.owner_id) + ) + graph_file_path = os.path.join( + databases_directory_path, dataset_database.graph_database_name + ) + graph_engine = create_graph_engine( + graph_database_provider=dataset_database.graph_database_provider, + graph_file_path=graph_file_path, + graph_database_url=dataset_database.graph_database_url, + graph_database_name=dataset_database.graph_database_name, + graph_database_username=dataset_database.graph_database_connection_info.get( + "graph_database_username", "" + ), + graph_database_password=dataset_database.graph_database_connection_info.get( + "graph_database_password", "" + ), + graph_database_port=graph_config.graph_database_port, + graph_database_key=dataset_database.graph_database_key, + ) + await graph_engine.delete_graph() diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py index 550c60d94..003b770d7 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py @@ -153,5 +153,5 @@ class Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): return dataset_database @classmethod - async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]): + async def delete_dataset(cls, dataset_database: DatasetDatabase): pass diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py index 8a80dddcf..6d14b11bc 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +++ 
b/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py @@ -2,7 +2,9 @@ import os from uuid import UUID from typing import Optional +from cognee.infrastructure.databases.vector.create_vector_engine import create_vector_engine from cognee.modules.users.models import User +from cognee.modules.users.models import DatasetDatabase from cognee.base_config import get_base_config from cognee.infrastructure.databases.vector import get_vectordb_config from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface @@ -37,5 +39,14 @@ class LanceDBDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): } @classmethod - async def delete_dataset(cls, dataset_id: Optional[UUID], user: Optional[User]): - pass + async def delete_dataset(cls, dataset_database: DatasetDatabase): + vector_config = get_vectordb_config() + vector_engine = create_vector_engine( + vector_db_provider=dataset_database.vector_database_provider, + vector_db_url=dataset_database.vector_database_url, + vector_db_name=dataset_database.vector_database_name, + vector_db_port=vector_config.vector_db_port, + vector_db_key=dataset_database.vector_database_key, + vector_dataset_database_handler=vector_config.vector_dataset_database_handler, + ) + await vector_engine.prune() diff --git a/cognee/modules/data/deletion/prune_system.py b/cognee/modules/data/deletion/prune_system.py index b568e6ba7..66f93ec48 100644 --- a/cognee/modules/data/deletion/prune_system.py +++ b/cognee/modules/data/deletion/prune_system.py @@ -2,18 +2,64 @@ from cognee.context_global_variables import backend_access_control_enabled from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.infrastructure.databases.vector.config import get_vectordb_config +from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.shared.cache import delete_cache +from cognee.modules.users.models import DatasetDatabase + + +async def prune_graph_databases(): + async def _prune_graph_db(dataset_database: DatasetDatabase) -> dict: + graph_config = get_graph_config() + from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( + supported_dataset_database_handlers, + ) + + handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler] + return await handler["handler_instance"].delete_dataset(dataset_database) + + db_engine = get_relational_engine() + if "dataset_database" in await db_engine.get_table_names(): + data = await db_engine.get_all_data_from_table("dataset_database") + # Go through each dataset database and delete the graph database + for data_item in data: + await _prune_graph_db(data_item) + + +async def prune_vector_databases(): + async def _prune_vector_db(dataset_database: DatasetDatabase) -> dict: + vector_config = get_vectordb_config() + + from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( + supported_dataset_database_handlers, + ) + + handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler] + return await handler["handler_instance"].delete_dataset(dataset_database) + + db_engine = get_relational_engine() + if "dataset_database" in await db_engine.get_table_names(): + data = await db_engine.get_all_data_from_table("dataset_database") + # Go 
through each dataset database and delete the vector database + for data_item in data: + await _prune_vector_db(data_item) async def prune_system(graph=True, vector=True, metadata=True, cache=True): + # Note: prune system should not be available through the API, it has no permission checks and will + # delete all graph and vector databases if called. It should only be used in development or testing environments. # TODO: prune_system should work with multi-user access control mode enabled if graph and not backend_access_control_enabled(): graph_engine = await get_graph_engine() await graph_engine.delete_graph() + elif graph and backend_access_control_enabled(): + await prune_graph_databases() if vector and not backend_access_control_enabled(): vector_engine = get_vector_engine() await vector_engine.prune() + elif vector and backend_access_control_enabled(): + await prune_vector_databases() if metadata: db_engine = get_relational_engine() From 1961efcc338bcba3bd5c829a7b872df6bedc8464 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 3 Dec 2025 14:27:06 +0100 Subject: [PATCH 45/54] fix: Handle scenario when there is no relational database on prune time --- cognee/modules/data/deletion/prune_system.py | 39 +++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/cognee/modules/data/deletion/prune_system.py b/cognee/modules/data/deletion/prune_system.py index 66f93ec48..b5d944dbb 100644 --- a/cognee/modules/data/deletion/prune_system.py +++ b/cognee/modules/data/deletion/prune_system.py @@ -1,3 +1,5 @@ +from sqlalchemy.exc import OperationalError + from cognee.context_global_variables import backend_access_control_enabled from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine @@ -6,6 +8,9 @@ from cognee.infrastructure.databases.vector.config import get_vectordb_config from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.shared.cache import delete_cache from cognee.modules.users.models import DatasetDatabase +from cognee.shared.logging_utils import get_logger + +logger = get_logger() async def prune_graph_databases(): @@ -19,11 +24,18 @@ async def prune_graph_databases(): return await handler["handler_instance"].delete_dataset(dataset_database) db_engine = get_relational_engine() - if "dataset_database" in await db_engine.get_table_names(): - data = await db_engine.get_all_data_from_table("dataset_database") - # Go through each dataset database and delete the graph database - for data_item in data: - await _prune_graph_db(data_item) + try: + if "dataset_database" in await db_engine.get_table_names(): + data = await db_engine.get_all_data_from_table("dataset_database") + # Go through each dataset database and delete the graph database + for data_item in data: + await _prune_graph_db(data_item) + except OperationalError as e: + logger.debug( + "Skipping pruning of graph DB. 
OperationalError when accessing dataset_database table: %s", + e, + ) + return async def prune_vector_databases(): @@ -38,11 +50,18 @@ async def prune_vector_databases(): return await handler["handler_instance"].delete_dataset(dataset_database) db_engine = get_relational_engine() - if "dataset_database" in await db_engine.get_table_names(): - data = await db_engine.get_all_data_from_table("dataset_database") - # Go through each dataset database and delete the vector database - for data_item in data: - await _prune_vector_db(data_item) + try: + if "dataset_database" in await db_engine.get_table_names(): + data = await db_engine.get_all_data_from_table("dataset_database") + # Go through each dataset database and delete the vector database + for data_item in data: + await _prune_vector_db(data_item) + except OperationalError as e: + logger.debug( + "Skipping pruning of vector DB. OperationalError when accessing dataset_database table: %s", + e, + ) + return async def prune_system(graph=True, vector=True, metadata=True, cache=True): From fd84edeb74c7c736726bd845f86bf285b7c95dcf Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 3 Dec 2025 15:43:41 +0100 Subject: [PATCH 46/54] refactor: change getting of tables during deletion --- cognee/modules/data/deletion/prune_system.py | 28 +++++++++----------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/cognee/modules/data/deletion/prune_system.py b/cognee/modules/data/deletion/prune_system.py index b5d944dbb..b43cab1f7 100644 --- a/cognee/modules/data/deletion/prune_system.py +++ b/cognee/modules/data/deletion/prune_system.py @@ -1,5 +1,6 @@ from sqlalchemy.exc import OperationalError +from cognee.infrastructure.databases.exceptions import EntityNotFoundError from cognee.context_global_variables import backend_access_control_enabled from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine @@ -25,14 +26,13 @@ async def prune_graph_databases(): db_engine = get_relational_engine() try: - if "dataset_database" in await db_engine.get_table_names(): - data = await db_engine.get_all_data_from_table("dataset_database") - # Go through each dataset database and delete the graph database - for data_item in data: - await _prune_graph_db(data_item) - except OperationalError as e: + data = await db_engine.get_all_data_from_table("dataset_database") + # Go through each dataset database and delete the graph database + for data_item in data: + await _prune_graph_db(data_item) + except (OperationalError, EntityNotFoundError) as e: logger.debug( - "Skipping pruning of graph DB. OperationalError when accessing dataset_database table: %s", + "Skipping pruning of graph DB. Error when accessing dataset_database table: %s", e, ) return @@ -51,14 +51,13 @@ async def prune_vector_databases(): db_engine = get_relational_engine() try: - if "dataset_database" in await db_engine.get_table_names(): - data = await db_engine.get_all_data_from_table("dataset_database") - # Go through each dataset database and delete the vector database - for data_item in data: - await _prune_vector_db(data_item) - except OperationalError as e: + data = await db_engine.get_all_data_from_table("dataset_database") + # Go through each dataset database and delete the vector database + for data_item in data: + await _prune_vector_db(data_item) + except (OperationalError, EntityNotFoundError) as e: logger.debug( - "Skipping pruning of vector DB. 
OperationalError when accessing dataset_database table: %s", + "Skipping pruning of vector DB. Error when accessing dataset_database table: %s", e, ) return @@ -67,7 +66,6 @@ async def prune_vector_databases(): async def prune_system(graph=True, vector=True, metadata=True, cache=True): # Note: prune system should not be available through the API, it has no permission checks and will # delete all graph and vector databases if called. It should only be used in development or testing environments. - # TODO: prune_system should work with multi-user access control mode enabled if graph and not backend_access_control_enabled(): graph_engine = await get_graph_engine() await graph_engine.delete_graph() From f1c5b9a55fb3b91e1763a6d735cbe203cd5c906d Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 3 Dec 2025 18:05:47 +0100 Subject: [PATCH 47/54] fix: Resolve DB caching issues when deleting databases --- cognee/context_global_variables.py | 4 ++++ .../databases/graph/kuzu/KuzuDatasetDatabaseHandler.py | 9 ++++----- .../vector/lancedb/LanceDBDatasetDatabaseHandler.py | 9 +++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 58fff2dff..6417f34f7 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -121,6 +121,8 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ ) # Set vector and graph database configuration based on dataset database information + # TODO: Add better handling of vector and graph config accross Cognee. + # LRU_CACHE takes into account order of inputs, if order of inputs is changed it will be registered as a new DB adapter vector_config = { "vector_db_provider": dataset_database.vector_database_provider, "vector_db_url": dataset_database.vector_database_url, @@ -142,6 +144,8 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ "graph_database_password": dataset_database.graph_database_connection_info.get( "graph_database_password", "" ), + "graph_dataset_database_handler": "", + "graph_database_port": "", } storage_config = { diff --git a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py index 6eb3ed0f4..edc6d5c39 100644 --- a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py @@ -6,7 +6,6 @@ from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_ from cognee.base_config import get_base_config from cognee.modules.users.models import User from cognee.modules.users.models import DatasetDatabase -from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface @@ -56,7 +55,6 @@ class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): @classmethod async def delete_dataset(cls, dataset_database: DatasetDatabase): - graph_config = get_graph_config() base_config = get_base_config() databases_directory_path = os.path.join( base_config.system_root_directory, "databases", str(dataset_database.owner_id) @@ -66,16 +64,17 @@ class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): ) graph_engine = create_graph_engine( graph_database_provider=dataset_database.graph_database_provider, - graph_file_path=graph_file_path, 
graph_database_url=dataset_database.graph_database_url, graph_database_name=dataset_database.graph_database_name, + graph_database_key=dataset_database.graph_database_key, + graph_file_path=graph_file_path, graph_database_username=dataset_database.graph_database_connection_info.get( "graph_database_username", "" ), graph_database_password=dataset_database.graph_database_connection_info.get( "graph_database_password", "" ), - graph_database_port=graph_config.graph_database_port, - graph_database_key=dataset_database.graph_database_key, + graph_dataset_database_handler="", + graph_database_port="", ) await graph_engine.delete_graph() diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py index 6d14b11bc..f165a7ea4 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py @@ -32,21 +32,18 @@ class LanceDBDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): vector_db_name = f"{dataset_id}.lance.db" return { - "vector_database_name": vector_db_name, - "vector_database_url": os.path.join(databases_directory_path, vector_db_name), "vector_database_provider": vector_config.vector_db_provider, + "vector_database_url": os.path.join(databases_directory_path, vector_db_name), "vector_database_key": vector_config.vector_db_key, + "vector_database_name": vector_db_name, } @classmethod async def delete_dataset(cls, dataset_database: DatasetDatabase): - vector_config = get_vectordb_config() vector_engine = create_vector_engine( vector_db_provider=dataset_database.vector_database_provider, vector_db_url=dataset_database.vector_database_url, - vector_db_name=dataset_database.vector_database_name, - vector_db_port=vector_config.vector_db_port, vector_db_key=dataset_database.vector_database_key, - vector_dataset_database_handler=vector_config.vector_dataset_database_handler, + vector_db_name=dataset_database.vector_database_name, ) await vector_engine.prune() From d0b914acaa84b60a0b95332cd9c21cfe06f0e62b Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Tue, 9 Dec 2025 17:55:43 +0100 Subject: [PATCH 48/54] Chore: Remove Ontology file size limit. 
Code duplications --- cognee/api/v1/ontologies/ontologies.py | 50 +++++--------------------- 1 file changed, 8 insertions(+), 42 deletions(-) diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py index 130b4a862..4bd3f2cd5 100644 --- a/cognee/api/v1/ontologies/ontologies.py +++ b/cognee/api/v1/ontologies/ontologies.py @@ -5,7 +5,7 @@ from pathlib import Path from datetime import datetime, timezone from typing import Optional, List from dataclasses import dataclass - +from fastapi import UploadFile @dataclass class OntologyMetadata: @@ -45,7 +45,7 @@ class OntologyService: json.dump(metadata, f, indent=2) async def upload_ontology( - self, ontology_key: str, file, user, description: Optional[str] = None + self, ontology_key: str, file: UploadFile, user, description: Optional[str] = None ) -> OntologyMetadata: if not file.filename.lower().endswith(".owl"): raise ValueError("File must be in .owl format") @@ -57,8 +57,6 @@ class OntologyService: raise ValueError(f"Ontology key '{ontology_key}' already exists") content = await file.read() - if len(content) > 10 * 1024 * 1024: - raise ValueError("File size exceeds 10MB limit") file_path = user_dir / f"{ontology_key}.owl" with open(file_path, "wb") as f: @@ -105,47 +103,15 @@ class OntologyService: if len(set(ontology_key)) != len(ontology_key): raise ValueError("Duplicate ontology keys not allowed") - if descriptions and len(descriptions) != len(files): - raise ValueError("Number of descriptions must match number of files") - results = [] - user_dir = self._get_user_dir(str(user.id)) - metadata = self._load_metadata(user_dir) for i, (key, file) in enumerate(zip(ontology_key, files)): - if key in metadata: - raise ValueError(f"Ontology key '{key}' already exists") - - if not file.filename.lower().endswith(".owl"): - raise ValueError(f"File '{file.filename}' must be in .owl format") - - content = await file.read() - if len(content) > 10 * 1024 * 1024: - raise ValueError(f"File '{file.filename}' exceeds 10MB limit") - - file_path = user_dir / f"{key}.owl" - with open(file_path, "wb") as f: - f.write(content) - - ontology_metadata = { - "filename": file.filename, - "size_bytes": len(content), - "uploaded_at": datetime.now(timezone.utc).isoformat(), - "description": descriptions[i] if descriptions else None, - } - metadata[key] = ontology_metadata - - results.append( - OntologyMetadata( - ontology_key=key, - filename=file.filename, - size_bytes=len(content), - uploaded_at=ontology_metadata["uploaded_at"], - description=descriptions[i] if descriptions else None, - ) - ) - - self._save_metadata(user_dir, metadata) + results.append(await self.upload_ontology( + ontology_key=key, + file=file, + user=user, + description=descriptions[i] if descriptions else None, + )) return results def get_ontology_contents(self, ontology_key: List[str], user) -> List[str]: From d932ee4bd96cba2ca21b54fde43080757e218571 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Tue, 9 Dec 2025 17:58:34 +0100 Subject: [PATCH 49/54] Specify file type --- cognee/api/v1/ontologies/ontologies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py index 4bd3f2cd5..9ade625b0 100644 --- a/cognee/api/v1/ontologies/ontologies.py +++ b/cognee/api/v1/ontologies/ontologies.py @@ -80,7 +80,7 @@ class OntologyService: ) async def upload_ontologies( - self, ontology_key: List[str], files: List, user, descriptions: Optional[List[str]] = None + self, ontology_key: 
List[str], files: List[UploadFile], user, descriptions: Optional[List[str]] = None ) -> List[OntologyMetadata]: """ Upload ontology files with their respective keys. From 2ca194c28fc67501de8a661ff413f208c5b2899a Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Tue, 9 Dec 2025 18:22:44 +0100 Subject: [PATCH 50/54] fix format --- cognee/api/v1/ontologies/ontologies.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py index 9ade625b0..cfd42fec4 100644 --- a/cognee/api/v1/ontologies/ontologies.py +++ b/cognee/api/v1/ontologies/ontologies.py @@ -7,6 +7,7 @@ from typing import Optional, List from dataclasses import dataclass from fastapi import UploadFile + @dataclass class OntologyMetadata: ontology_key: str @@ -80,7 +81,11 @@ class OntologyService: ) async def upload_ontologies( - self, ontology_key: List[str], files: List[UploadFile], user, descriptions: Optional[List[str]] = None + self, + ontology_key: List[str], + files: List[UploadFile], + user, + descriptions: Optional[List[str]] = None, ) -> List[OntologyMetadata]: """ Upload ontology files with their respective keys. @@ -106,12 +111,14 @@ class OntologyService: results = [] for i, (key, file) in enumerate(zip(ontology_key, files)): - results.append(await self.upload_ontology( - ontology_key=key, - file=file, - user=user, - description=descriptions[i] if descriptions else None, - )) + results.append( + await self.upload_ontology( + ontology_key=key, + file=file, + user=user, + description=descriptions[i] if descriptions else None, + ) + ) return results def get_ontology_contents(self, ontology_key: List[str], user) -> List[str]: From 88f61f9bdb9093abb05ff1c70c5a0d3153c91cf5 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Wed, 10 Dec 2025 17:24:31 +0100 Subject: [PATCH 51/54] Added filename check --- cognee/api/v1/ontologies/ontologies.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py index cfd42fec4..2a133bf8a 100644 --- a/cognee/api/v1/ontologies/ontologies.py +++ b/cognee/api/v1/ontologies/ontologies.py @@ -48,6 +48,8 @@ class OntologyService: async def upload_ontology( self, ontology_key: str, file: UploadFile, user, description: Optional[str] = None ) -> OntologyMetadata: + if not file.filename: + raise ValueError("File must have a filename") if not file.filename.lower().endswith(".owl"): raise ValueError("File must be in .owl format") From 0a1ed79340dc33b136839a764df59d3d1ddd4da1 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 11 Dec 2025 13:05:23 +0100 Subject: [PATCH 52/54] refactor: change neo4j_aura to neo4j_aura_dev --- .../supported_dataset_database_handlers.py | 8 ++++---- ....py => Neo4jAuraDevDatasetDatabaseHandler.py} | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) rename cognee/infrastructure/databases/graph/neo4j_driver/{Neo4jAuraDatasetDatabaseHandler.py => Neo4jAuraDevDatasetDatabaseHandler.py} (87%) diff --git a/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py b/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py index adaa45e33..225e9732e 100644 --- a/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +++ b/cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py @@ -1,5 +1,5 @@ -from 
cognee.infrastructure.databases.graph.neo4j_driver.Neo4jAuraDatasetDatabaseHandler import ( - Neo4jAuraDatasetDatabaseHandler, +from cognee.infrastructure.databases.graph.neo4j_driver.Neo4jAuraDevDatasetDatabaseHandler import ( + Neo4jAuraDevDatasetDatabaseHandler, ) from cognee.infrastructure.databases.vector.lancedb.LanceDBDatasetDatabaseHandler import ( LanceDBDatasetDatabaseHandler, @@ -9,8 +9,8 @@ from cognee.infrastructure.databases.graph.kuzu.KuzuDatasetDatabaseHandler impor ) supported_dataset_database_handlers = { - "neo4j_aura": { - "handler_instance": Neo4jAuraDatasetDatabaseHandler, + "neo4j_aura_dev": { + "handler_instance": Neo4jAuraDevDatasetDatabaseHandler, "handler_provider": "neo4j", }, "lancedb": {"handler_instance": LanceDBDatasetDatabaseHandler, "handler_provider": "lancedb"}, diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py similarity index 87% rename from cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py rename to cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py index 003b770d7..73f057fa8 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py @@ -12,9 +12,18 @@ from cognee.modules.users.models import User, DatasetDatabase from cognee.infrastructure.databases.dataset_database_handler import DatasetDatabaseHandlerInterface -class Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): +class Neo4jAuraDevDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): """ - Handler for interacting with Neo4j Aura Dataset databases. + Handler for a quick development PoC integration of Cognee multi-user and permission mode with Neo4j Aura databases. + This handler creates a new Neo4j Aura instance for each Cognee dataset created. + + Improvements needed to be production ready: + - Secret management for client credentials, currently secrets are encrypted and stored in the Cognee relational database, + a secret manager or a similar system should be used instead. + + Quality of life improvements: + - Allow configuration of different Neo4j Aura plans and regions. + - Requests should be made async, currently a blocking requests library is used. """ @classmethod @@ -34,7 +43,7 @@ class Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): if graph_config.graph_database_provider != "neo4j": raise ValueError( - "Neo4jAuraDatasetDatabaseHandler can only be used with Neo4j graph database provider." + "Neo4jAuraDevDatasetDatabaseHandler can only be used with Neo4j graph database provider." ) graph_db_name = f"{dataset_id}" @@ -134,6 +143,7 @@ class Neo4jAuraDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): ) -> DatasetDatabase: """ Resolve and decrypt connection info for the Neo4j dataset database. + In this case, decrypt the password stored in the database. Args: dataset_database: DatasetDatabase instance containing encrypted connection info. 
From ede884e0b0215714a86afcd38ae5800332e861df Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Fri, 12 Dec 2025 13:11:31 +0100 Subject: [PATCH 53/54] feat: make pipeline processing cache optional (#1876) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description Make the pipeline cache mechanism optional, have it turned off by default but use it for add and cognify like it has been used until now ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [ x I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. ## Summary by CodeRabbit * **New Features** * Introduced pipeline caching across ingestion, processing, and custom pipeline flows with per-run controls to enable or disable caching. * Added an option for incremental loading in custom pipeline runs. * **Behavior Changes** * One pipeline path now explicitly bypasses caching by default to always re-run when invoked. * Disabling cache forces re-processing instead of early exit; cache reset still enables re-execution. * **Tests** * Added tests validating caching, non-caching, and cache-reset re-execution behavior. * **Chores** * Added CI job to run pipeline caching tests. ✏️ Tip: You can customize this high-level summary in your review settings. 
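A rough usage sketch of the per-run cache control introduced here, assuming the `run_pipeline` generator and the `reset_dataset_pipeline_run_status` helper shown in the diffs below; the `tasks`, `data`, `datasets` and `user` argument names are filled in from surrounding context and may differ slightly:

```python
from cognee.modules.pipelines.operations.pipeline import run_pipeline
from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
    reset_dataset_pipeline_run_status,
)


async def run_twice_with_cache(tasks, data, dataset_id, user):
    # First run: executes the tasks and records a pipeline run for the dataset.
    async for run_info in run_pipeline(
        tasks=tasks,
        data=data,
        datasets=dataset_id,
        user=user,
        pipeline_name="my_pipeline",
        use_pipeline_cache=True,  # reuse a completed run instead of re-processing
    ):
        print(run_info)

    # Second run with the cache on: yields the stored run status and returns
    # without executing the tasks again. With use_pipeline_cache=False it would
    # yield PipelineRunStarted and re-process the dataset.
    async for run_info in run_pipeline(
        tasks=tasks,
        data=data,
        datasets=dataset_id,
        user=user,
        pipeline_name="my_pipeline",
        use_pipeline_cache=True,
    ):
        print(run_info)

    # To force re-execution while keeping the cache enabled, reset the stored
    # run status for this dataset/pipeline first.
    await reset_dataset_pipeline_run_status(dataset_id, user, pipeline_names=["my_pipeline"])
```

In this patch `add` and `cognify` keep the flag on, while `memify` passes `use_pipeline_cache=False` so it always re-runs when invoked.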
--- .github/workflows/e2e_tests.yml | 27 +++ cognee/api/v1/add/add.py | 1 + cognee/api/v1/cognify/cognify.py | 1 + cognee/modules/memify/memify.py | 8 +- .../modules/pipelines/operations/pipeline.py | 20 ++- .../run_custom_pipeline.py | 9 +- cognee/tests/test_pipeline_cache.py | 164 ++++++++++++++++++ 7 files changed, 220 insertions(+), 10 deletions(-) create mode 100644 cognee/tests/test_pipeline_cache.py diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 520d93689..cb69e9ef6 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -582,3 +582,30 @@ jobs: DB_USERNAME: cognee DB_PASSWORD: cognee run: uv run python ./cognee/tests/test_conversation_history.py + + run-pipeline-cache-test: + name: Test Pipeline Caching + runs-on: ubuntu-22.04 + steps: + - name: Check out + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Pipeline Cache Test + env: + ENV: 'dev' + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/test_pipeline_cache.py diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index a521b316b..1ea4caca4 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -205,6 +205,7 @@ async def add( pipeline_name="add_pipeline", vector_db_config=vector_db_config, graph_db_config=graph_db_config, + use_pipeline_cache=True, incremental_loading=incremental_loading, data_per_batch=data_per_batch, ): diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 9d9f7d154..9862edd49 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -237,6 +237,7 @@ async def cognify( vector_db_config=vector_db_config, graph_db_config=graph_db_config, incremental_loading=incremental_loading, + use_pipeline_cache=True, pipeline_name="cognify_pipeline", data_per_batch=data_per_batch, ) diff --git a/cognee/modules/memify/memify.py b/cognee/modules/memify/memify.py index 2d9b32a1b..e60eb5a4e 100644 --- a/cognee/modules/memify/memify.py +++ b/cognee/modules/memify/memify.py @@ -12,9 +12,6 @@ from cognee.modules.users.models import User from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import ( resolve_authorized_user_datasets, ) -from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import ( - reset_dataset_pipeline_run_status, -) from cognee.modules.engine.operations.setup import setup from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor from cognee.tasks.memify.extract_subgraph_chunks import extract_subgraph_chunks @@ -97,10 +94,6 @@ async def memify( *enrichment_tasks, ] - await reset_dataset_pipeline_run_status( - authorized_dataset.id, user, pipeline_names=["memify_pipeline"] - ) - # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for pipeline_executor_func = get_pipeline_executor(run_in_background=run_in_background) @@ -113,6 +106,7 @@ async def memify( datasets=authorized_dataset.id, 
vector_db_config=vector_db_config, graph_db_config=graph_db_config, + use_pipeline_cache=False, incremental_loading=False, pipeline_name="memify_pipeline", ) diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py index eb0ebe8bd..6641d3a4c 100644 --- a/cognee/modules/pipelines/operations/pipeline.py +++ b/cognee/modules/pipelines/operations/pipeline.py @@ -20,6 +20,9 @@ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import ( from cognee.modules.pipelines.layers.check_pipeline_run_qualification import ( check_pipeline_run_qualification, ) +from cognee.modules.pipelines.models.PipelineRunInfo import ( + PipelineRunStarted, +) from typing import Any logger = get_logger("cognee.pipeline") @@ -35,6 +38,7 @@ async def run_pipeline( pipeline_name: str = "custom_pipeline", vector_db_config: dict = None, graph_db_config: dict = None, + use_pipeline_cache: bool = False, incremental_loading: bool = False, data_per_batch: int = 20, ): @@ -51,6 +55,7 @@ async def run_pipeline( data=data, pipeline_name=pipeline_name, context={"dataset": dataset}, + use_pipeline_cache=use_pipeline_cache, incremental_loading=incremental_loading, data_per_batch=data_per_batch, ): @@ -64,6 +69,7 @@ async def run_pipeline_per_dataset( data=None, pipeline_name: str = "custom_pipeline", context: dict = None, + use_pipeline_cache=False, incremental_loading=False, data_per_batch: int = 20, ): @@ -77,8 +83,18 @@ async def run_pipeline_per_dataset( if process_pipeline_status: # If pipeline was already processed or is currently being processed # return status information to async generator and finish execution - yield process_pipeline_status - return + if use_pipeline_cache: + # If pipeline caching is enabled, we do not proceed with re-processing + yield process_pipeline_status + return + else: + # If pipeline caching is disabled, we always return pipeline started information and proceed with re-processing + yield PipelineRunStarted( + pipeline_run_id=process_pipeline_status.pipeline_run_id, + dataset_id=dataset.id, + dataset_name=dataset.name, + payload=data, + ) pipeline_run = run_tasks( tasks, diff --git a/cognee/modules/run_custom_pipeline/run_custom_pipeline.py b/cognee/modules/run_custom_pipeline/run_custom_pipeline.py index d3df1c060..269238503 100644 --- a/cognee/modules/run_custom_pipeline/run_custom_pipeline.py +++ b/cognee/modules/run_custom_pipeline/run_custom_pipeline.py @@ -18,6 +18,8 @@ async def run_custom_pipeline( user: User = None, vector_db_config: Optional[dict] = None, graph_db_config: Optional[dict] = None, + use_pipeline_cache: bool = False, + incremental_loading: bool = False, data_per_batch: int = 20, run_in_background: bool = False, pipeline_name: str = "custom_pipeline", @@ -40,6 +42,10 @@ async def run_custom_pipeline( user: User context for authentication and data access. Uses default if None. vector_db_config: Custom vector database configuration for embeddings storage. graph_db_config: Custom graph database configuration for relationship storage. + use_pipeline_cache: If True, pipelines with the same ID that are currently executing or have already completed won't process the data again. + The pipeline ID is created by the generate_pipeline_id function. Pipeline status can be manually reset with the reset_dataset_pipeline_run_status function. + incremental_loading: If True, only new or modified data will be processed to avoid duplication. (Only works if data is used with the Cognee Python Data model).
+ The incremental system stores and compares hashes of processed data in the Data model and skips data with the same content hash. data_per_batch: Number of data items to be processed in parallel. run_in_background: If True, starts processing asynchronously and returns immediately. If False, waits for completion before returning. @@ -63,7 +69,8 @@ async def run_custom_pipeline( datasets=dataset, vector_db_config=vector_db_config, graph_db_config=graph_db_config, - incremental_loading=False, + use_pipeline_cache=use_pipeline_cache, + incremental_loading=incremental_loading, data_per_batch=data_per_batch, pipeline_name=pipeline_name, ) diff --git a/cognee/tests/test_pipeline_cache.py b/cognee/tests/test_pipeline_cache.py new file mode 100644 index 000000000..8cdd6aa3c --- /dev/null +++ b/cognee/tests/test_pipeline_cache.py @@ -0,0 +1,164 @@ +""" +Test suite for the pipeline_cache feature in Cognee pipelines. + +This module tests the behavior of the `pipeline_cache` parameter which controls +whether a pipeline should skip re-execution when it has already been completed +for the same dataset. + +Architecture Overview: +--------------------- +The pipeline_cache mechanism works at the dataset level: +1. When a pipeline runs, it logs its status (INITIATED -> STARTED -> COMPLETED) +2. Before each run, `check_pipeline_run_qualification()` checks the pipeline status +3. If `use_pipeline_cache=True` and status is COMPLETED/STARTED, the pipeline skips +4. If `use_pipeline_cache=False`, the pipeline always re-executes regardless of status +""" + +import pytest + +import cognee +from cognee.modules.pipelines.tasks.task import Task +from cognee.modules.pipelines import run_pipeline +from cognee.modules.users.methods import get_default_user + +from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import ( + reset_dataset_pipeline_run_status, +) +from cognee.infrastructure.databases.relational import create_db_and_tables + + +class ExecutionCounter: + """Helper class to track task execution counts.""" + + def __init__(self): + self.count = 0 + + +async def create_counting_task(data, counter: ExecutionCounter): + """Create a task that increments a counter from the ExecutionCounter instance when executed.""" + counter.count += 1 + return counter + + +class TestPipelineCache: + """Tests for basic pipeline_cache on/off behavior.""" + + @pytest.mark.asyncio + async def test_pipeline_cache_off_allows_reexecution(self): + """ + Test that with use_pipeline_cache=False, the pipeline re-executes + even when it has already completed for the dataset. 
+ + Expected behavior: + - First run: Pipeline executes fully, task runs once + - Second run: Pipeline executes again, task runs again (total: 2 times) + """ + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await create_db_and_tables() + + counter = ExecutionCounter() + user = await get_default_user() + + tasks = [Task(create_counting_task, counter=counter)] + + # First run + pipeline_results_1 = [] + async for result in run_pipeline( + tasks=tasks, + datasets="test_dataset_cache_off", + data=["sample data"], # Data is necessary to trigger processing + user=user, + pipeline_name="test_cache_off_pipeline", + use_pipeline_cache=False, + ): + pipeline_results_1.append(result) + + first_run_count = counter.count + assert first_run_count >= 1, "Task should have executed at least once on first run" + + # Second run with use_pipeline_cache=False + pipeline_results_2 = [] + async for result in run_pipeline( + tasks=tasks, + datasets="test_dataset_cache_off", + data=["sample data"], # Data is necessary to trigger processing + user=user, + pipeline_name="test_cache_off_pipeline", + use_pipeline_cache=False, + ): + pipeline_results_2.append(result) + + second_run_count = counter.count + assert second_run_count > first_run_count, ( + f"With use_pipeline_cache=False, task should re-execute. " + f"First run: {first_run_count}, After second run: {second_run_count}" + ) + + @pytest.mark.asyncio + async def test_reset_pipeline_status_allows_reexecution_with_cache(self): + """ + Test that resetting pipeline status allows re-execution even with + use_pipeline_cache=True. + """ + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await create_db_and_tables() + + counter = ExecutionCounter() + user = await get_default_user() + dataset_name = "reset_status_test" + pipeline_name = "test_reset_pipeline" + + tasks = [Task(create_counting_task, counter=counter)] + + # First run + pipeline_result = [] + async for result in run_pipeline( + tasks=tasks, + datasets=dataset_name, + user=user, + data=["sample data"], # Data is necessary to trigger processing + pipeline_name=pipeline_name, + use_pipeline_cache=True, + ): + pipeline_result.append(result) + + first_run_count = counter.count + assert first_run_count >= 1 + + # Second run without reset - should skip + async for _ in run_pipeline( + tasks=tasks, + datasets=dataset_name, + user=user, + data=["sample data"], # Data is necessary to trigger processing + pipeline_name=pipeline_name, + use_pipeline_cache=True, + ): + pass + + after_second_run = counter.count + assert after_second_run == first_run_count, "Should have skipped due to cache" + + # Reset the pipeline status + await reset_dataset_pipeline_run_status( + pipeline_result[0].dataset_id, user, pipeline_names=[pipeline_name] + ) + + # Third run after reset - should execute + async for _ in run_pipeline( + tasks=tasks, + datasets=dataset_name, + user=user, + data=["sample data"], # Data is necessary to trigger processing + pipeline_name=pipeline_name, + use_pipeline_cache=True, + ): + pass + + after_reset_run = counter.count + assert after_reset_run > after_second_run, ( + f"After reset, pipeline should re-execute. 
" f"Before reset: {after_second_run}, After reset run: {after_reset_run}" )
From 127d9860df55c857dd5ee1caa5dab3b3a6f43345 Mon Sep 17 00:00:00 2001
From: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Date: Fri, 12 Dec 2025 13:22:03 +0100
Subject: [PATCH 54/54] feat: Add dataset database handler info (#1887)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

Add info on which dataset database handler is used for each dataset database.

## Type of Change

- [ ] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)

## Pre-submission Checklist

- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the issue/feature**
- [ ] My code follows the project's coding standards and style guidelines
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

* **New Features**
  * Datasets now record their assigned vector and graph database handlers, allowing per-dataset backend selection.
* **Chores**
  * Database schema expanded to store handler identifiers per dataset.
  * Deletion/cleanup processes now use dataset-level handler info for accurate removal across backends.
* **Tests**
  * Tests updated to include and validate the new handler fields in dataset creation outputs.

✏️ Tip: You can customize this high-level summary in your review settings.
--- ...d2b2_expand_dataset_database_with_json_.py | 66 +++++++++++++++++++ .../graph/kuzu/KuzuDatasetDatabaseHandler.py | 1 + .../Neo4jAuraDevDatasetDatabaseHandler.py | 1 + ...esolve_dataset_database_connection_info.py | 10 +-- .../lancedb/LanceDBDatasetDatabaseHandler.py | 1 + cognee/modules/data/deletion/prune_system.py | 13 ++-- .../modules/users/models/DatasetDatabase.py | 3 + cognee/tests/test_dataset_database_handler.py | 2 + 8 files changed, 82 insertions(+), 15 deletions(-) diff --git a/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py b/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py index e15a98b7c..25b94a724 100644 --- a/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py +++ b/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py @@ -49,6 +49,20 @@ def _recreate_table_without_unique_constraint_sqlite(op, insp): sa.Column("graph_database_name", sa.String(), nullable=False), sa.Column("vector_database_provider", sa.String(), nullable=False), sa.Column("graph_database_provider", sa.String(), nullable=False), + sa.Column( + "vector_dataset_database_handler", + sa.String(), + unique=False, + nullable=False, + server_default="lancedb", + ), + sa.Column( + "graph_dataset_database_handler", + sa.String(), + unique=False, + nullable=False, + server_default="kuzu", + ), sa.Column("vector_database_url", sa.String()), sa.Column("graph_database_url", sa.String()), sa.Column("vector_database_key", sa.String()), @@ -82,6 +96,8 @@ def _recreate_table_without_unique_constraint_sqlite(op, insp): graph_database_name, vector_database_provider, graph_database_provider, + vector_dataset_database_handler, + graph_dataset_database_handler, vector_database_url, graph_database_url, vector_database_key, @@ -120,6 +136,20 @@ def _recreate_table_with_unique_constraint_sqlite(op, insp): sa.Column("graph_database_name", sa.String(), nullable=False, unique=True), sa.Column("vector_database_provider", sa.String(), nullable=False), sa.Column("graph_database_provider", sa.String(), nullable=False), + sa.Column( + "vector_dataset_database_handler", + sa.String(), + unique=False, + nullable=False, + server_default="lancedb", + ), + sa.Column( + "graph_dataset_database_handler", + sa.String(), + unique=False, + nullable=False, + server_default="kuzu", + ), sa.Column("vector_database_url", sa.String()), sa.Column("graph_database_url", sa.String()), sa.Column("vector_database_key", sa.String()), @@ -153,6 +183,8 @@ def _recreate_table_with_unique_constraint_sqlite(op, insp): graph_database_name, vector_database_provider, graph_database_provider, + vector_dataset_database_handler, + graph_dataset_database_handler, vector_database_url, graph_database_url, vector_database_key, @@ -193,6 +225,22 @@ def upgrade() -> None: ), ) + vector_dataset_database_handler = _get_column( + insp, "dataset_database", "vector_dataset_database_handler" + ) + if not vector_dataset_database_handler: + # Add LanceDB as the default vector dataset database handler + op.add_column( + "dataset_database", + sa.Column( + "vector_dataset_database_handler", + sa.String(), + unique=False, + nullable=False, + server_default="lancedb", + ), + ) + graph_database_connection_info_column = _get_column( insp, "dataset_database", "graph_database_connection_info" ) @@ -208,6 +256,22 @@ def upgrade() -> None: ), ) + graph_dataset_database_handler = _get_column( + insp, "dataset_database", "graph_dataset_database_handler" + ) + if not graph_dataset_database_handler: + # Add Kuzu as the default 
graph dataset database handler + op.add_column( + "dataset_database", + sa.Column( + "graph_dataset_database_handler", + sa.String(), + unique=False, + nullable=False, + server_default="kuzu", + ), + ) + with op.batch_alter_table("dataset_database", schema=None) as batch_op: # Drop the unique constraint to make unique=False graph_constraint_to_drop = None @@ -265,3 +329,5 @@ def downgrade() -> None: op.drop_column("dataset_database", "vector_database_connection_info") op.drop_column("dataset_database", "graph_database_connection_info") + op.drop_column("dataset_database", "vector_dataset_database_handler") + op.drop_column("dataset_database", "graph_dataset_database_handler") diff --git a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py index edc6d5c39..61ff84870 100644 --- a/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py @@ -47,6 +47,7 @@ class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): "graph_database_url": graph_db_url, "graph_database_provider": graph_config.graph_database_provider, "graph_database_key": graph_db_key, + "graph_dataset_database_handler": "kuzu", "graph_database_connection_info": { "graph_database_username": graph_db_username, "graph_database_password": graph_db_password, diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py index 73f057fa8..eb6cbc55a 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py @@ -131,6 +131,7 @@ class Neo4jAuraDevDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): "graph_database_url": graph_db_url, "graph_database_provider": "neo4j", "graph_database_key": graph_db_key, + "graph_dataset_database_handler": "neo4j_aura_dev", "graph_database_connection_info": { "graph_database_username": graph_db_username, "graph_database_password": encrypted_db_password_string, diff --git a/cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py b/cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py index 4d8c19403..d33169642 100644 --- a/cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +++ b/cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py @@ -1,27 +1,21 @@ -from cognee.infrastructure.databases.vector import get_vectordb_config -from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.modules.users.models.DatasetDatabase import DatasetDatabase async def _get_vector_db_connection_info(dataset_database: DatasetDatabase) -> DatasetDatabase: - vector_config = get_vectordb_config() - from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( supported_dataset_database_handlers, ) - handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler] + handler = supported_dataset_database_handlers[dataset_database.vector_dataset_database_handler] return await handler["handler_instance"].resolve_dataset_connection_info(dataset_database) async def _get_graph_db_connection_info(dataset_database: DatasetDatabase) -> DatasetDatabase: - graph_config = 
get_graph_config() - from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( supported_dataset_database_handlers, ) - handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler] + handler = supported_dataset_database_handlers[dataset_database.graph_dataset_database_handler] return await handler["handler_instance"].resolve_dataset_connection_info(dataset_database) diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py index f165a7ea4..e392b7eb8 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py @@ -36,6 +36,7 @@ class LanceDBDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): "vector_database_url": os.path.join(databases_directory_path, vector_db_name), "vector_database_key": vector_config.vector_db_key, "vector_database_name": vector_db_name, + "vector_dataset_database_handler": "lancedb", } @classmethod diff --git a/cognee/modules/data/deletion/prune_system.py b/cognee/modules/data/deletion/prune_system.py index b43cab1f7..645e1a223 100644 --- a/cognee/modules/data/deletion/prune_system.py +++ b/cognee/modules/data/deletion/prune_system.py @@ -5,8 +5,6 @@ from cognee.context_global_variables import backend_access_control_enabled from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.infrastructure.databases.vector.config import get_vectordb_config -from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.shared.cache import delete_cache from cognee.modules.users.models import DatasetDatabase from cognee.shared.logging_utils import get_logger @@ -16,12 +14,13 @@ logger = get_logger() async def prune_graph_databases(): async def _prune_graph_db(dataset_database: DatasetDatabase) -> dict: - graph_config = get_graph_config() from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( supported_dataset_database_handlers, ) - handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler] + handler = supported_dataset_database_handlers[ + dataset_database.graph_dataset_database_handler + ] return await handler["handler_instance"].delete_dataset(dataset_database) db_engine = get_relational_engine() @@ -40,13 +39,13 @@ async def prune_graph_databases(): async def prune_vector_databases(): async def _prune_vector_db(dataset_database: DatasetDatabase) -> dict: - vector_config = get_vectordb_config() - from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( supported_dataset_database_handlers, ) - handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler] + handler = supported_dataset_database_handlers[ + dataset_database.vector_dataset_database_handler + ] return await handler["handler_instance"].delete_dataset(dataset_database) db_engine = get_relational_engine() diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index 15964f032..08c4b5311 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ 
b/cognee/modules/users/models/DatasetDatabase.py @@ -18,6 +18,9 @@ class DatasetDatabase(Base): vector_database_provider = Column(String, unique=False, nullable=False) graph_database_provider = Column(String, unique=False, nullable=False) + graph_dataset_database_handler = Column(String, unique=False, nullable=False) + vector_dataset_database_handler = Column(String, unique=False, nullable=False) + vector_database_url = Column(String, unique=False, nullable=True) graph_database_url = Column(String, unique=False, nullable=True) diff --git a/cognee/tests/test_dataset_database_handler.py b/cognee/tests/test_dataset_database_handler.py index be1b249d2..e4c9b0177 100644 --- a/cognee/tests/test_dataset_database_handler.py +++ b/cognee/tests/test_dataset_database_handler.py @@ -30,6 +30,7 @@ class LanceDBTestDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): vector_db_name = "test.lance.db" return { + "vector_dataset_database_handler": "custom_lancedb_handler", "vector_database_name": vector_db_name, "vector_database_url": os.path.join(databases_directory_path, vector_db_name), "vector_database_provider": "lancedb", @@ -44,6 +45,7 @@ class KuzuTestDatasetDatabaseHandler(DatasetDatabaseHandlerInterface): graph_db_name = "test.kuzu" return { + "graph_dataset_database_handler": "custom_kuzu_handler", "graph_database_name": graph_db_name, "graph_database_url": os.path.join(databases_directory_path, graph_db_name), "graph_database_provider": "kuzu",