refactor: Add better handling of configuration for dataset to database handler

This commit is contained in:
Igor Ilic 2025-11-25 15:41:01 +01:00
parent 64a3ee96c4
commit 593f17fcdc
6 changed files with 71 additions and 28 deletions

View file

@ -93,6 +93,8 @@ DB_NAME=cognee_db
# Default (local file-based) # Default (local file-based)
GRAPH_DATABASE_PROVIDER="kuzu" GRAPH_DATABASE_PROVIDER="kuzu"
# Handler for multi-user access control mode: it determines how separate graph DBs are mapped/created for each Cognee dataset
GRAPH_DATASET_DATABASE_HANDLER="kuzu"
# -- To switch to Remote Kuzu uncomment and fill these: ------------------------------------------------------------- # -- To switch to Remote Kuzu uncomment and fill these: -------------------------------------------------------------
#GRAPH_DATABASE_PROVIDER="kuzu" #GRAPH_DATABASE_PROVIDER="kuzu"
@ -117,6 +119,8 @@ VECTOR_DB_PROVIDER="lancedb"
# Not needed if a cloud vector database is not used # Not needed if a cloud vector database is not used
VECTOR_DB_URL= VECTOR_DB_URL=
VECTOR_DB_KEY= VECTOR_DB_KEY=
# Handler for multi-user access control mode: it determines how separate vector DBs are mapped/created for each Cognee dataset
VECTOR_DATASET_DATABASE_HANDLER="lancedb"
################################################################################ ################################################################################
# 🧩 Ontology resolver settings # 🧩 Ontology resolver settings

View file

@ -4,8 +4,8 @@ from typing import Union
from uuid import UUID from uuid import UUID
from cognee.base_config import get_base_config from cognee.base_config import get_base_config
from cognee.infrastructure.databases.vector.config import get_vectordb_context_config from cognee.infrastructure.databases.vector.config import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_context_config from cognee.infrastructure.databases.graph.config import get_graph_config
from cognee.infrastructure.databases.utils import get_or_create_dataset_database from cognee.infrastructure.databases.utils import get_or_create_dataset_database
from cognee.infrastructure.files.storage.config import file_storage_config from cognee.infrastructure.files.storage.config import file_storage_config
from cognee.modules.users.methods import get_user from cognee.modules.users.methods import get_user
@ -16,23 +16,59 @@ vector_db_config = ContextVar("vector_db_config", default=None)
graph_db_config = ContextVar("graph_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None)
session_user = ContextVar("session_user", default=None) session_user = ContextVar("session_user", default=None)
VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"]
GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor", "neo4j"]
async def set_session_user_context_variable(user): async def set_session_user_context_variable(user):
session_user.set(user) session_user.set(user)
def multi_user_support_possible(): def multi_user_support_possible():
graph_db_config = get_graph_context_config() graph_db_config = get_graph_config()
vector_db_config = get_vectordb_context_config() vector_db_config = get_vectordb_config()
# TODO: Make sure dataset database handler and provider match, remove multi_user support check, add error if no dataset database handler exists for provider
return ( graph_handler = graph_db_config.graph_dataset_database_handler
graph_db_config["graph_database_provider"] in GRAPH_DBS_WITH_MULTI_USER_SUPPORT vector_handler = vector_db_config.vector_dataset_database_handler
and vector_db_config["vector_db_provider"] in VECTOR_DBS_WITH_MULTI_USER_SUPPORT from cognee.infrastructure.databases.dataset_database_handler import (
supported_dataset_database_handlers,
) )
if graph_handler not in supported_dataset_database_handlers:
raise EnvironmentError(
"Unsupported graph dataset to database handler configured. Cannot add support for multi-user access control mode. Please use a supported graph dataset to database handler or set the environment variables ENABLE_BACKEND_ACCESS_CONTROL to false to switch off multi-user access control mode.\n"
f"Selected graph dataset to database handler: {graph_handler}\n"
f"Supported dataset to database handlers: {list(supported_dataset_database_handlers.keys())}\n"
)
if vector_handler not in supported_dataset_database_handlers:
raise EnvironmentError(
"Unsupported vector dataset to database handler configured. Cannot add support for multi-user access control mode. Please use a supported vector dataset to database handler or set the environment variables ENABLE_BACKEND_ACCESS_CONTROL to false to switch off multi-user access control mode.\n"
f"Selected vector dataset to database handler: {vector_handler}\n"
f"Supported dataset to database handlers: {list(supported_dataset_database_handlers.keys())}\n"
)
if (
supported_dataset_database_handlers[graph_handler]["handler_provider"]
!= graph_db_config.graph_database_provider
):
raise EnvironmentError(
"The selected graph dataset to database handler does not work with the configured graph database provider. Cannot add support for multi-user access control mode. Please use a supported graph dataset to database handler or set the environment variables ENABLE_BACKEND_ACCESS_CONTROL to false to switch off multi-user access control mode.\n"
f"Selected graph database provider: {graph_db_config.graph_database_provider}\n"
f"Selected graph dataset to database handler: {graph_handler}\n"
f"Supported dataset to database handlers: {list(supported_dataset_database_handlers.keys())}\n"
)
if (
supported_dataset_database_handlers[vector_handler]["handler_provider"]
!= vector_db_config.vector_db_provider
):
raise EnvironmentError(
"The selected vector dataset to database handler does not work with the configured vector database provider. Cannot add support for multi-user access control mode. Please use a supported vector dataset to database handler or set the environment variables ENABLE_BACKEND_ACCESS_CONTROL to false to switch off multi-user access control mode.\n"
f"Selected vector database provider: {vector_db_config.vector_db_provider}\n"
f"Selected vector dataset to database handler: {vector_handler}\n"
f"Supported dataset to database handlers: {list(supported_dataset_database_handlers.keys())}\n"
)
return True
def backend_access_control_enabled(): def backend_access_control_enabled():
backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None) backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None)
@ -42,12 +78,7 @@ def backend_access_control_enabled():
return multi_user_support_possible() return multi_user_support_possible()
elif backend_access_control.lower() == "true": elif backend_access_control.lower() == "true":
# If enabled, ensure that the current graph and vector DBs can support it # If enabled, ensure that the current graph and vector DBs can support it
multi_user_support = multi_user_support_possible() return multi_user_support_possible()
if not multi_user_support:
raise EnvironmentError(
"ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control."
)
return True
return False return False

View file

@ -9,7 +9,10 @@ from cognee.infrastructure.databases.graph.kuzu.KuzuDatasetDatabaseHandler impor
) )
supported_dataset_database_handlers = { supported_dataset_database_handlers = {
"neo4j_aura": Neo4jAuraDatasetDatabaseHandler, "neo4j_aura": {
"lancedb": LanceDBDatasetDatabaseHandler, "handler_instance": Neo4jAuraDatasetDatabaseHandler,
"kuzu": KuzuDatasetDatabaseHandler, "handler_provider": "neo4j",
},
"lancedb": {"handler_instance": LanceDBDatasetDatabaseHandler, "handler_provider": "lancedb"},
"kuzu": {"handler_instance": KuzuDatasetDatabaseHandler, "handler_provider": "kuzu"},
} }

View file

@ -1,5 +1,10 @@
from .supported_dataset_database_handlers import supported_dataset_database_handlers from .supported_dataset_database_handlers import supported_dataset_database_handlers
def use_dataset_database_handler(dataset_database_handler_name, dataset_database_handler): def use_dataset_database_handler(
supported_dataset_database_handlers[dataset_database_handler_name] = dataset_database_handler dataset_database_handler_name, dataset_database_handler, dataset_database_provider
):
supported_dataset_database_handlers[dataset_database_handler_name] = {
"handler_instance": dataset_database_handler,
"handler_provider": dataset_database_provider,
}

View file

@ -1,13 +1,9 @@
import os
import asyncio
import requests
from uuid import UUID from uuid import UUID
from typing import Union, Optional from typing import Union, Optional
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.exc import IntegrityError from sqlalchemy.exc import IntegrityError
from cognee.base_config import get_base_config
from cognee.modules.data.methods import create_dataset from cognee.modules.data.methods import create_dataset
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.infrastructure.databases.vector import get_vectordb_config from cognee.infrastructure.databases.vector import get_vectordb_config
@ -25,7 +21,7 @@ async def _get_vector_db_info(dataset_id: UUID, user: User) -> dict:
) )
handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler] handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler]
return await handler.create_dataset(dataset_id, user) return await handler["handler_instance"].create_dataset(dataset_id, user)
async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict: async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict:
@ -36,7 +32,7 @@ async def _get_graph_db_info(dataset_id: UUID, user: User) -> dict:
) )
handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler] handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler]
return await handler.create_dataset(dataset_id, user) return await handler["handler_instance"].create_dataset(dataset_id, user)
async def _existing_dataset_database( async def _existing_dataset_database(

View file

@ -534,6 +534,10 @@ def setup_logging(log_level=None, name=None):
# Get a configured logger and log system information # Get a configured logger and log system information
logger = structlog.get_logger(name if name else __name__) logger = structlog.get_logger(name if name else __name__)
logger.warning(
"From version 0.5.0 onwards, Cognee will run with multi-user access control mode set to on by default. Data isolation between different users and datasets will be enforced and data created before multi-user access control mode was turned on won't be accessible by default. To disable multi-user access control mode and regain access to old data set the environment variable ENABLE_BACKEND_ACCESS_CONTROL to false before starting Cognee. For more information, please refer to the Cognee documentation."
)
if logs_dir is not None: if logs_dir is not None:
logger.info(f"Log file created at: {log_file_path}", log_file=log_file_path) logger.info(f"Log file created at: {log_file_path}", log_file=log_file_path)