Initial commit, still WIP

This commit is contained in:
Andrej Milicevic 2025-10-27 08:12:37 +01:00
parent 1f49ca265f
commit 813ee94836
5 changed files with 51 additions and 7 deletions

View file

@ -57,19 +57,34 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_
# Set vector and graph database configuration based on dataset database information
vector_config = {
"vector_db_url": os.path.join(
databases_directory_path, dataset_database.vector_database_name
),
"vector_db_key": "",
"vector_db_provider": "lancedb",
"vector_db_provider": dataset_database.vector_database_provider,
"vector_db_url": dataset_database.vector_database_url,
# TODO: Consider storing the vector DB key on dataset_database and reading it from there here.
"vector_db_key": ""
}
# vector_config = {
# "vector_db_url": os.path.join(
# databases_directory_path, dataset_database.vector_database_name
# ),
# "vector_db_key": "",
# "vector_db_provider": "lancedb",
# }
graph_config = {
"graph_database_provider": "kuzu",
"graph_database_provider": dataset_database.graph_database_provider,
"graph_database_url": dataset_database.graph_database_url,
"graph_database_name": dataset_database.graph_database_name,
"graph_file_path": os.path.join(
databases_directory_path, dataset_database.graph_database_name
),
}
# graph_config = {
# "graph_database_provider": "kuzu",
# "graph_file_path": os.path.join(
# databases_directory_path, dataset_database.graph_database_name
# ),
# }
storage_config = {
"data_root_directory": data_root_directory,

View file

@ -69,6 +69,7 @@ def create_graph_engine(
graph_database_url=graph_database_url,
graph_database_username=graph_database_username,
graph_database_password=graph_database_password,
graph_name=graph_database_name,
)
if graph_database_provider == "neo4j":

View file

@ -6,11 +6,20 @@ from sqlalchemy.exc import IntegrityError
from cognee.modules.data.methods import create_dataset
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.infrastructure.databases.vector import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_config
from cognee.modules.data.methods import get_unique_dataset_id
from cognee.modules.users.models import DatasetDatabase
from cognee.modules.users.models import User
# TODO: Find a better place to define these
# Module-level default values for the per-dataset vector and graph database settings
# (database name, provider identifier, and connection URL).
# NOTE(review): none of these names are referenced in the visible hunks of this change —
# confirm where they are meant to be consumed.
default_vector_db_name = "lance.db"
default_vector_db_provider = "lancedb"
default_graph_db_provider = "kuzu"
# A URL of None means no default URL is configured for that store.
default_vector_db_url = None
default_graph_db_url = None
async def get_or_create_dataset_database(
dataset: Union[str, UUID],
user: User,
@ -32,9 +41,12 @@ async def get_or_create_dataset_database(
dataset_id = await get_unique_dataset_id(dataset, user)
vector_db_name = f"{dataset_id}.lance.db"
vector_db_name = f"{dataset_id}.db"
graph_db_name = f"{dataset_id}.pkl"
vector_config = get_vectordb_config()
graph_config = get_graph_config()
async with db_engine.get_async_session() as session:
# Create dataset if it doesn't exist
if isinstance(dataset, str):
@ -49,12 +61,19 @@ async def get_or_create_dataset_database(
if existing:
return existing
# TODO: Set the vector and graph database fields (name, provider, etc.) based on whether or not
# we support multi-user for that db. If not, set them to the defaults, which are lance and/or kuzu.
# If there are no existing rows build a new row
record = DatasetDatabase(
owner_id=user.id,
dataset_id=dataset_id,
vector_database_name=vector_db_name,
graph_database_name=graph_db_name,
vector_database_provider=vector_config.vector_db_provider,
graph_database_provider=graph_config.graph_database_provider,
vector_database_url=vector_config.vector_db_url,
graph_database_url=graph_config.graph_database_url,
)
try:

View file

@ -1,5 +1,6 @@
from .supported_databases import supported_databases
from .embeddings import get_embedding_engine
from cognee.infrastructure.databases.graph.config import get_graph_config
from functools import lru_cache
@ -45,6 +46,7 @@ def create_vector_engine(
url=vector_db_url,
api_key=vector_db_key,
embedding_engine=embedding_engine,
graph_name=get_graph_config().graph_database_name
)
if vector_db_provider == "pgvector":

View file

@ -12,8 +12,15 @@ class DatasetDatabase(Base):
UUID, ForeignKey("datasets.id", ondelete="CASCADE"), primary_key=True, index=True
)
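# TODO: Why is this unique? Isn't it a fact that two or more datasets can share the same vector and graph store?
# The same question applies to the provider and URL columns below: with unique=True, two datasets cannot use the same provider or URL.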
vector_database_name = Column(String, unique=True, nullable=False)
graph_database_name = Column(String, unique=True, nullable=False)
vector_database_provider = Column(String, unique=True, nullable=False)
graph_database_provider = Column(String, unique=True, nullable=False)
vector_database_url = Column(String, unique=True, nullable=True)
graph_database_url = Column(String, unique=True, nullable=True)
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))