feat: Add dataset_id calculation that handles legacy dataset_id

This commit is contained in:
Igor Ilic 2025-11-04 13:11:57 +01:00
parent b0f85c9e99
commit ff388179fb
3 changed files with 71 additions and 14 deletions

View file

@ -1,5 +1,4 @@
from uuid import uuid5, NAMESPACE_OID, UUID
from typing import Optional
from sqlalchemy import select
from cognee.modules.data.models.Data import Data
@ -7,7 +6,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.users.models import User
async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID:
async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
"""
Function returns a unique UUID for data based on data identifier, user id and tenant id.
If data with legacy ID exists, return that ID to maintain compatibility.
@ -35,7 +34,7 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option
# return UUID hash of file contents + owner id + tenant_id
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
def _get_modern_unique_data_id(data_identifier: str, user: User, tenant_id: UUID) -> UUID:
def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
"""
Function returns a unique UUID for data based on data identifier, user id and tenant id.
Args:
@ -47,13 +46,11 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option
UUID: Unique identifier for the data
"""
# return UUID hash of file contents + owner id + tenant_id
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}")
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
# Get all possible data_id values
data_id = {
"modern_data_id": _get_modern_unique_data_id(
data_identifier=data_identifier, user=user, tenant_id=tenant_id
),
"modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
"legacy_data_id": _get_deprecated_unique_data_id(
data_identifier=data_identifier, user=user
),

View file

@ -1,9 +1,71 @@
from uuid import UUID, uuid5, NAMESPACE_OID
from cognee.modules.users.models import User
from typing import Union
from sqlalchemy import select
from cognee.modules.data.models.Dataset import Dataset
from cognee.modules.users.models import User
from cognee.infrastructure.databases.relational import get_relational_engine
async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
if isinstance(dataset_name, UUID):
return dataset_name
return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
"""
Function returns a unique UUID for dataset based on dataset name, user id and tenant id.
If dataset with legacy ID exists, return that ID to maintain compatibility.
Args:
dataset_name: string representing the dataset name
user: User object adding the dataset
tenant_id: UUID of the tenant for which dataset is being added
Returns:
UUID: Unique identifier for the dataset
"""
def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
"""
Legacy function, returns a unique UUID for dataset based on dataset name and user id.
Needed to support legacy datasets without tenant information.
Args:
dataset_name: string representing the dataset name
user: Current User object adding the dataset
Returns:
UUID: Unique identifier for the dataset
"""
if isinstance(dataset_name, UUID):
return dataset_name
return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
"""
Returns a unique UUID for dataset based on dataset name, user id and tenant_id.
Args:
dataset_name: string representing the dataset name
user: Current User object adding the dataset
tenant_id: UUID of the tenant for which dataset is being added
Returns:
UUID: Unique identifier for the dataset
"""
if isinstance(dataset_name, UUID):
return dataset_name
return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}")
# Get all possible dataset_id values
dataset_id = {
"modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user),
"legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user),
}
# Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
legacy_dataset = (
await session.execute(
select(Dataset).filter(Dataset.id == dataset_id["legacy_data_id"])
)
).scalar_one_or_none()
if not legacy_dataset:
return dataset_id["modern_dataset_id"]
return dataset_id["legacy_dataset_id"]

View file

@ -8,6 +8,4 @@ from cognee.modules.data.methods import get_unique_data_id
async def identify(data: IngestionData, user: User) -> UUID:
data_content_hash: str = data.get_identifier()
return await get_unique_data_id(
data_identifier=data_content_hash, user=user, tenant_id=user.tenant_id
)
return await get_unique_data_id(data_identifier=data_content_hash, user=user)