feat: Add dataset_id calculation that handles legacy dataset_id

This commit is contained in:
Igor Ilic 2025-11-04 13:11:57 +01:00
parent b0f85c9e99
commit ff388179fb
3 changed files with 71 additions and 14 deletions

View file

@ -1,5 +1,4 @@
from uuid import uuid5, NAMESPACE_OID, UUID from uuid import uuid5, NAMESPACE_OID, UUID
from typing import Optional
from sqlalchemy import select from sqlalchemy import select
from cognee.modules.data.models.Data import Data from cognee.modules.data.models.Data import Data
@ -7,7 +6,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.users.models import User from cognee.modules.users.models import User
async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID: async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
""" """
Function returns a unique UUID for data based on data identifier, user id and tenant id. Function returns a unique UUID for data based on data identifier, user id and tenant id.
If data with legacy ID exists, return that ID to maintain compatibility. If data with legacy ID exists, return that ID to maintain compatibility.
@ -35,7 +34,7 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option
# return UUID hash of file contents + owner id + tenant_id # return UUID hash of file contents + owner id + tenant_id
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}") return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
def _get_modern_unique_data_id(data_identifier: str, user: User, tenant_id: UUID) -> UUID: def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
""" """
Function returns a unique UUID for data based on data identifier, user id and tenant id. Function returns a unique UUID for data based on data identifier, user id and tenant id.
Args: Args:
@ -47,13 +46,11 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option
UUID: Unique identifier for the data UUID: Unique identifier for the data
""" """
# return UUID hash of file contents + owner id + tenant_id # return UUID hash of file contents + owner id + tenant_id
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}") return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
# Get all possible data_id values # Get all possible data_id values
data_id = { data_id = {
"modern_data_id": _get_modern_unique_data_id( "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
data_identifier=data_identifier, user=user, tenant_id=tenant_id
),
"legacy_data_id": _get_deprecated_unique_data_id( "legacy_data_id": _get_deprecated_unique_data_id(
data_identifier=data_identifier, user=user data_identifier=data_identifier, user=user
), ),

View file

@ -1,9 +1,71 @@
from uuid import UUID, uuid5, NAMESPACE_OID from uuid import UUID, uuid5, NAMESPACE_OID
from cognee.modules.users.models import User
from typing import Union from typing import Union
from sqlalchemy import select
from cognee.modules.data.models.Dataset import Dataset
from cognee.modules.users.models import User
from cognee.infrastructure.databases.relational import get_relational_engine
async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
    """
    Return the unique UUID of a dataset for the given dataset name and user.

    If a dataset stored under the legacy ID (derived from the dataset name and
    the user id only) already exists, that legacy ID is returned to maintain
    compatibility. Otherwise the modern ID, which additionally includes the
    user's tenant id, is returned.

    Args:
        dataset_name: Dataset name, or a UUID that already is the dataset id
        user: User object adding the dataset; its id and tenant_id are used

    Returns:
        UUID: Unique identifier for the dataset
    """
    # A UUID is returned unchanged by both the legacy and the modern scheme,
    # so the database lookup below could never change the result for it.
    if isinstance(dataset_name, UUID):
        return dataset_name

    # Legacy scheme: dataset name + owner id (no tenant information).
    legacy_dataset_id = uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
    # Modern scheme: dataset name + owner id + tenant id.
    modern_dataset_id = uuid5(
        NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}"
    )

    # Prefer the legacy ID only when a dataset was already stored under it.
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        legacy_dataset = (
            await session.execute(select(Dataset).filter(Dataset.id == legacy_dataset_id))
        ).scalar_one_or_none()

        if not legacy_dataset:
            return modern_dataset_id
        return legacy_dataset_id

View file

@ -8,6 +8,4 @@ from cognee.modules.data.methods import get_unique_data_id
async def identify(data: IngestionData, user: User) -> UUID:
    """
    Resolve the unique data ID for a piece of ingestion data.

    Args:
        data: Ingestion data; its get_identifier() value is used as the data identifier
        user: User on whose behalf the data is being identified

    Returns:
        UUID: The value returned by get_unique_data_id for this identifier and user
    """
    content_identifier: str = data.get_identifier()
    unique_data_id = await get_unique_data_id(data_identifier=content_identifier, user=user)
    return unique_data_id