diff --git a/cognee/modules/data/methods/get_unique_data_id.py b/cognee/modules/data/methods/get_unique_data_id.py index 3fc184ce4..877b5930c 100644 --- a/cognee/modules/data/methods/get_unique_data_id.py +++ b/cognee/modules/data/methods/get_unique_data_id.py @@ -1,5 +1,4 @@ from uuid import uuid5, NAMESPACE_OID, UUID -from typing import Optional from sqlalchemy import select from cognee.modules.data.models.Data import Data @@ -7,7 +6,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models import User -async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID: +async def get_unique_data_id(data_identifier: str, user: User) -> UUID: """ Function returns a unique UUID for data based on data identifier, user id and tenant id. If data with legacy ID exists, return that ID to maintain compatibility. @@ -35,7 +34,7 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option # return UUID hash of file contents + owner id + tenant_id return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}") - def _get_modern_unique_data_id(data_identifier: str, user: User, tenant_id: UUID) -> UUID: + def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID: """ Function returns a unique UUID for data based on data identifier, user id and tenant id. Args: @@ -47,13 +46,11 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option UUID: Unique identifier for the data """ # return UUID hash of file contents + owner id + tenant_id - return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}") + return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}") # Get all possible data_id values data_id = { - "modern_data_id": _get_modern_unique_data_id( - data_identifier=data_identifier, user=user, tenant_id=tenant_id - ), + "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user), "legacy_data_id": _get_deprecated_unique_data_id( data_identifier=data_identifier, user=user ), diff --git a/cognee/modules/data/methods/get_unique_dataset_id.py b/cognee/modules/data/methods/get_unique_dataset_id.py index 2caf5fb55..274f24d1a 100644 --- a/cognee/modules/data/methods/get_unique_dataset_id.py +++ b/cognee/modules/data/methods/get_unique_dataset_id.py @@ -1,9 +1,71 @@ from uuid import UUID, uuid5, NAMESPACE_OID -from cognee.modules.users.models import User from typing import Union +from sqlalchemy import select + +from cognee.modules.data.models.Dataset import Dataset +from cognee.modules.users.models import User +from cognee.infrastructure.databases.relational import get_relational_engine async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: - if isinstance(dataset_name, UUID): - return dataset_name - return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}") + """ + Function returns a unique UUID for dataset based on dataset name, user id and tenant id. + If dataset with legacy ID exists, return that ID to maintain compatibility. + + Args: + dataset_name: string representing the dataset name + user: User object adding the dataset + tenant_id: UUID of the tenant for which dataset is being added + + Returns: + UUID: Unique identifier for the dataset + """ + + def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: + """ + Legacy function, returns a unique UUID for dataset based on dataset name and user id. + Needed to support legacy datasets without tenant information. + Args: + dataset_name: string representing the dataset name + user: Current User object adding the dataset + + Returns: + UUID: Unique identifier for the dataset + """ + if isinstance(dataset_name, UUID): + return dataset_name + return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}") + + def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: + """ + Returns a unique UUID for dataset based on dataset name, user id and tenant_id. + Args: + dataset_name: string representing the dataset name + user: Current User object adding the dataset + tenant_id: UUID of the tenant for which dataset is being added + + Returns: + UUID: Unique identifier for the dataset + """ + if isinstance(dataset_name, UUID): + return dataset_name + return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}") + + # Get all possible dataset_id values + dataset_id = { + "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user), + "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user), + } + + # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + legacy_dataset = ( + await session.execute( + select(Dataset).filter(Dataset.id == dataset_id["legacy_data_id"]) + ) + ).scalar_one_or_none() + + if not legacy_dataset: + return dataset_id["modern_dataset_id"] + return dataset_id["legacy_dataset_id"] diff --git a/cognee/modules/ingestion/identify.py b/cognee/modules/ingestion/identify.py index 5a0fe379e..640fce4a2 100644 --- a/cognee/modules/ingestion/identify.py +++ b/cognee/modules/ingestion/identify.py @@ -8,6 +8,4 @@ from cognee.modules.data.methods import get_unique_data_id async def identify(data: IngestionData, user: User) -> UUID: data_content_hash: str = data.get_identifier() - return await get_unique_data_id( - data_identifier=data_content_hash, user=user, tenant_id=user.tenant_id - ) + return await get_unique_data_id(data_identifier=data_content_hash, user=user)