feat: Add dataset_id calculation that handles legacy dataset_id
This commit is contained in:
parent
b0f85c9e99
commit
ff388179fb
3 changed files with 71 additions and 14 deletions
|
|
@ -1,5 +1,4 @@
|
|||
from uuid import uuid5, NAMESPACE_OID, UUID
|
||||
from typing import Optional
|
||||
from sqlalchemy import select
|
||||
|
||||
from cognee.modules.data.models.Data import Data
|
||||
|
|
@ -7,7 +6,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine
|
|||
from cognee.modules.users.models import User
|
||||
|
||||
|
||||
async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID:
|
||||
async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
|
||||
"""
|
||||
Function returns a unique UUID for data based on data identifier, user id and tenant id.
|
||||
If data with legacy ID exists, return that ID to maintain compatibility.
|
||||
|
|
@ -35,7 +34,7 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option
|
|||
# return UUID hash of file contents + owner id (legacy scheme, no tenant_id)
|
||||
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
|
||||
|
||||
def _get_modern_unique_data_id(data_identifier: str, user: User, tenant_id: UUID) -> UUID:
|
||||
def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
|
||||
"""
|
||||
Function returns a unique UUID for data based on data identifier, user id and tenant id.
|
||||
Args:
|
||||
|
|
@ -47,13 +46,11 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option
|
|||
UUID: Unique identifier for the data
|
||||
"""
|
||||
# return UUID hash of file contents + owner id + tenant_id
|
||||
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}")
|
||||
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
|
||||
|
||||
# Get all possible data_id values
|
||||
data_id = {
|
||||
"modern_data_id": _get_modern_unique_data_id(
|
||||
data_identifier=data_identifier, user=user, tenant_id=tenant_id
|
||||
),
|
||||
"modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
|
||||
"legacy_data_id": _get_deprecated_unique_data_id(
|
||||
data_identifier=data_identifier, user=user
|
||||
),
|
||||
|
|
|
|||
|
|
@ -1,9 +1,71 @@
|
|||
from uuid import UUID, uuid5, NAMESPACE_OID
|
||||
from cognee.modules.users.models import User
|
||||
from typing import Union
|
||||
from sqlalchemy import select
|
||||
|
||||
from cognee.modules.data.models.Dataset import Dataset
|
||||
from cognee.modules.users.models import User
|
||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||
|
||||
|
||||
async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
    """
    Return a unique UUID for a dataset based on dataset name, user id and tenant id.

    If a dataset stored under the legacy ID (dataset name + user id, no tenant
    information) already exists, that legacy ID is returned to maintain compatibility.

    Args:
        dataset_name: Name of the dataset, or a UUID that already identifies it
        user: User object adding the dataset; its ``id`` and ``tenant_id`` are hashed

    Returns:
        UUID: Unique identifier for the dataset
    """
    # A UUID is already an identifier: the legacy and the modern scheme both return
    # it unchanged, so no database lookup is needed to choose between them.
    if isinstance(dataset_name, UUID):
        return dataset_name

    # Legacy scheme: dataset name + user id (datasets created without tenant information).
    legacy_dataset_id = uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
    # Modern scheme: dataset name + user id + tenant id.
    modern_dataset_id = uuid5(
        NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}"
    )

    # Check if a dataset with the legacy id exists; if so keep using it, else use the modern id.
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        legacy_dataset = (
            await session.execute(select(Dataset).filter(Dataset.id == legacy_dataset_id))
        ).scalar_one_or_none()

    if not legacy_dataset:
        return modern_dataset_id
    return legacy_dataset_id
|
||||
|
|
|
|||
|
|
@ -8,6 +8,4 @@ from cognee.modules.data.methods import get_unique_data_id
|
|||
async def identify(data: IngestionData, user: User) -> UUID:
    """
    Return the unique data id for a piece of ingestion data added by a user.

    Args:
        data: Ingestion data whose identifier (from ``get_identifier()``) is used
        user: User object adding the data

    Returns:
        UUID: The id produced by ``get_unique_data_id`` for this identifier and user
    """
    data_content_hash: str = data.get_identifier()

    # get_unique_data_id no longer takes a tenant_id argument; only the user is passed.
    return await get_unique_data_id(data_identifier=data_content_hash, user=user)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue