feat: add legacy and modern data_id calculating
This commit is contained in:
parent
e3b707a0c2
commit
b0f85c9e99
3 changed files with 78 additions and 4 deletions
|
|
@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
|
||||||
from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
|
from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
|
||||||
from .get_data import get_data
|
from .get_data import get_data
|
||||||
from .get_unique_dataset_id import get_unique_dataset_id
|
from .get_unique_dataset_id import get_unique_dataset_id
|
||||||
|
from .get_unique_data_id import get_unique_data_id
|
||||||
from .get_authorized_existing_datasets import get_authorized_existing_datasets
|
from .get_authorized_existing_datasets import get_authorized_existing_datasets
|
||||||
from .get_dataset_ids import get_dataset_ids
|
from .get_dataset_ids import get_dataset_ids
|
||||||
|
|
||||||
|
|
|
||||||
71
cognee/modules/data/methods/get_unique_data_id.py
Normal file
71
cognee/modules/data/methods/get_unique_data_id.py
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
from uuid import uuid5, NAMESPACE_OID, UUID
|
||||||
|
from typing import Optional
|
||||||
|
from sqlalchemy import select
|
||||||
|
|
||||||
|
from cognee.modules.data.models.Data import Data
|
||||||
|
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||||
|
from cognee.modules.users.models import User
|
||||||
|
|
||||||
|
|
||||||
|
async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID:
    """
    Resolve the UUID under which a piece of data should be stored.

    Two deterministic candidate IDs are derived with uuid5 over NAMESPACE_OID:

    * legacy: data identifier + user id (the scheme without tenant information)
    * modern: data identifier + user id + tenant id

    The relational database is then queried for a Data row whose id equals the
    legacy candidate. If such a row is found the legacy ID is returned so that
    already stored data keeps its ID; otherwise the modern ID is returned.

    Args:
        data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
        user: User object adding the data
        tenant_id: UUID of the tenant for which data is being added. When it is None,
            the literal text "None" is what gets hashed into the modern ID.

    Returns:
        UUID: Unique identifier for the data
    """
    owner_id = str(user.id)

    # Modern scheme: hash of data identifier + owner id + tenant id.
    modern_id = uuid5(NAMESPACE_OID, f"{data_identifier}{owner_id}{str(tenant_id)}")

    # Deprecated scheme: hash of data identifier + owner id only (no tenant id).
    # Still computed so data stored without tenant information can be found.
    legacy_id = uuid5(NAMESPACE_OID, f"{data_identifier}{owner_id}")

    # Prefer the legacy ID only when a Data row already exists under it.
    engine = get_relational_engine()
    async with engine.get_async_session() as session:
        lookup = await session.execute(select(Data).filter(Data.id == legacy_id))
        existing_legacy_row = lookup.scalar_one_or_none()

        return legacy_id if existing_legacy_row else modern_id
|
||||||
|
|
@ -1,11 +1,13 @@
|
||||||
from uuid import uuid5, NAMESPACE_OID
|
from uuid import UUID
|
||||||
from .data_types import IngestionData
|
from .data_types import IngestionData
|
||||||
|
|
||||||
from cognee.modules.users.models import User
|
from cognee.modules.users.models import User
|
||||||
|
from cognee.modules.data.methods import get_unique_data_id
|
||||||
|
|
||||||
|
|
||||||
def identify(data: IngestionData, user: User) -> str:
|
async def identify(data: IngestionData, user: User) -> UUID:
|
||||||
data_content_hash: str = data.get_identifier()
|
data_content_hash: str = data.get_identifier()
|
||||||
|
|
||||||
# return UUID hash of file contents + owner id
|
return await get_unique_data_id(
|
||||||
return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}")
|
data_identifier=data_content_hash, user=user, tenant_id=user.tenant_id
|
||||||
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue