feat: add legacy and modern data_id calculation

2025-11-04 13:01:10 +01:00 · 2025-11-04 13:01:10 +01:00 · b0f85c9e99
commit b0f85c9e99
parent e3b707a0c2
3 changed files with 78 additions and 4 deletions
--- a/cognee/modules/data/methods/__init__.py
+++ b/cognee/modules/data/methods/__init__.py
@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
 from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
 from .get_data import get_data
 from .get_unique_dataset_id import get_unique_dataset_id
 from .get_unique_data_id import get_unique_data_id
 from .get_authorized_existing_datasets import get_authorized_existing_datasets
 from .get_dataset_ids import get_dataset_ids
--- a/cognee/modules/data/methods/get_unique_data_id.py
+++ b/cognee/modules/data/methods/get_unique_data_id.py
@ -0,0 +1,71 @@
 from uuid import uuid5, NAMESPACE_OID, UUID
 from typing import Optional
 from sqlalchemy import select
 from cognee.modules.data.models.Data import Data
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.users.models import User
 async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID:
     """
     Return a deterministic UUID for a data item, preferring an already stored legacy ID.
     Two candidate IDs exist for the same data: the legacy one (data identifier + user id)
     and the modern one (data identifier + user id + tenant id). If a Data row with the
     legacy ID is already present in the relational database, that ID is returned so the
     existing record keeps being addressed; otherwise the modern ID is returned.
     Args:
         data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
         user: User object adding the data; only its ``id`` attribute is read here
         tenant_id: UUID of the tenant for which data is being added. May be None, in
             which case the literal text "None" takes the tenant's place in the hash input.
     Returns:
         UUID: The legacy ID when a matching Data row exists, else the modern ID
     """
     def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID:
         """
         Deprecated scheme: build the ID from the data identifier and user id only.
         Needed to support legacy data stored without tenant information.
         Args:
             data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
             user: User object adding the data
         Returns:
             UUID: Legacy identifier for the data
         """
         # UUID hash of data identifier + owner id (the legacy scheme has no tenant component)
         return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
     def _get_modern_unique_data_id(
         data_identifier: str, user: User, tenant_id: Optional[UUID]
     ) -> UUID:
         """
         Current scheme: build the ID from the data identifier, user id and tenant id.
         Args:
             data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
             user: User object adding the data
             tenant_id: UUID of the tenant for which data is being added (may be None)
         Returns:
             UUID: Modern identifier for the data
         """
         # UUID hash of data identifier + owner id + tenant_id
         return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}")
     legacy_data_id = _get_deprecated_unique_data_id(data_identifier=data_identifier, user=user)
     # Only existence matters, so fetch the primary key instead of hydrating a full Data row
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         stored_legacy_id = (
             await session.execute(select(Data.id).where(Data.id == legacy_data_id))
         ).scalar_one_or_none()
     # Keep using the legacy ID for data that was stored before tenant-aware IDs existed
     if stored_legacy_id is not None:
         return legacy_data_id
     # No legacy record found: derive the tenant-aware ID (computed only when needed)
     return _get_modern_unique_data_id(
         data_identifier=data_identifier, user=user, tenant_id=tenant_id
     )
--- a/cognee/modules/ingestion/identify.py
+++ b/cognee/modules/ingestion/identify.py
@ -1,11 +1,13 @@
-from uuid import uuid5, NAMESPACE_OID
+from uuid import UUID
 from .data_types import IngestionData
 from cognee.modules.users.models import User
 from cognee.modules.data.methods import get_unique_data_id
-def identify(data: IngestionData, user: User) -> str:
+async def identify(data: IngestionData, user: User) -> UUID:
    data_content_hash: str = data.get_identifier()
-    # return UUID hash of file contents + owner id
+    return await get_unique_data_id(
-    return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}")
+        data_identifier=data_content_hash, user=user, tenant_id=user.tenant_id
    )