feat: Add deduplication of data

Data is deduplicated per user: if a user tries to add data that already exists, the request is redirected to the existing data in the database.

Feature COG-505
Igor Ilic 2024-12-05 16:38:44 +01:00
parent 0ce254b262
commit f5b5e56cc1
6 changed files with 28 additions and 13 deletions
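For orientation, here is a minimal sketch (not part of the commit) of the deduplication key the diffs below introduce: the row id becomes a UUIDv5 of the file's content hash concatenated with the owner's id, so re-adding identical bytes as the same user resolves to the same id. Names here are illustrative.

```python
import hashlib
from uuid import NAMESPACE_OID, UUID, uuid4, uuid5

def dedup_id(raw: bytes, owner_id: UUID) -> UUID:
    # same bytes + same owner -> same id, which is what makes re-adds collapse
    content_hash = hashlib.md5(raw).hexdigest()
    return uuid5(NAMESPACE_OID, f"{content_hash}{owner_id}")

owner = uuid4()
assert dedup_id(b"report", owner) == dedup_id(b"report", owner)
```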

View file

@@ -1,4 +1,5 @@
from typing import BinaryIO, TypedDict
import hashlib
from .guess_file_type import guess_file_type
@@ -7,10 +8,14 @@ class FileMetadata(TypedDict):
file_path: str
mime_type: str
extension: str
content_hash: str
def get_file_metadata(file: BinaryIO) -> FileMetadata:
"""Get metadata from a file"""
file.seek(0)
content_hash = hashlib.md5(file.read()).hexdigest()
file.seek(0)
file_type = guess_file_type(file)
file_path = file.name
@@ -21,4 +26,5 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
file_path = file_path,
mime_type = file_type.mime,
extension = file_type.extension,
content_hash = content_hash,
)

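A small, self-contained reproduction of the hashing step added to get_file_metadata above; the helper name is illustrative. The surrounding file.seek(0) calls hash the whole stream and then rewind it so the next consumer (file-type detection, storage) still sees the full file.

```python
import hashlib
from io import BytesIO

def content_hash(file) -> str:
    file.seek(0)
    digest = hashlib.md5(file.read()).hexdigest()
    file.seek(0)  # leave the stream at the start for the next reader
    return digest

same_bytes = b"hello world"
assert content_hash(BytesIO(same_bytes)) == content_hash(BytesIO(same_bytes))
```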
View file

@@ -1,7 +1,6 @@
from datetime import datetime, timezone
from typing import List
from uuid import uuid4
from sqlalchemy import UUID, Column, DateTime, String
from sqlalchemy.orm import Mapped, relationship
@@ -19,6 +18,8 @@ class Data(Base):
extension = Column(String)
mime_type = Column(String)
raw_data_location = Column(String)
owner_id = Column(UUID, index=True)
content_hash = Column(String)
created_at = Column(
DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
)

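A sketch of the two new columns on the Data model as a standalone SQLAlchemy example (assumed table and class names, and the portable Uuid type so the snippet runs on in-memory SQLite; the real model imports UUID as shown above).

```python
import uuid

from sqlalchemy import Column, String, Uuid, create_engine, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class DataRow(Base):
    __tablename__ = "data"
    id = Column(Uuid, primary_key=True)
    name = Column(String)
    owner_id = Column(Uuid, index=True)   # new in this commit: who added the file
    content_hash = Column(String)         # new in this commit: md5 of the raw bytes

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

owner = uuid.uuid4()
with Session(engine) as session:
    session.add(DataRow(id=uuid.uuid4(), name="a.txt", owner_id=owner, content_hash="abc"))
    session.commit()
    # the new columns record ownership and content identity and are queryable
    existing = session.execute(
        select(DataRow).where(DataRow.owner_id == owner, DataRow.content_hash == "abc")
    ).scalar_one_or_none()
    assert existing is not None
```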
View file

@@ -2,7 +2,6 @@ import inspect
import json
import re
import warnings
from typing import Any
from uuid import UUID
from sqlalchemy import select
from typing import Any, BinaryIO, Union

View file

@@ -17,7 +17,7 @@ class BinaryData(IngestionData):
def get_identifier(self):
metadata = self.get_metadata()
return self.name + "." + metadata["extension"]
return metadata["content_hash"]
def get_metadata(self):
self.ensure_metadata()

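The identifier change above means two uploads with identical bytes but different file names now share one identifier. A hypothetical stand-in class (not the project's actual BinaryData) to illustrate:

```python
class StubBinaryData:
    def __init__(self, name: str, content_hash: str):
        self.name = name
        self._metadata = {"content_hash": content_hash, "extension": "txt"}

    def get_metadata(self):
        return self._metadata

    def get_identifier(self):
        # after this commit: identity is the content hash, not name + extension
        return self.get_metadata()["content_hash"]

a = StubBinaryData("report.txt", "5d41402abc4b2a76b9719d911017c592")
b = StubBinaryData("report_copy.txt", "5d41402abc4b2a76b9719d911017c592")
assert a.get_identifier() == b.get_identifier()  # same bytes, different names
```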
View file

@@ -1,7 +1,9 @@
from uuid import uuid5, NAMESPACE_OID
from .data_types import IngestionData
def identify(data: IngestionData) -> str:
data_id: str = data.get_identifier()
from cognee.modules.users.models import User
return uuid5(NAMESPACE_OID, data_id)
def identify(data: IngestionData, user: User) -> str:
data_content_hash: str = data.get_identifier()
# return UUID hash of file contents + owner id
return uuid5(NAMESPACE_OID,f"{data_content_hash}{user.id}")

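The new identify is deterministic and scoped per user. A quick check with plain uuid5 (the content hash literal is just md5("hello"), for illustration):

```python
from uuid import NAMESPACE_OID, uuid4, uuid5

content_hash = "5d41402abc4b2a76b9719d911017c592"  # md5("hello"), for illustration
owner = uuid4()         # stand-in for user.id
other_owner = uuid4()

# same contents + same owner always map to the same id ...
assert uuid5(NAMESPACE_OID, f"{content_hash}{owner}") == uuid5(
    NAMESPACE_OID, f"{content_hash}{owner}"
)
# ... while another user adding identical contents gets a distinct id
assert uuid5(NAMESPACE_OID, f"{content_hash}{owner}") != uuid5(
    NAMESPACE_OID, f"{content_hash}{other_owner}"
)
```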
View file

@@ -5,7 +5,6 @@ import cognee.modules.ingestion as ingestion
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.methods import create_dataset
from cognee.modules.data.models.DatasetData import DatasetData
from cognee.modules.data.operations.delete_metadata import delete_metadata
from cognee.modules.users.models import User
from cognee.modules.users.permissions.methods import give_permission_on_document
from cognee.shared.utils import send_telemetry
@@ -25,11 +24,11 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
)
@dlt.resource(standalone=True, merge_key="id")
async def data_resources(file_paths: str):
async def data_resources(file_paths: str, user: User):
for file_path in file_paths:
with open(file_path.replace("file://", ""), mode="rb") as file:
classified_data = ingestion.classify(file)
data_id = ingestion.identify(classified_data)
data_id = ingestion.identify(classified_data, user)
file_metadata = classified_data.get_metadata()
yield {
"id": data_id,
@@ -37,6 +36,8 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
"file_path": file_metadata["file_path"],
"extension": file_metadata["extension"],
"mime_type": file_metadata["mime_type"],
"content_hash": file_metadata["content_hash"],
"owner_id": str(user.id),
}
async def data_storing(data: Any, dataset_name: str, user: User):
@@ -58,7 +59,8 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
with open(file_path.replace("file://", ""), mode = "rb") as file:
classified_data = ingestion.classify(file)
data_id = ingestion.identify(classified_data)
# data_id is the hash of file contents + owner id to avoid duplicate data
data_id = ingestion.identify(classified_data, user)
file_metadata = classified_data.get_metadata()
@@ -71,6 +73,7 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
async with db_engine.get_async_session() as session:
dataset = await create_dataset(dataset_name, user.id, session)
# Check to see if data should be updated
data_point = (
await session.execute(select(Data).filter(Data.id == data_id))
).scalar_one_or_none()
@@ -80,6 +83,8 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
data_point.raw_data_location = file_metadata["file_path"]
data_point.extension = file_metadata["extension"]
data_point.mime_type = file_metadata["mime_type"]
data_point.owner_id = user.id
data_point.content_hash = file_metadata["content_hash"]
await session.merge(data_point)
else:
data_point = Data(
@@ -87,7 +92,9 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
name = file_metadata["name"],
raw_data_location = file_metadata["file_path"],
extension = file_metadata["extension"],
mime_type = file_metadata["mime_type"]
mime_type = file_metadata["mime_type"],
owner_id = user.id,
content_hash = file_metadata["content_hash"],
)
# Check if data is already in dataset
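A hedged sketch of the update-or-insert flow above, with a plain dict standing in for the Data table; the names are illustrative, not the project's API.

```python
import hashlib
from uuid import NAMESPACE_OID, uuid4, uuid5

table: dict = {}  # data_id -> row, standing in for the relational table

def store(raw: bytes, name: str, owner_id) -> None:
    content_hash = hashlib.md5(raw).hexdigest()
    data_id = uuid5(NAMESPACE_OID, f"{content_hash}{owner_id}")
    row = table.get(data_id)  # analogous to select(Data).filter(Data.id == data_id)
    if row is not None:
        # update branch: refresh metadata on the existing row
        row.update(name=name, owner_id=owner_id, content_hash=content_hash)
    else:
        # insert branch: create a new row keyed by the deterministic id
        table[data_id] = {
            "id": data_id,
            "name": name,
            "owner_id": owner_id,
            "content_hash": content_hash,
        }

owner = uuid4()
store(b"same bytes", "a.txt", owner)
store(b"same bytes", "a_copy.txt", owner)  # duplicate contents: updated, not added
assert len(table) == 1
```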
@@ -118,14 +125,14 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
# To use sqlite with dlt dataset_name must be set to "main".
# Sqlite doesn't support schemas
run_info = pipeline.run(
data_resources(file_paths),
data_resources(file_paths, user),
table_name="file_metadata",
dataset_name="main",
write_disposition="merge",
)
else:
run_info = pipeline.run(
data_resources(file_paths),
data_resources(file_paths, user),
table_name="file_metadata",
dataset_name=dataset_name,
write_disposition="merge",
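
Finally, a rough sketch of why write_disposition="merge" pairs well with the deterministic id: re-running the pipeline with the same key replaces the row instead of appending a duplicate. This assumes dlt with the duckdb extra installed; the pipeline name, resource, and id value below are made up for the demo (in the real pipeline the id comes from ingestion.identify).

```python
import dlt

@dlt.resource(standalone=True, merge_key="id")
def file_metadata_rows():
    # the id is deterministic: identical content + owner always yields the same key
    yield {"id": "content-hash-plus-owner", "name": "a.txt"}

pipeline = dlt.pipeline(pipeline_name="dedup_demo", destination="duckdb", dataset_name="main")
pipeline.run(file_metadata_rows(), table_name="file_metadata", write_disposition="merge")
pipeline.run(file_metadata_rows(), table_name="file_metadata", write_disposition="merge")
# after both runs the destination still holds a single row for this id
```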