fix: changing deletion logic to use document id instead of content hash (#1210)
## Description

Changes the deletion logic to use the document id instead of the content hash.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
Parent: a9e74dac42
Commit: 4e816ad80b

6 changed files with 83 additions and 62 deletions
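For context, here is a minimal sketch of how deletion is invoked after this change, mirroring the updated test further down: documents are added, cognified, and then deleted by their data id rather than by content hash. The import path for `get_dataset_data` is an assumption, since the test's import lines are not part of this diff.

```python
import asyncio

import cognee

# Assumed import path; the updated test calls get_dataset_data(dataset_id),
# but its import lines are not shown in this diff.
from cognee.modules.data.methods import get_dataset_data


async def delete_by_document_id():
    # Ingest some content and build the knowledge graph.
    add_result = await cognee.add(["Some text to ingest."])
    dataset_id = add_result.dataset_id
    await cognee.cognify()

    # Deletion is now keyed by the document's data id, not its content hash.
    dataset_data = await get_dataset_data(dataset_id)
    for data_item in dataset_data:
        await cognee.delete(data_item.id, dataset_id, mode="soft")


if __name__ == "__main__":
    asyncio.run(delete_by_document_id())
```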
```diff
@@ -16,7 +16,11 @@ from cognee.modules.users.methods import get_default_user
 from cognee.modules.data.methods import get_authorized_existing_datasets
 from cognee.context_global_variables import set_database_global_context_variables
 
-from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
+from cognee.api.v1.delete.exceptions import (
+    DocumentNotFoundError,
+    DatasetNotFoundError,
+    DocumentSubgraphNotFoundError,
+)
 
 logger = get_logger()
 
@@ -82,17 +86,17 @@ async def delete(
         raise DocumentNotFoundError(f"Data {data_id} not found in dataset {dataset_id}")
 
     # Get the content hash for deletion
-    content_hash = data_point.content_hash
+    data_id = str(data_point.id)
 
     # Use the existing comprehensive deletion logic
-    return await delete_single_document(content_hash, dataset.id, mode)
+    return await delete_single_document(data_id, dataset.id, mode)
 
 
-async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
+async def delete_single_document(data_id: str, dataset_id: UUID = None, mode: str = "soft"):
     """Delete a single document by its content hash."""
 
     # Delete from graph database
-    deletion_result = await delete_document_subgraph(content_hash, mode)
+    deletion_result = await delete_document_subgraph(data_id, mode)
 
     logger.info(f"Deletion result: {deletion_result}")
 
@@ -163,12 +167,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
         # Get the data point
         data_point = (
-            await session.execute(select(Data).filter(Data.content_hash == content_hash))
+            await session.execute(select(Data).filter(Data.id == UUID(data_id)))
         ).scalar_one_or_none()
 
         if data_point is None:
             raise DocumentNotFoundError(
-                f"Document not found in relational DB with content hash: {content_hash}"
+                f"Document not found in relational DB with data id: {data_id}"
             )
 
         doc_id = data_point.id
 
@@ -203,7 +207,7 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
             "status": "success",
             "message": "Document deleted from both graph and relational databases",
             "graph_deletions": deletion_result["deleted_counts"],
-            "content_hash": content_hash,
+            "data_id": data_id,
             "dataset": dataset_id,
             "deleted_node_ids": [
                 str(node_id) for node_id in deleted_node_ids
@@ -211,12 +215,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
         }
 
 
-async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
+async def delete_document_subgraph(document_id: str, mode: str = "soft"):
     """Delete a document and all its related nodes in the correct order."""
     graph_db = await get_graph_engine()
-    subgraph = await graph_db.get_document_subgraph(content_hash)
+    subgraph = await graph_db.get_document_subgraph(document_id)
     if not subgraph:
-        raise DocumentSubgraphNotFoundError(f"Document not found with content hash: {content_hash}")
+        raise DocumentSubgraphNotFoundError(f"Document not found with id: {document_id}")
 
     # Delete in the correct order to maintain graph integrity
     deletion_order = [
@@ -260,6 +264,6 @@ async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
     return {
         "status": "success",
         "deleted_counts": deleted_counts,
-        "content_hash": content_hash,
+        "document_id": document_id,
         "deleted_node_ids": deleted_node_ids,
     }
```
```diff
@@ -1524,7 +1524,7 @@ class KuzuAdapter(GraphDBInterface):
             logger.error(f"Error during database clearing: {e}")
             raise
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Get all nodes that should be deleted when removing a document.
 
@@ -1535,7 +1535,7 @@ class KuzuAdapter(GraphDBInterface):
         Parameters:
         -----------
 
-            - content_hash (str): The identifier for the document to query against.
+            - data_id (str): The identifier for the document to query against.
 
         Returns:
         --------
@@ -1545,7 +1545,7 @@ class KuzuAdapter(GraphDBInterface):
         """
         query = """
         MATCH (doc:Node)
-        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.name = $content_hash
+        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.id = $data_id
 
         OPTIONAL MATCH (doc)<-[e1:EDGE]-(chunk:Node)
         WHERE e1.relationship_name = 'is_part_of' AND chunk.type = 'DocumentChunk'
@@ -1583,7 +1583,7 @@ class KuzuAdapter(GraphDBInterface):
         COLLECT(DISTINCT made_node) as made_from_nodes,
         COLLECT(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": f"text_{content_hash}"})
+        result = await self.query(query, {"data_id": f"{data_id}"})
         if not result or not result[0]:
             return None
 
```
```diff
@@ -1252,7 +1252,7 @@ class Neo4jAdapter(GraphDBInterface):
 
         return mandatory_metrics | optional_metrics
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Retrieve a subgraph related to a document identified by its content hash, including
         related entities and chunks.
@@ -1271,7 +1271,7 @@ class Neo4jAdapter(GraphDBInterface):
         query = """
         MATCH (doc)
         WHERE (doc:TextDocument OR doc:PdfDocument)
-        AND doc.name = 'text_' + $content_hash
+        AND doc.id = $data_id
 
         OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk)
         OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity)
@@ -1295,7 +1295,7 @@ class Neo4jAdapter(GraphDBInterface):
         collect(DISTINCT made_node) as made_from_nodes,
         collect(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": content_hash})
+        result = await self.query(query, {"data_id": data_id})
         return result[0] if result else None
 
     async def get_degree_one_nodes(self, node_type: str):
```
```diff
@@ -510,12 +510,12 @@ class NeptuneGraphDB(GraphDBInterface):
 
         query = f"""
         MATCH (source:{self._GRAPH_NODE_LABEL})
-        WHERE id(source) = $source_id
-        MATCH (target:{self._GRAPH_NODE_LABEL})
-        WHERE id(target) = $target_id
-        MERGE (source)-[r:{relationship_name}]->(target)
-        ON CREATE SET r = $properties, r.updated_at = timestamp()
-        ON MATCH SET r = $properties, r.updated_at = timestamp()
+        WHERE id(source) = $source_id
+        MATCH (target:{self._GRAPH_NODE_LABEL})
+        WHERE id(target) = $target_id
+        MERGE (source)-[r:{relationship_name}]->(target)
+        ON CREATE SET r = $properties, r.updated_at = timestamp()
+        ON MATCH SET r = $properties, r.updated_at = timestamp()
         RETURN r
         """
 
@@ -565,9 +565,9 @@ class NeptuneGraphDB(GraphDBInterface):
         WHERE id(source) = edge.from_node
         MATCH (target:{self._GRAPH_NODE_LABEL})
         WHERE id(target) = edge.to_node
-        MERGE (source)-[r:{relationship_name}]->(target)
-        ON CREATE SET r = edge.properties, r.updated_at = timestamp()
-        ON MATCH SET r = edge.properties, r.updated_at = timestamp()
+        MERGE (source)-[r:{relationship_name}]->(target)
+        ON CREATE SET r = edge.properties, r.updated_at = timestamp()
+        ON MATCH SET r = edge.properties, r.updated_at = timestamp()
         RETURN count(*) AS edges_processed
         """
 
@@ -817,7 +817,7 @@ class NeptuneGraphDB(GraphDBInterface):
         query = f"""
         MATCH (n:{self._GRAPH_NODE_LABEL})-[r]-(m:{self._GRAPH_NODE_LABEL})
         WHERE id(n) = $node_id
-        RETURN
+        RETURN
             id(n) AS source_id,
             id(m) AS target_id,
             type(r) AS relationship_name,
@@ -1034,7 +1034,7 @@ class NeptuneGraphDB(GraphDBInterface):
         query = f"""
         MATCH (source:{self._GRAPH_NODE_LABEL})-[r]->(target:{self._GRAPH_NODE_LABEL})
         WHERE id(source) = $node_id OR id(target) = $node_id
-        RETURN
+        RETURN
             id(source) AS source_id,
             properties(source) AS source_props,
             id(target) AS target_id,
@@ -1284,14 +1284,14 @@ class NeptuneGraphDB(GraphDBInterface):
 
         query = f"""
         MATCH (n :{self._GRAPH_NODE_LABEL})
-        WHERE size((n)--()) = 1
+        WHERE size((n)--()) = 1
         AND n.type = $node_type
         RETURN n
         """
         result = await self.query(query, {"node_type": node_type})
         return [record["n"] for record in result] if result else []
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Retrieve a subgraph related to a document identified by its content hash, including
         related entities and chunks.
@@ -1299,7 +1299,7 @@ class NeptuneGraphDB(GraphDBInterface):
         Parameters:
         -----------
 
-            - content_hash (str): The hash identifying the document whose subgraph should be
+            - data_id (str): The document_id identifying the document whose subgraph should be
               retrieved.
 
         Returns:
@@ -1312,10 +1312,10 @@ class NeptuneGraphDB(GraphDBInterface):
         MATCH (doc)
         WHERE (doc:{self._GRAPH_NODE_LABEL})
         AND doc.type in ['TextDocument', 'PdfDocument']
-        AND doc.name = 'text_' + $content_hash
+        AND doc.id = $data_id
 
         OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk {{type: 'DocumentChunk'}})
 
         // Alternative to WHERE NOT EXISTS
         OPTIONAL MATCH (chunk)-[:contains]->(entity {{type: 'Entity'}})
         OPTIONAL MATCH (entity)<-[:contains]-(otherChunk {{type: 'DocumentChunk'}})-[:is_part_of]->(otherDoc)
@@ -1330,7 +1330,7 @@ class NeptuneGraphDB(GraphDBInterface):
         OPTIONAL MATCH (type)<-[:is_a]-(otherEntity {{type: 'Entity'}})<-[:contains]-(otherChunk {{type: 'DocumentChunk'}})-[:is_part_of]->(otherDoc)
         WHERE otherDoc.type in ['TextDocument', 'PdfDocument']
         AND otherDoc.id <> doc.id
-
+
         // Alternative to WHERE NOT EXISTS
         WITH doc, entity, chunk, made_node, type, otherDoc
         WHERE otherDoc IS NULL
@@ -1342,7 +1342,7 @@ class NeptuneGraphDB(GraphDBInterface):
         collect(DISTINCT made_node) as made_from_nodes,
         collect(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": content_hash})
+        result = await self.query(query, {"data_id": data_id})
         return result[0] if result else None
 
     async def _get_model_independent_graph_data(self):
@@ -1388,7 +1388,7 @@ class NeptuneGraphDB(GraphDBInterface):
         CALL neptune.algo.wcc(n,{{}})
         YIELD node, component
         RETURN component, count(*) AS size
-        ORDER BY size DESC
+        ORDER BY size DESC
         """
 
         result = await self.query(query)
```
```diff
@@ -826,7 +826,7 @@ class NetworkXAdapter(GraphDBInterface):
 
         return mandatory_metrics | optional_metrics
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Retrieve all relevant nodes when a document is being deleted, including chunks and
         orphaned entities.
@@ -834,7 +834,7 @@ class NetworkXAdapter(GraphDBInterface):
         Parameters:
         -----------
 
-            - content_hash (str): The hash identifying the content of the document to fetch
+            - data_id(str): The data id identifying the document to fetch
               related nodes for.
 
         Returns:
@@ -853,7 +853,7 @@ class NetworkXAdapter(GraphDBInterface):
         for node_id, attrs in self.graph.nodes(data=True):
             if (
                 attrs.get("type") in ["TextDocument", "PdfDocument"]
-                and attrs.get("name") == f"text_{content_hash}"
+                and attrs.get("id") == f"{data_id}"
             ):
                 document = {"id": str(node_id), **attrs}  # Convert UUID to string for consistency
                 document_node_id = node_id  # Keep the original UUID
```
```diff
@@ -12,7 +12,15 @@ async def main():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    text_1 = """
+    first_file = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
+    )
+
+    second_file = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
+    )
+
+    third_content = """
     1. Audi
     Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars.
 
@@ -31,27 +39,10 @@ async def main():
     Each of these car manufacturer contributes to Germany's reputation as a leader in the global automotive industry, showcasing a blend of innovation, performance, and design excellence.
     """
 
-    text_2 = """
-    1. Apple
-    Apple is renowned for its innovative consumer electronics and software. Its product lineup includes the iPhone, iPad, Mac computers, and wearables like the Apple Watch. Known for its emphasis on sleek design and user-friendly interfaces, Apple has built a loyal customer base and created a seamless ecosystem that integrates hardware, software, and services.
-
-    2. Google
-    Founded in 1998, Google started as a search engine and quickly became the go-to resource for finding information online. Over the years, the company has diversified its offerings to include digital advertising, cloud computing, mobile operating systems (Android), and various web services like Gmail and Google Maps. Google's innovations have played a major role in shaping the internet landscape.
-
-    3. Microsoft
-    Microsoft Corporation has been a dominant force in software for decades. Its Windows operating system and Microsoft Office suite are staples in both business and personal computing. In recent years, Microsoft has expanded into cloud computing with Azure, gaming with the Xbox platform, and even hardware through products like the Surface line. This evolution has helped the company maintain its relevance in a rapidly changing tech world.
-
-    4. Amazon
-    What began as an online bookstore has grown into one of the largest e-commerce platforms globally. Amazon is known for its vast online marketplace, but its influence extends far beyond retail. With Amazon Web Services (AWS), the company has become a leader in cloud computing, offering robust solutions that power websites, applications, and businesses around the world. Amazon's constant drive for innovation continues to reshape both retail and technology sectors.
-
-    5. Meta
-    Meta, originally known as Facebook, revolutionized social media by connecting billions of people worldwide. Beyond its core social networking service, Meta is investing in the next generation of digital experiences through virtual and augmented reality technologies, with projects like Oculus. The company's efforts signal a commitment to evolving digital interaction and building the metaverse—a shared virtual space where users can connect and collaborate.
-
-    Each of these companies has significantly impacted the technology landscape, driving innovation and transforming everyday life through their groundbreaking products and services.
-    """
+    ################### HARD DELETE
 
     # Add documents and get dataset information
-    add_result = await cognee.add([text_1, text_2])
+    add_result = await cognee.add([first_file, second_file, third_content])
     dataset_id = add_result.dataset_id
 
     await cognee.cognify()
@@ -72,7 +63,33 @@ async def main():
 
     nodes, edges = await graph_engine.get_graph_data()
 
-    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted."
+    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with hard delete."
+
+    ################### SOFT DELETE
+
+    # Add documents and get dataset information
+    add_result = await cognee.add([first_file, second_file, third_content])
+    dataset_id = add_result.dataset_id
+
+    await cognee.cognify()
+
+    from cognee.infrastructure.databases.graph import get_graph_engine
+
+    graph_engine = await get_graph_engine()
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) > 10 and len(edges) > 10, "Graph database is not loaded."
+
+    # Get the data IDs from the dataset
+    dataset_data = await get_dataset_data(dataset_id)
+    assert len(dataset_data) > 0, "Dataset should contain data"
+
+    # Delete each document using its ID
+    for data_item in dataset_data:
+        await cognee.delete(data_item.id, dataset_id, mode="soft")
+
+    nodes, edges = await graph_engine.get_graph_data()
+
+    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with soft delete."
 
 
 if __name__ == "__main__":
```