diff --git a/cognee/api/v1/delete/delete.py b/cognee/api/v1/delete/delete.py
index a7eb88892..98f6cb9fc 100644
--- a/cognee/api/v1/delete/delete.py
+++ b/cognee/api/v1/delete/delete.py
@@ -16,7 +16,11 @@ from cognee.modules.users.methods import get_default_user
 from cognee.modules.data.methods import get_authorized_existing_datasets
 from cognee.context_global_variables import set_database_global_context_variables
-from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
+from cognee.api.v1.delete.exceptions import (
+    DocumentNotFoundError,
+    DatasetNotFoundError,
+    DocumentSubgraphNotFoundError,
+)
 
 logger = get_logger()
 
 
@@ -82,17 +86,17 @@ async def delete(
         raise DocumentNotFoundError(f"Data {data_id} not found in dataset {dataset_id}")
 
-    # Get the content hash for deletion
-    content_hash = data_point.content_hash
+    # Get the data id for deletion
+    data_id = str(data_point.id)
 
     # Use the existing comprehensive deletion logic
-    return await delete_single_document(content_hash, dataset.id, mode)
+    return await delete_single_document(data_id, dataset.id, mode)
 
 
-async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
-    """Delete a single document by its content hash."""
+async def delete_single_document(data_id: str, dataset_id: UUID = None, mode: str = "soft"):
+    """Delete a single document by its data id."""
 
     # Delete from graph database
-    deletion_result = await delete_document_subgraph(content_hash, mode)
+    deletion_result = await delete_document_subgraph(data_id, mode)
 
     logger.info(f"Deletion result: {deletion_result}")
 
@@ -163,12 +167,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
         # Get the data point
         data_point = (
-            await session.execute(select(Data).filter(Data.content_hash == content_hash))
+            await session.execute(select(Data).filter(Data.id == UUID(data_id)))
         ).scalar_one_or_none()
 
         if data_point is None:
             raise DocumentNotFoundError(
-                f"Document not found in relational DB with content hash: {content_hash}"
+                f"Document not found in relational DB with data id: {data_id}"
             )
 
         doc_id = data_point.id
 
@@ -203,7 +207,7 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
         "status": "success",
         "message": "Document deleted from both graph and relational databases",
         "graph_deletions": deletion_result["deleted_counts"],
-        "content_hash": content_hash,
+        "data_id": data_id,
         "dataset": dataset_id,
         "deleted_node_ids": [
             str(node_id) for node_id in deleted_node_ids
@@ -211,12 +215,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
     }
 
 
-async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
+async def delete_document_subgraph(document_id: str, mode: str = "soft"):
     """Delete a document and all its related nodes in the correct order."""
     graph_db = await get_graph_engine()
-    subgraph = await graph_db.get_document_subgraph(content_hash)
+    subgraph = await graph_db.get_document_subgraph(document_id)
     if not subgraph:
-        raise DocumentSubgraphNotFoundError(f"Document not found with content hash: {content_hash}")
+        raise DocumentSubgraphNotFoundError(f"Document not found with id: {document_id}")
 
     # Delete in the correct order to maintain graph integrity
     deletion_order = [
@@ -260,6 +264,6 @@
     return {
         "status": "success",
        "deleted_counts": deleted_counts,
-        "content_hash": content_hash,
+        "document_id": document_id,
         "deleted_node_ids": deleted_node_ids,
     }
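
Reviewer note: with this patch the public delete path is keyed by the relational Data.id (a UUID) rather than a content hash. A minimal sketch of the new call path, mirroring the updated test at the end of this diff; the get_dataset_data import path is an assumption, since the diff only shows the call site:

    import asyncio

    import cognee
    from cognee.modules.data.methods import get_dataset_data  # assumed import path; the diff only shows the call

    async def demo():
        # Ingest and build the graph, as the updated test does.
        add_result = await cognee.add(["A short throwaway document."])
        await cognee.cognify()

        # Each Data row's UUID now identifies its document for deletion.
        for data_item in await get_dataset_data(add_result.dataset_id):
            await cognee.delete(data_item.id, add_result.dataset_id, mode="hard")

    asyncio.run(demo())
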
diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py
index 4775e4b48..4262178be 100644
--- a/cognee/infrastructure/databases/graph/kuzu/adapter.py
+++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py
@@ -1524,7 +1524,7 @@ class KuzuAdapter(GraphDBInterface):
             logger.error(f"Error during database clearing: {e}")
             raise
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Get all nodes that should be deleted when removing a document.
 
@@ -1535,7 +1535,7 @@ class KuzuAdapter(GraphDBInterface):
 
         Parameters:
         -----------
-            - content_hash (str): The identifier for the document to query against.
+            - data_id (str): The identifier for the document to query against.
 
         Returns:
         --------
@@ -1545,7 +1545,7 @@ class KuzuAdapter(GraphDBInterface):
         """
         query = """
         MATCH (doc:Node)
-        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.name = $content_hash
+        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.id = $data_id
         OPTIONAL MATCH (doc)<-[e1:EDGE]-(chunk:Node)
         WHERE e1.relationship_name = 'is_part_of' AND chunk.type = 'DocumentChunk'
 
@@ -1583,7 +1583,7 @@ class KuzuAdapter(GraphDBInterface):
             COLLECT(DISTINCT made_node) as made_from_nodes,
             COLLECT(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": f"text_{content_hash}"})
+        result = await self.query(query, {"data_id": data_id})
 
         if not result or not result[0]:
             return None
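
Reviewer note: the Kuzu query now binds doc.id to the raw data id, where it previously matched doc.name against a 'text_'-prefixed content hash. A hedged sketch for exercising the renamed method directly, assuming the configured engine is one of the adapters touched by this diff:

    from cognee.infrastructure.databases.graph import get_graph_engine

    async def inspect_subgraph(data_id: str):
        graph_engine = await get_graph_engine()
        # data_id is the stringified Data.id; no 'text_' prefix is applied anymore.
        subgraph = await graph_engine.get_document_subgraph(data_id)
        if subgraph is None:
            print("No document node carries that id.")
        else:
            print(subgraph)  # collected document/chunk/orphan fields from the query
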
diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
index ee2e5b18e..7091c32e1 100644
--- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
+++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@@ -1252,7 +1252,7 @@ class Neo4jAdapter(GraphDBInterface):
 
         return mandatory_metrics | optional_metrics
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
-        Retrieve a subgraph related to a document identified by its content hash, including
+        Retrieve a subgraph related to a document identified by its data id, including
         related entities and chunks.
@@ -1271,7 +1271,7 @@ class Neo4jAdapter(GraphDBInterface):
         query = """
         MATCH (doc)
         WHERE (doc:TextDocument OR doc:PdfDocument)
-        AND doc.name = 'text_' + $content_hash
+        AND doc.id = $data_id
         OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk)
         OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity)
 
@@ -1295,7 +1295,7 @@ class Neo4jAdapter(GraphDBInterface):
             collect(DISTINCT made_node) as made_from_nodes,
             collect(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": content_hash})
+        result = await self.query(query, {"data_id": data_id})
         return result[0] if result else None
 
     async def get_degree_one_nodes(self, node_type: str):
diff --git a/cognee/infrastructure/databases/graph/neptune_driver/adapter.py b/cognee/infrastructure/databases/graph/neptune_driver/adapter.py
index 02c297623..a2bc589af 100644
--- a/cognee/infrastructure/databases/graph/neptune_driver/adapter.py
+++ b/cognee/infrastructure/databases/graph/neptune_driver/adapter.py
@@ -510,12 +510,12 @@ class NeptuneGraphDB(GraphDBInterface):
 
         query = f"""
         MATCH (source:{self._GRAPH_NODE_LABEL})
-        WHERE id(source) = $source_id 
-        MATCH (target:{self._GRAPH_NODE_LABEL}) 
-        WHERE id(target) = $target_id 
-        MERGE (source)-[r:{relationship_name}]->(target) 
-        ON CREATE SET r = $properties, r.updated_at = timestamp() 
-        ON MATCH SET r = $properties, r.updated_at = timestamp() 
+        WHERE id(source) = $source_id
+        MATCH (target:{self._GRAPH_NODE_LABEL})
+        WHERE id(target) = $target_id
+        MERGE (source)-[r:{relationship_name}]->(target)
+        ON CREATE SET r = $properties, r.updated_at = timestamp()
+        ON MATCH SET r = $properties, r.updated_at = timestamp()
         RETURN r
         """
 
@@ -565,9 +565,9 @@ class NeptuneGraphDB(GraphDBInterface):
             WHERE id(source) = edge.from_node
             MATCH (target:{self._GRAPH_NODE_LABEL})
             WHERE id(target) = edge.to_node
-            MERGE (source)-[r:{relationship_name}]->(target) 
-            ON CREATE SET r = edge.properties, r.updated_at = timestamp() 
-            ON MATCH SET r = edge.properties, r.updated_at = timestamp() 
+            MERGE (source)-[r:{relationship_name}]->(target)
+            ON CREATE SET r = edge.properties, r.updated_at = timestamp()
+            ON MATCH SET r = edge.properties, r.updated_at = timestamp()
 
             RETURN count(*) AS edges_processed
         """
@@ -817,7 +817,7 @@ class NeptuneGraphDB(GraphDBInterface):
         query = f"""
         MATCH (n:{self._GRAPH_NODE_LABEL})-[r]-(m:{self._GRAPH_NODE_LABEL})
         WHERE id(n) = $node_id
-        RETURN 
+        RETURN
             id(n) AS source_id,
             id(m) AS target_id,
             type(r) AS relationship_name,
@@ -1034,7 +1034,7 @@ class NeptuneGraphDB(GraphDBInterface):
         query = f"""
         MATCH (source:{self._GRAPH_NODE_LABEL})-[r]->(target:{self._GRAPH_NODE_LABEL})
         WHERE id(source) = $node_id OR id(target) = $node_id
-        RETURN 
+        RETURN
             id(source) AS source_id,
             properties(source) AS source_props,
             id(target) AS target_id,
@@ -1284,14 +1284,14 @@ class NeptuneGraphDB(GraphDBInterface):
 
         query = f"""
         MATCH (n :{self._GRAPH_NODE_LABEL})
-        WHERE size((n)--()) = 1 
+        WHERE size((n)--()) = 1
         AND n.type = $node_type
         RETURN n
         """
         result = await self.query(query, {"node_type": node_type})
         return [record["n"] for record in result] if result else []
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
-        Retrieve a subgraph related to a document identified by its content hash, including
+        Retrieve a subgraph related to a document identified by its data id, including
         related entities and chunks.
@@ -1299,7 +1299,7 @@ class NeptuneGraphDB(GraphDBInterface):
 
         Parameters:
         -----------
-            - content_hash (str): The hash identifying the document whose subgraph should be
+            - data_id (str): The data id identifying the document whose subgraph should be
               retrieved.
 
         Returns:
@@ -1312,10 +1312,10 @@ class NeptuneGraphDB(GraphDBInterface):
         MATCH (doc)
         WHERE (doc:{self._GRAPH_NODE_LABEL})
         AND doc.type in ['TextDocument', 'PdfDocument']
-        AND doc.name = 'text_' + $content_hash
+        AND doc.id = $data_id
         OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk {{type: 'DocumentChunk'}})
-        
+
         // Alternative to WHERE NOT EXISTS
         OPTIONAL MATCH (chunk)-[:contains]->(entity {{type: 'Entity'}})
         OPTIONAL MATCH (entity)<-[:contains]-(otherChunk {{type: 'DocumentChunk'}})-[:is_part_of]->(otherDoc)
 
@@ -1330,7 +1330,7 @@ class NeptuneGraphDB(GraphDBInterface):
         OPTIONAL MATCH (type)<-[:is_a]-(otherEntity {{type: 'Entity'}})<-[:contains]-(otherChunk {{type: 'DocumentChunk'}})-[:is_part_of]->(otherDoc)
         WHERE otherDoc.type in ['TextDocument', 'PdfDocument'] AND otherDoc.id <> doc.id
-        
+
         // Alternative to WHERE NOT EXISTS
         WITH doc, entity, chunk, made_node, type, otherDoc
         WHERE otherDoc IS NULL
 
@@ -1342,7 +1342,7 @@ class NeptuneGraphDB(GraphDBInterface):
             collect(DISTINCT made_node) as made_from_nodes,
             collect(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": content_hash})
+        result = await self.query(query, {"data_id": data_id})
         return result[0] if result else None
 
     async def _get_model_independent_graph_data(self):
@@ -1388,7 +1388,7 @@ class NeptuneGraphDB(GraphDBInterface):
         CALL neptune.algo.wcc(n,{{}})
         YIELD node, component
         RETURN component, count(*) AS size
-        ORDER BY size DESC 
+        ORDER BY size DESC
         """
 
         result = await self.query(query)
diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py
index 7431b3fc8..da9b506a0 100644
--- a/cognee/infrastructure/databases/graph/networkx/adapter.py
+++ b/cognee/infrastructure/databases/graph/networkx/adapter.py
@@ -826,7 +826,7 @@ class NetworkXAdapter(GraphDBInterface):
 
         return mandatory_metrics | optional_metrics
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Retrieve all relevant nodes when a document is being deleted, including chunks and
         orphaned entities.
@@ -834,7 +834,7 @@ class NetworkXAdapter(GraphDBInterface):
 
         Parameters:
         -----------
-        - content_hash (str): The hash identifying the content of the document to fetch
+        - data_id (str): The data id identifying the document to fetch
           related nodes for.
 
         Returns:
@@ -853,7 +853,7 @@ class NetworkXAdapter(GraphDBInterface):
         for node_id, attrs in self.graph.nodes(data=True):
             if (
                 attrs.get("type") in ["TextDocument", "PdfDocument"]
-                and attrs.get("name") == f"text_{content_hash}"
+                and attrs.get("id") == str(data_id)
             ):
                 document = {"id": str(node_id), **attrs}  # Convert UUID to string for consistency
                 document_node_id = node_id  # Keep the original UUID
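
Reviewer note: all four adapters now key the document node on its id attribute rather than a hashed name. The NetworkX matching loop above can be reproduced standalone as a sanity check; a sketch, with str() applied on both sides as a defensive assumption in case either value is a UUID object:

    def find_document_node(graph, data_id):
        # Mirror of the adapter's match: a TextDocument/PdfDocument node whose 'id' attribute equals data_id.
        for node_id, attrs in graph.nodes(data=True):
            if attrs.get("type") in ["TextDocument", "PdfDocument"] and str(attrs.get("id")) == str(data_id):
                return {"id": str(node_id), **attrs}
        return None
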
diff --git a/cognee/tests/test_deletion.py b/cognee/tests/test_deletion.py
index cc0c01c7a..54c69ad12 100644
--- a/cognee/tests/test_deletion.py
+++ b/cognee/tests/test_deletion.py
@@ -12,7 +12,15 @@ async def main():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    text_1 = """
+    first_file = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
+    )
+
+    second_file = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
+    )
+
+    third_content = """
     1. Audi
     Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars.
 
@@ -31,27 +39,10 @@ async def main():
     Each of these car manufacturer contributes to Germany's reputation as a leader in the global automotive industry, showcasing a blend of innovation, performance, and design excellence.
     """
 
-    text_2 = """
-    1. Apple
-    Apple is renowned for its innovative consumer electronics and software. Its product lineup includes the iPhone, iPad, Mac computers, and wearables like the Apple Watch. Known for its emphasis on sleek design and user-friendly interfaces, Apple has built a loyal customer base and created a seamless ecosystem that integrates hardware, software, and services.
-
-    2. Google
-    Founded in 1998, Google started as a search engine and quickly became the go-to resource for finding information online. Over the years, the company has diversified its offerings to include digital advertising, cloud computing, mobile operating systems (Android), and various web services like Gmail and Google Maps. Google's innovations have played a major role in shaping the internet landscape.
-
-    3. Microsoft
-    Microsoft Corporation has been a dominant force in software for decades. Its Windows operating system and Microsoft Office suite are staples in both business and personal computing. In recent years, Microsoft has expanded into cloud computing with Azure, gaming with the Xbox platform, and even hardware through products like the Surface line. This evolution has helped the company maintain its relevance in a rapidly changing tech world.
-
-    4. Amazon
-    What began as an online bookstore has grown into one of the largest e-commerce platforms globally. Amazon is known for its vast online marketplace, but its influence extends far beyond retail. With Amazon Web Services (AWS), the company has become a leader in cloud computing, offering robust solutions that power websites, applications, and businesses around the world. Amazon's constant drive for innovation continues to reshape both retail and technology sectors.
-
-    5. Meta
-    Meta, originally known as Facebook, revolutionized social media by connecting billions of people worldwide. Beyond its core social networking service, Meta is investing in the next generation of digital experiences through virtual and augmented reality technologies, with projects like Oculus. The company's efforts signal a commitment to evolving digital interaction and building the metaverse—a shared virtual space where users can connect and collaborate.
-
-    Each of these companies has significantly impacted the technology landscape, driving innovation and transforming everyday life through their groundbreaking products and services.
-    """
+    ################### HARD DELETE
 
     # Add documents and get dataset information
-    add_result = await cognee.add([text_1, text_2])
+    add_result = await cognee.add([first_file, second_file, third_content])
     dataset_id = add_result.dataset_id
 
     await cognee.cognify()
 
@@ -72,7 +63,33 @@ async def main():
 
     nodes, edges = await graph_engine.get_graph_data()
 
-    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted."
+    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with hard delete."
+
+    ################### SOFT DELETE
+
+    # Add documents and get dataset information
+    add_result = await cognee.add([first_file, second_file, third_content])
+    dataset_id = add_result.dataset_id
+
+    await cognee.cognify()
+
+    from cognee.infrastructure.databases.graph import get_graph_engine
+
+    graph_engine = await get_graph_engine()
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) > 10 and len(edges) > 10, "Graph database is not loaded."
+
+    # Get the data IDs from the dataset
+    dataset_data = await get_dataset_data(dataset_id)
+    assert len(dataset_data) > 0, "Dataset should contain data"
+
+    # Delete each document using its ID
+    for data_item in dataset_data:
+        await cognee.delete(data_item.id, dataset_id, mode="soft")
+
+    nodes, edges = await graph_engine.get_graph_data()
+
+    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with soft delete."
 
 
 if __name__ == "__main__":
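
Reviewer note: the updated test covers both modes end to end and keeps its __main__ guard, so it can also be driven from a small script; a sketch, assuming cognee is installed from source so the test module is importable and the two test_data files exist locally:

    import asyncio

    from cognee.tests.test_deletion import main  # module path taken from the diff header

    asyncio.run(main())
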