fix: changing deletion logic to use document id instead of content hash (#1210)

<!-- .github/pull_request_template.md -->

## Description
Changing deletion logic to use document id instead of content hash

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
hajdul88 2025-08-06 16:54:35 +02:00 committed by GitHub
parent a9e74dac42
commit 4e816ad80b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 83 additions and 62 deletions

View file

@ -16,7 +16,11 @@ from cognee.modules.users.methods import get_default_user
from cognee.modules.data.methods import get_authorized_existing_datasets
from cognee.context_global_variables import set_database_global_context_variables
from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
from cognee.api.v1.delete.exceptions import (
DocumentNotFoundError,
DatasetNotFoundError,
DocumentSubgraphNotFoundError,
)
logger = get_logger()
@ -82,17 +86,17 @@ async def delete(
raise DocumentNotFoundError(f"Data {data_id} not found in dataset {dataset_id}")
# Get the document id for deletion
content_hash = data_point.content_hash
data_id = str(data_point.id)
# Use the existing comprehensive deletion logic
return await delete_single_document(content_hash, dataset.id, mode)
return await delete_single_document(data_id, dataset.id, mode)
async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
async def delete_single_document(data_id: str, dataset_id: UUID = None, mode: str = "soft"):
"""Delete a single document by its content hash."""
# Delete from graph database
deletion_result = await delete_document_subgraph(content_hash, mode)
deletion_result = await delete_document_subgraph(data_id, mode)
logger.info(f"Deletion result: {deletion_result}")
@ -163,12 +167,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
# Get the data point
data_point = (
await session.execute(select(Data).filter(Data.content_hash == content_hash))
await session.execute(select(Data).filter(Data.id == UUID(data_id)))
).scalar_one_or_none()
if data_point is None:
raise DocumentNotFoundError(
f"Document not found in relational DB with content hash: {content_hash}"
f"Document not found in relational DB with data id: {data_id}"
)
doc_id = data_point.id
@ -203,7 +207,7 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
"status": "success",
"message": "Document deleted from both graph and relational databases",
"graph_deletions": deletion_result["deleted_counts"],
"content_hash": content_hash,
"data_id": data_id,
"dataset": dataset_id,
"deleted_node_ids": [
str(node_id) for node_id in deleted_node_ids
@ -211,12 +215,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
}
async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
async def delete_document_subgraph(document_id: str, mode: str = "soft"):
"""Delete a document and all its related nodes in the correct order."""
graph_db = await get_graph_engine()
subgraph = await graph_db.get_document_subgraph(content_hash)
subgraph = await graph_db.get_document_subgraph(document_id)
if not subgraph:
raise DocumentSubgraphNotFoundError(f"Document not found with content hash: {content_hash}")
raise DocumentSubgraphNotFoundError(f"Document not found with id: {document_id}")
# Delete in the correct order to maintain graph integrity
deletion_order = [
@ -260,6 +264,6 @@ async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
return {
"status": "success",
"deleted_counts": deleted_counts,
"content_hash": content_hash,
"document_id": document_id,
"deleted_node_ids": deleted_node_ids,
}

View file

@ -1524,7 +1524,7 @@ class KuzuAdapter(GraphDBInterface):
logger.error(f"Error during database clearing: {e}")
raise
async def get_document_subgraph(self, content_hash: str):
async def get_document_subgraph(self, data_id: str):
"""
Get all nodes that should be deleted when removing a document.
@ -1535,7 +1535,7 @@ class KuzuAdapter(GraphDBInterface):
Parameters:
-----------
- content_hash (str): The identifier for the document to query against.
- data_id (str): The identifier for the document to query against.
Returns:
--------
@ -1545,7 +1545,7 @@ class KuzuAdapter(GraphDBInterface):
"""
query = """
MATCH (doc:Node)
WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.name = $content_hash
WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.id = $data_id
OPTIONAL MATCH (doc)<-[e1:EDGE]-(chunk:Node)
WHERE e1.relationship_name = 'is_part_of' AND chunk.type = 'DocumentChunk'
@ -1583,7 +1583,7 @@ class KuzuAdapter(GraphDBInterface):
COLLECT(DISTINCT made_node) as made_from_nodes,
COLLECT(DISTINCT type) as orphan_types
"""
result = await self.query(query, {"content_hash": f"text_{content_hash}"})
result = await self.query(query, {"data_id": f"{data_id}"})
if not result or not result[0]:
return None

View file

@ -1252,7 +1252,7 @@ class Neo4jAdapter(GraphDBInterface):
return mandatory_metrics | optional_metrics
async def get_document_subgraph(self, content_hash: str):
async def get_document_subgraph(self, data_id: str):
"""
Retrieve a subgraph related to a document identified by its document id, including
related entities and chunks.
@ -1271,7 +1271,7 @@ class Neo4jAdapter(GraphDBInterface):
query = """
MATCH (doc)
WHERE (doc:TextDocument OR doc:PdfDocument)
AND doc.name = 'text_' + $content_hash
AND doc.id = $data_id
OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk)
OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity)
@ -1295,7 +1295,7 @@ class Neo4jAdapter(GraphDBInterface):
collect(DISTINCT made_node) as made_from_nodes,
collect(DISTINCT type) as orphan_types
"""
result = await self.query(query, {"content_hash": content_hash})
result = await self.query(query, {"data_id": data_id})
return result[0] if result else None
async def get_degree_one_nodes(self, node_type: str):

View file

@ -510,12 +510,12 @@ class NeptuneGraphDB(GraphDBInterface):
query = f"""
MATCH (source:{self._GRAPH_NODE_LABEL})
WHERE id(source) = $source_id
MATCH (target:{self._GRAPH_NODE_LABEL})
WHERE id(target) = $target_id
MERGE (source)-[r:{relationship_name}]->(target)
ON CREATE SET r = $properties, r.updated_at = timestamp()
ON MATCH SET r = $properties, r.updated_at = timestamp()
WHERE id(source) = $source_id
MATCH (target:{self._GRAPH_NODE_LABEL})
WHERE id(target) = $target_id
MERGE (source)-[r:{relationship_name}]->(target)
ON CREATE SET r = $properties, r.updated_at = timestamp()
ON MATCH SET r = $properties, r.updated_at = timestamp()
RETURN r
"""
@ -565,9 +565,9 @@ class NeptuneGraphDB(GraphDBInterface):
WHERE id(source) = edge.from_node
MATCH (target:{self._GRAPH_NODE_LABEL})
WHERE id(target) = edge.to_node
MERGE (source)-[r:{relationship_name}]->(target)
ON CREATE SET r = edge.properties, r.updated_at = timestamp()
ON MATCH SET r = edge.properties, r.updated_at = timestamp()
MERGE (source)-[r:{relationship_name}]->(target)
ON CREATE SET r = edge.properties, r.updated_at = timestamp()
ON MATCH SET r = edge.properties, r.updated_at = timestamp()
RETURN count(*) AS edges_processed
"""
@ -817,7 +817,7 @@ class NeptuneGraphDB(GraphDBInterface):
query = f"""
MATCH (n:{self._GRAPH_NODE_LABEL})-[r]-(m:{self._GRAPH_NODE_LABEL})
WHERE id(n) = $node_id
RETURN
RETURN
id(n) AS source_id,
id(m) AS target_id,
type(r) AS relationship_name,
@ -1034,7 +1034,7 @@ class NeptuneGraphDB(GraphDBInterface):
query = f"""
MATCH (source:{self._GRAPH_NODE_LABEL})-[r]->(target:{self._GRAPH_NODE_LABEL})
WHERE id(source) = $node_id OR id(target) = $node_id
RETURN
RETURN
id(source) AS source_id,
properties(source) AS source_props,
id(target) AS target_id,
@ -1284,14 +1284,14 @@ class NeptuneGraphDB(GraphDBInterface):
query = f"""
MATCH (n :{self._GRAPH_NODE_LABEL})
WHERE size((n)--()) = 1
WHERE size((n)--()) = 1
AND n.type = $node_type
RETURN n
"""
result = await self.query(query, {"node_type": node_type})
return [record["n"] for record in result] if result else []
async def get_document_subgraph(self, content_hash: str):
async def get_document_subgraph(self, data_id: str):
"""
Retrieve a subgraph related to a document identified by its document id, including
related entities and chunks.
@ -1299,7 +1299,7 @@ class NeptuneGraphDB(GraphDBInterface):
Parameters:
-----------
- content_hash (str): The hash identifying the document whose subgraph should be
- data_id (str): The document id identifying the document whose subgraph should be
retrieved.
Returns:
@ -1312,10 +1312,10 @@ class NeptuneGraphDB(GraphDBInterface):
MATCH (doc)
WHERE (doc:{self._GRAPH_NODE_LABEL})
AND doc.type in ['TextDocument', 'PdfDocument']
AND doc.name = 'text_' + $content_hash
AND doc.id = $data_id
OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk {{type: 'DocumentChunk'}})
// Alternative to WHERE NOT EXISTS
OPTIONAL MATCH (chunk)-[:contains]->(entity {{type: 'Entity'}})
OPTIONAL MATCH (entity)<-[:contains]-(otherChunk {{type: 'DocumentChunk'}})-[:is_part_of]->(otherDoc)
@ -1330,7 +1330,7 @@ class NeptuneGraphDB(GraphDBInterface):
OPTIONAL MATCH (type)<-[:is_a]-(otherEntity {{type: 'Entity'}})<-[:contains]-(otherChunk {{type: 'DocumentChunk'}})-[:is_part_of]->(otherDoc)
WHERE otherDoc.type in ['TextDocument', 'PdfDocument']
AND otherDoc.id <> doc.id
// Alternative to WHERE NOT EXISTS
WITH doc, entity, chunk, made_node, type, otherDoc
WHERE otherDoc IS NULL
@ -1342,7 +1342,7 @@ class NeptuneGraphDB(GraphDBInterface):
collect(DISTINCT made_node) as made_from_nodes,
collect(DISTINCT type) as orphan_types
"""
result = await self.query(query, {"content_hash": content_hash})
result = await self.query(query, {"data_id": data_id})
return result[0] if result else None
async def _get_model_independent_graph_data(self):
@ -1388,7 +1388,7 @@ class NeptuneGraphDB(GraphDBInterface):
CALL neptune.algo.wcc(n,{{}})
YIELD node, component
RETURN component, count(*) AS size
ORDER BY size DESC
ORDER BY size DESC
"""
result = await self.query(query)

View file

@ -826,7 +826,7 @@ class NetworkXAdapter(GraphDBInterface):
return mandatory_metrics | optional_metrics
async def get_document_subgraph(self, content_hash: str):
async def get_document_subgraph(self, data_id: str):
"""
Retrieve all relevant nodes when a document is being deleted, including chunks and
orphaned entities.
@ -834,7 +834,7 @@ class NetworkXAdapter(GraphDBInterface):
Parameters:
-----------
- content_hash (str): The hash identifying the content of the document to fetch
- data_id (str): The document id identifying the document to fetch
related nodes for.
Returns:
@ -853,7 +853,7 @@ class NetworkXAdapter(GraphDBInterface):
for node_id, attrs in self.graph.nodes(data=True):
if (
attrs.get("type") in ["TextDocument", "PdfDocument"]
and attrs.get("name") == f"text_{content_hash}"
and attrs.get("id") == f"{data_id}"
):
document = {"id": str(node_id), **attrs} # Convert UUID to string for consistency
document_node_id = node_id # Keep the original UUID

View file

@ -12,7 +12,15 @@ async def main():
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
text_1 = """
first_file = os.path.join(
pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
)
second_file = os.path.join(
pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
)
third_content = """
1. Audi
Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars.
@ -31,27 +39,10 @@ async def main():
Each of these car manufacturer contributes to Germany's reputation as a leader in the global automotive industry, showcasing a blend of innovation, performance, and design excellence.
"""
text_2 = """
1. Apple
Apple is renowned for its innovative consumer electronics and software. Its product lineup includes the iPhone, iPad, Mac computers, and wearables like the Apple Watch. Known for its emphasis on sleek design and user-friendly interfaces, Apple has built a loyal customer base and created a seamless ecosystem that integrates hardware, software, and services.
2. Google
Founded in 1998, Google started as a search engine and quickly became the go-to resource for finding information online. Over the years, the company has diversified its offerings to include digital advertising, cloud computing, mobile operating systems (Android), and various web services like Gmail and Google Maps. Google's innovations have played a major role in shaping the internet landscape.
3. Microsoft
Microsoft Corporation has been a dominant force in software for decades. Its Windows operating system and Microsoft Office suite are staples in both business and personal computing. In recent years, Microsoft has expanded into cloud computing with Azure, gaming with the Xbox platform, and even hardware through products like the Surface line. This evolution has helped the company maintain its relevance in a rapidly changing tech world.
4. Amazon
What began as an online bookstore has grown into one of the largest e-commerce platforms globally. Amazon is known for its vast online marketplace, but its influence extends far beyond retail. With Amazon Web Services (AWS), the company has become a leader in cloud computing, offering robust solutions that power websites, applications, and businesses around the world. Amazon's constant drive for innovation continues to reshape both retail and technology sectors.
5. Meta
Meta, originally known as Facebook, revolutionized social media by connecting billions of people worldwide. Beyond its core social networking service, Meta is investing in the next generation of digital experiences through virtual and augmented reality technologies, with projects like Oculus. The company's efforts signal a commitment to evolving digital interaction and building the metaverse—a shared virtual space where users can connect and collaborate.
Each of these companies has significantly impacted the technology landscape, driving innovation and transforming everyday life through their groundbreaking products and services.
"""
################### HARD DELETE
# Add documents and get dataset information
add_result = await cognee.add([text_1, text_2])
add_result = await cognee.add([first_file, second_file, third_content])
dataset_id = add_result.dataset_id
await cognee.cognify()
@ -72,7 +63,33 @@ async def main():
nodes, edges = await graph_engine.get_graph_data()
assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted."
assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with hard delete."
################### SOFT DELETE
# Add documents and get dataset information
add_result = await cognee.add([first_file, second_file, third_content])
dataset_id = add_result.dataset_id
await cognee.cognify()
from cognee.infrastructure.databases.graph import get_graph_engine
graph_engine = await get_graph_engine()
nodes, edges = await graph_engine.get_graph_data()
assert len(nodes) > 10 and len(edges) > 10, "Graph database is not loaded."
# Get the data IDs from the dataset
dataset_data = await get_dataset_data(dataset_id)
assert len(dataset_data) > 0, "Dataset should contain data"
# Delete each document using its ID
for data_item in dataset_data:
await cognee.delete(data_item.id, dataset_id, mode="soft")
nodes, edges = await graph_engine.get_graph_data()
assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with soft delete."
if __name__ == "__main__":