fix: changing deletion logic to use document id instead of content hash (#1210)
## Description

Changes the deletion logic to use the document id instead of the content hash.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
Parent: a9e74dac42
Commit: 4e816ad80b

6 changed files with 83 additions and 62 deletions
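For context, here is a minimal sketch of how deletion is invoked after this change, mirroring the updated test further down: documents are added, cognified, and then deleted by their data id rather than by content hash. The import path for `get_dataset_data` is an assumption, since the test's import lines are not part of this diff.

```python
import asyncio

import cognee

# Assumed import path; the updated test calls get_dataset_data(dataset_id),
# but its import lines are not shown in this diff.
from cognee.modules.data.methods import get_dataset_data


async def delete_by_document_id():
    # Ingest some content and build the knowledge graph.
    add_result = await cognee.add(["Some text to ingest."])
    dataset_id = add_result.dataset_id
    await cognee.cognify()

    # Deletion is now keyed by the document's data id, not its content hash.
    dataset_data = await get_dataset_data(dataset_id)
    for data_item in dataset_data:
        await cognee.delete(data_item.id, dataset_id, mode="soft")


if __name__ == "__main__":
    asyncio.run(delete_by_document_id())
```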
```diff
@@ -16,7 +16,11 @@ from cognee.modules.users.methods import get_default_user
 from cognee.modules.data.methods import get_authorized_existing_datasets
 from cognee.context_global_variables import set_database_global_context_variables
 
-from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
+from cognee.api.v1.delete.exceptions import (
+    DocumentNotFoundError,
+    DatasetNotFoundError,
+    DocumentSubgraphNotFoundError,
+)
 
 logger = get_logger()
 
@@ -82,17 +86,17 @@ async def delete(
         raise DocumentNotFoundError(f"Data {data_id} not found in dataset {dataset_id}")
 
     # Get the content hash for deletion
-    content_hash = data_point.content_hash
+    data_id = str(data_point.id)
 
     # Use the existing comprehensive deletion logic
-    return await delete_single_document(content_hash, dataset.id, mode)
+    return await delete_single_document(data_id, dataset.id, mode)
 
 
-async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
+async def delete_single_document(data_id: str, dataset_id: UUID = None, mode: str = "soft"):
     """Delete a single document by its content hash."""
 
     # Delete from graph database
-    deletion_result = await delete_document_subgraph(content_hash, mode)
+    deletion_result = await delete_document_subgraph(data_id, mode)
 
     logger.info(f"Deletion result: {deletion_result}")
 
@@ -163,12 +167,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
         # Get the data point
         data_point = (
-            await session.execute(select(Data).filter(Data.content_hash == content_hash))
+            await session.execute(select(Data).filter(Data.id == UUID(data_id)))
         ).scalar_one_or_none()
 
         if data_point is None:
             raise DocumentNotFoundError(
-                f"Document not found in relational DB with content hash: {content_hash}"
+                f"Document not found in relational DB with data id: {data_id}"
             )
 
         doc_id = data_point.id
 
@@ -203,7 +207,7 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
             "status": "success",
             "message": "Document deleted from both graph and relational databases",
             "graph_deletions": deletion_result["deleted_counts"],
-            "content_hash": content_hash,
+            "data_id": data_id,
             "dataset": dataset_id,
             "deleted_node_ids": [
                 str(node_id) for node_id in deleted_node_ids
@@ -211,12 +215,12 @@ async def delete_single_document(content_hash: str, dataset_id: UUID = None, mod
         }
 
 
-async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
+async def delete_document_subgraph(document_id: str, mode: str = "soft"):
     """Delete a document and all its related nodes in the correct order."""
     graph_db = await get_graph_engine()
-    subgraph = await graph_db.get_document_subgraph(content_hash)
+    subgraph = await graph_db.get_document_subgraph(document_id)
     if not subgraph:
-        raise DocumentSubgraphNotFoundError(f"Document not found with content hash: {content_hash}")
+        raise DocumentSubgraphNotFoundError(f"Document not found with id: {document_id}")
 
     # Delete in the correct order to maintain graph integrity
     deletion_order = [
@@ -260,6 +264,6 @@ async def delete_document_subgraph(content_hash: str, mode: str = "soft"):
     return {
         "status": "success",
         "deleted_counts": deleted_counts,
-        "content_hash": content_hash,
+        "document_id": document_id,
         "deleted_node_ids": deleted_node_ids,
     }
```
```diff
@@ -1524,7 +1524,7 @@ class KuzuAdapter(GraphDBInterface):
             logger.error(f"Error during database clearing: {e}")
             raise
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Get all nodes that should be deleted when removing a document.
 
@@ -1535,7 +1535,7 @@ class KuzuAdapter(GraphDBInterface):
         Parameters:
         -----------
 
-            - content_hash (str): The identifier for the document to query against.
+            - data_id (str): The identifier for the document to query against.
 
         Returns:
         --------
@@ -1545,7 +1545,7 @@ class KuzuAdapter(GraphDBInterface):
         """
         query = """
         MATCH (doc:Node)
-        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.name = $content_hash
+        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.id = $data_id
 
         OPTIONAL MATCH (doc)<-[e1:EDGE]-(chunk:Node)
         WHERE e1.relationship_name = 'is_part_of' AND chunk.type = 'DocumentChunk'
@@ -1583,7 +1583,7 @@ class KuzuAdapter(GraphDBInterface):
         COLLECT(DISTINCT made_node) as made_from_nodes,
         COLLECT(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": f"text_{content_hash}"})
+        result = await self.query(query, {"data_id": f"{data_id}"})
         if not result or not result[0]:
             return None
 
```
```diff
@@ -1252,7 +1252,7 @@ class Neo4jAdapter(GraphDBInterface):
 
         return mandatory_metrics | optional_metrics
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Retrieve a subgraph related to a document identified by its content hash, including
         related entities and chunks.
@@ -1271,7 +1271,7 @@ class Neo4jAdapter(GraphDBInterface):
         query = """
         MATCH (doc)
         WHERE (doc:TextDocument OR doc:PdfDocument)
-        AND doc.name = 'text_' + $content_hash
+        AND doc.id = $data_id
 
         OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk)
         OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity)
@@ -1295,7 +1295,7 @@ class Neo4jAdapter(GraphDBInterface):
         collect(DISTINCT made_node) as made_from_nodes,
         collect(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": content_hash})
+        result = await self.query(query, {"data_id": data_id})
         return result[0] if result else None
 
     async def get_degree_one_nodes(self, node_type: str):
```
```diff
@@ -510,12 +510,12 @@ class NeptuneGraphDB(GraphDBInterface):
 
         query = f"""
         MATCH (source:{self._GRAPH_NODE_LABEL})
-        WHERE id(source) = $source_id
-        MATCH (target:{self._GRAPH_NODE_LABEL})
-        WHERE id(target) = $target_id
-        MERGE (source)-[r:{relationship_name}]->(target)
-        ON CREATE SET r = $properties, r.updated_at = timestamp()
-        ON MATCH SET r = $properties, r.updated_at = timestamp()
+        WHERE id(source) = $source_id
+        MATCH (target:{self._GRAPH_NODE_LABEL})
+        WHERE id(target) = $target_id
+        MERGE (source)-[r:{relationship_name}]->(target)
+        ON CREATE SET r = $properties, r.updated_at = timestamp()
+        ON MATCH SET r = $properties, r.updated_at = timestamp()
         RETURN r
         """
 
@@ -565,9 +565,9 @@ class NeptuneGraphDB(GraphDBInterface):
         WHERE id(source) = edge.from_node
         MATCH (target:{self._GRAPH_NODE_LABEL})
         WHERE id(target) = edge.to_node
-        MERGE (source)-[r:{relationship_name}]->(target)
-        ON CREATE SET r = edge.properties, r.updated_at = timestamp()
-        ON MATCH SET r = edge.properties, r.updated_at = timestamp()
+        MERGE (source)-[r:{relationship_name}]->(target)
+        ON CREATE SET r = edge.properties, r.updated_at = timestamp()
+        ON MATCH SET r = edge.properties, r.updated_at = timestamp()
         RETURN count(*) AS edges_processed
         """
 
@@ -817,7 +817,7 @@ class NeptuneGraphDB(GraphDBInterface):
         query = f"""
         MATCH (n:{self._GRAPH_NODE_LABEL})-[r]-(m:{self._GRAPH_NODE_LABEL})
         WHERE id(n) = $node_id
-        RETURN
+        RETURN
             id(n) AS source_id,
             id(m) AS target_id,
             type(r) AS relationship_name,
@@ -1034,7 +1034,7 @@ class NeptuneGraphDB(GraphDBInterface):
         query = f"""
         MATCH (source:{self._GRAPH_NODE_LABEL})-[r]->(target:{self._GRAPH_NODE_LABEL})
         WHERE id(source) = $node_id OR id(target) = $node_id
-        RETURN
+        RETURN
             id(source) AS source_id,
             properties(source) AS source_props,
             id(target) AS target_id,
@@ -1284,14 +1284,14 @@ class NeptuneGraphDB(GraphDBInterface):
 
         query = f"""
         MATCH (n :{self._GRAPH_NODE_LABEL})
-        WHERE size((n)--()) = 1
+        WHERE size((n)--()) = 1
         AND n.type = $node_type
         RETURN n
         """
         result = await self.query(query, {"node_type": node_type})
         return [record["n"] for record in result] if result else []
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Retrieve a subgraph related to a document identified by its content hash, including
         related entities and chunks.
@@ -1299,7 +1299,7 @@ class NeptuneGraphDB(GraphDBInterface):
         Parameters:
         -----------
 
-            - content_hash (str): The hash identifying the document whose subgraph should be
+            - data_id (str): The document_id identifying the document whose subgraph should be
               retrieved.
 
         Returns:
@@ -1312,10 +1312,10 @@ class NeptuneGraphDB(GraphDBInterface):
         MATCH (doc)
         WHERE (doc:{self._GRAPH_NODE_LABEL})
         AND doc.type in ['TextDocument', 'PdfDocument']
-        AND doc.name = 'text_' + $content_hash
+        AND doc.id = $data_id
 
         OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk {{type: 'DocumentChunk'}})
 
         // Alternative to WHERE NOT EXISTS
         OPTIONAL MATCH (chunk)-[:contains]->(entity {{type: 'Entity'}})
         OPTIONAL MATCH (entity)<-[:contains]-(otherChunk {{type: 'DocumentChunk'}})-[:is_part_of]->(otherDoc)
@@ -1330,7 +1330,7 @@ class NeptuneGraphDB(GraphDBInterface):
         OPTIONAL MATCH (type)<-[:is_a]-(otherEntity {{type: 'Entity'}})<-[:contains]-(otherChunk {{type: 'DocumentChunk'}})-[:is_part_of]->(otherDoc)
         WHERE otherDoc.type in ['TextDocument', 'PdfDocument']
         AND otherDoc.id <> doc.id
-
+
         // Alternative to WHERE NOT EXISTS
         WITH doc, entity, chunk, made_node, type, otherDoc
         WHERE otherDoc IS NULL
@@ -1342,7 +1342,7 @@ class NeptuneGraphDB(GraphDBInterface):
         collect(DISTINCT made_node) as made_from_nodes,
         collect(DISTINCT type) as orphan_types
         """
-        result = await self.query(query, {"content_hash": content_hash})
+        result = await self.query(query, {"data_id": data_id})
         return result[0] if result else None
 
     async def _get_model_independent_graph_data(self):
@@ -1388,7 +1388,7 @@ class NeptuneGraphDB(GraphDBInterface):
         CALL neptune.algo.wcc(n,{{}})
         YIELD node, component
         RETURN component, count(*) AS size
-        ORDER BY size DESC
+        ORDER BY size DESC
         """
 
         result = await self.query(query)
```
```diff
@@ -826,7 +826,7 @@ class NetworkXAdapter(GraphDBInterface):
 
         return mandatory_metrics | optional_metrics
 
-    async def get_document_subgraph(self, content_hash: str):
+    async def get_document_subgraph(self, data_id: str):
         """
         Retrieve all relevant nodes when a document is being deleted, including chunks and
         orphaned entities.
@@ -834,7 +834,7 @@ class NetworkXAdapter(GraphDBInterface):
         Parameters:
         -----------
 
-            - content_hash (str): The hash identifying the content of the document to fetch
+            - data_id(str): The data id identifying the document to fetch
               related nodes for.
 
         Returns:
@@ -853,7 +853,7 @@ class NetworkXAdapter(GraphDBInterface):
         for node_id, attrs in self.graph.nodes(data=True):
             if (
                 attrs.get("type") in ["TextDocument", "PdfDocument"]
-                and attrs.get("name") == f"text_{content_hash}"
+                and attrs.get("id") == f"{data_id}"
             ):
                 document = {"id": str(node_id), **attrs}  # Convert UUID to string for consistency
                 document_node_id = node_id  # Keep the original UUID
```
```diff
@@ -12,7 +12,15 @@ async def main():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    text_1 = """
+    first_file = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
+    )
+
+    second_file = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
+    )
+
+    third_content = """
     1. Audi
     Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars.
 
@@ -31,27 +39,10 @@ async def main():
     Each of these car manufacturer contributes to Germany's reputation as a leader in the global automotive industry, showcasing a blend of innovation, performance, and design excellence.
     """
 
-    text_2 = """
-    1. Apple
-    Apple is renowned for its innovative consumer electronics and software. Its product lineup includes the iPhone, iPad, Mac computers, and wearables like the Apple Watch. Known for its emphasis on sleek design and user-friendly interfaces, Apple has built a loyal customer base and created a seamless ecosystem that integrates hardware, software, and services.
-
-    2. Google
-    Founded in 1998, Google started as a search engine and quickly became the go-to resource for finding information online. Over the years, the company has diversified its offerings to include digital advertising, cloud computing, mobile operating systems (Android), and various web services like Gmail and Google Maps. Google's innovations have played a major role in shaping the internet landscape.
-
-    3. Microsoft
-    Microsoft Corporation has been a dominant force in software for decades. Its Windows operating system and Microsoft Office suite are staples in both business and personal computing. In recent years, Microsoft has expanded into cloud computing with Azure, gaming with the Xbox platform, and even hardware through products like the Surface line. This evolution has helped the company maintain its relevance in a rapidly changing tech world.
-
-    4. Amazon
-    What began as an online bookstore has grown into one of the largest e-commerce platforms globally. Amazon is known for its vast online marketplace, but its influence extends far beyond retail. With Amazon Web Services (AWS), the company has become a leader in cloud computing, offering robust solutions that power websites, applications, and businesses around the world. Amazon's constant drive for innovation continues to reshape both retail and technology sectors.
-
-    5. Meta
-    Meta, originally known as Facebook, revolutionized social media by connecting billions of people worldwide. Beyond its core social networking service, Meta is investing in the next generation of digital experiences through virtual and augmented reality technologies, with projects like Oculus. The company's efforts signal a commitment to evolving digital interaction and building the metaverse—a shared virtual space where users can connect and collaborate.
-
-    Each of these companies has significantly impacted the technology landscape, driving innovation and transforming everyday life through their groundbreaking products and services.
-    """
+    ################### HARD DELETE
 
     # Add documents and get dataset information
-    add_result = await cognee.add([text_1, text_2])
+    add_result = await cognee.add([first_file, second_file, third_content])
     dataset_id = add_result.dataset_id
 
     await cognee.cognify()
@@ -72,7 +63,33 @@ async def main():
 
     nodes, edges = await graph_engine.get_graph_data()
 
-    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted."
+    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with hard delete."
+
+    ################### SOFT DELETE
+
+    # Add documents and get dataset information
+    add_result = await cognee.add([first_file, second_file, third_content])
+    dataset_id = add_result.dataset_id
+
+    await cognee.cognify()
+
+    from cognee.infrastructure.databases.graph import get_graph_engine
+
+    graph_engine = await get_graph_engine()
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) > 10 and len(edges) > 10, "Graph database is not loaded."
+
+    # Get the data IDs from the dataset
+    dataset_data = await get_dataset_data(dataset_id)
+    assert len(dataset_data) > 0, "Dataset should contain data"
+
+    # Delete each document using its ID
+    for data_item in dataset_data:
+        await cognee.delete(data_item.id, dataset_id, mode="soft")
+
+    nodes, edges = await graph_engine.get_graph_data()
+
+    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with soft delete."
 
 
 if __name__ == "__main__":
```