From fd9aaf57b1af101b6ec0796b50e096c208f4d35f Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 12 Aug 2025 19:01:06 +0200 Subject: [PATCH] feature: Adds doctype handling to delete (audio, image, unstructured) (#1239) ## Description feature: Adds doctype handling to delete (audio, image, unstructured) ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- .../databases/graph/kuzu/adapter.py | 6 ++-- .../databases/graph/neo4j_driver/adapter.py | 6 ++-- cognee/tests/test_deletion.py | 34 ++++++++++++++++--- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py index 2fa01a947..1bafb3754 100644 --- a/cognee/infrastructure/databases/graph/kuzu/adapter.py +++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py @@ -1550,7 +1550,7 @@ class KuzuAdapter(GraphDBInterface): """ query = """ MATCH (doc:Node) - WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.id = $data_id + WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument' OR doc.type = 'AudioDocument' OR doc.type = 'ImageDocument' OR doc.type = 'UnstructuredDocument') AND doc.id = $data_id OPTIONAL MATCH (doc)<-[e1:EDGE]-(chunk:Node) WHERE e1.relationship_name = 'is_part_of' AND chunk.type = 'DocumentChunk' @@ -1561,7 +1561,7 @@ class KuzuAdapter(GraphDBInterface): MATCH (entity)<-[e3:EDGE]-(otherChunk:Node)-[e4:EDGE]->(otherDoc:Node) WHERE e3.relationship_name = 'contains' AND e4.relationship_name = 'is_part_of' - AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument') + AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument' OR otherDoc.type = 'AudioDocument' OR otherDoc.type = 'ImageDocument' OR otherDoc.type = 'UnstructuredDocument') AND otherDoc.id <> doc.id } @@ -1577,7 +1577,7 @@ class KuzuAdapter(GraphDBInterface): AND e9.relationship_name = 'is_part_of' AND otherEntity.type = 'Entity' AND otherChunk.type = 'DocumentChunk' - AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument') + AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument' OR otherDoc.type = 'AudioDocument' OR otherDoc.type = 'ImageDocument' OR otherDoc.type = 'UnstructuredDocument') AND otherDoc.id <> doc.id } diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 7091c32e1..ea8072554 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -1270,21 +1270,21 @@ class Neo4jAdapter(GraphDBInterface): """ query = """ MATCH (doc) - WHERE (doc:TextDocument OR doc:PdfDocument) + WHERE (doc:TextDocument OR doc:PdfDocument OR doc:UnstructuredDocument OR doc:AudioDocument or doc:ImageDocument) AND doc.id = $data_id OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk) OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity) WHERE NOT EXISTS { MATCH (entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc) - WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument) + WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument) AND otherDoc.id <> doc.id } OPTIONAL MATCH (chunk)<-[:made_from]-(made_node:TextSummary) OPTIONAL MATCH (entity)-[:is_a]->(type:EntityType) WHERE NOT EXISTS { MATCH (type)<-[:is_a]-(otherEntity:Entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc) - WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument) + WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument) AND otherDoc.id <> doc.id } diff --git a/cognee/tests/test_deletion.py b/cognee/tests/test_deletion.py index 54c69ad12..99450084d 100644 --- a/cognee/tests/test_deletion.py +++ b/cognee/tests/test_deletion.py @@ -12,15 +12,21 @@ async def main(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - first_file = os.path.join( + pdf_document = os.path.join( pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf" ) - second_file = os.path.join( + txt_document = os.path.join( pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt" ) - third_content = """ + audio_document = os.path.join(pathlib.Path(__file__).parent, "test_data/text_to_speech.mp3") + + image_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.png") + + unstructured_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.pptx") + + text_document_as_literal = """ 1. Audi Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars. @@ -42,7 +48,16 @@ async def main(): ################### HARD DELETE # Add documents and get dataset information - add_result = await cognee.add([first_file, second_file, third_content]) + add_result = await cognee.add( + [ + pdf_document, + txt_document, + text_document_as_literal, + unstructured_document, + audio_document, + image_document, + ] + ) dataset_id = add_result.dataset_id await cognee.cognify() @@ -68,7 +83,16 @@ async def main(): ################### SOFT DELETE # Add documents and get dataset information - add_result = await cognee.add([first_file, second_file, third_content]) + add_result = await cognee.add( + [ + pdf_document, + txt_document, + text_document_as_literal, + unstructured_document, + audio_document, + image_document, + ] + ) dataset_id = add_result.dataset_id await cognee.cognify()