From fd9aaf57b1af101b6ec0796b50e096c208f4d35f Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Tue, 12 Aug 2025 19:01:06 +0200
Subject: [PATCH] feature: Adds doctype handling to delete (audio, image,
 unstructured) (#1239)

<!-- .github/pull_request_template.md -->

## Description
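feature: Adds doctype handling to delete (audio, image, unstructured)

The delete queries in the Kuzu and Neo4j graph adapters previously matched
only `TextDocument` and `PdfDocument` nodes. They now also match
`AudioDocument`, `ImageDocument`, and `UnstructuredDocument`, both when
selecting the document to delete and when checking whether an entity or
entity type is still referenced by another document.

`cognee/tests/test_deletion.py` now adds an MP3, a PNG, and a PPTX file
alongside the existing PDF and text inputs in both the hard-delete and
soft-delete runs.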

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
---
 .../databases/graph/kuzu/adapter.py           |  6 ++--
 .../databases/graph/neo4j_driver/adapter.py   |  6 ++--
 cognee/tests/test_deletion.py                 | 34 ++++++++++++++++---
 3 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py
index 2fa01a947..1bafb3754 100644
--- a/cognee/infrastructure/databases/graph/kuzu/adapter.py
+++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py
@@ -1550,7 +1550,7 @@ class KuzuAdapter(GraphDBInterface):
         """
         query = """
         MATCH (doc:Node)
-        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.id = $data_id
+        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument' OR doc.type = 'AudioDocument' OR doc.type = 'ImageDocument' OR doc.type = 'UnstructuredDocument') AND doc.id = $data_id
 
         OPTIONAL MATCH (doc)<-[e1:EDGE]-(chunk:Node)
         WHERE e1.relationship_name = 'is_part_of' AND chunk.type = 'DocumentChunk'
@@ -1561,7 +1561,7 @@ class KuzuAdapter(GraphDBInterface):
             MATCH (entity)<-[e3:EDGE]-(otherChunk:Node)-[e4:EDGE]->(otherDoc:Node)
             WHERE e3.relationship_name = 'contains'
             AND e4.relationship_name = 'is_part_of'
-            AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument')
+            AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument' OR otherDoc.type = 'AudioDocument' OR otherDoc.type = 'ImageDocument' OR otherDoc.type = 'UnstructuredDocument')
             AND otherDoc.id <> doc.id
         }
 
@@ -1577,7 +1577,7 @@ class KuzuAdapter(GraphDBInterface):
             AND e9.relationship_name = 'is_part_of'
             AND otherEntity.type = 'Entity'
             AND otherChunk.type = 'DocumentChunk'
-            AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument')
+            AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument' OR otherDoc.type = 'AudioDocument' OR otherDoc.type = 'ImageDocument' OR otherDoc.type = 'UnstructuredDocument')
             AND otherDoc.id <> doc.id
         }
 
diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
index 7091c32e1..ea8072554 100644
--- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
+++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@@ -1270,21 +1270,21 @@ class Neo4jAdapter(GraphDBInterface):
         """
         query = """
         MATCH (doc)
-        WHERE (doc:TextDocument OR doc:PdfDocument)
+        WHERE (doc:TextDocument OR doc:PdfDocument OR doc:UnstructuredDocument OR doc:AudioDocument or doc:ImageDocument)
         AND doc.id = $data_id
 
         OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk)
         OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity)
         WHERE NOT EXISTS {
             MATCH (entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc)
-            WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument)
+            WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument)
             AND otherDoc.id <> doc.id
         }
         OPTIONAL MATCH (chunk)<-[:made_from]-(made_node:TextSummary)
         OPTIONAL MATCH (entity)-[:is_a]->(type:EntityType)
         WHERE NOT EXISTS {
             MATCH (type)<-[:is_a]-(otherEntity:Entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc)
-            WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument)
+            WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument)
             AND otherDoc.id <> doc.id
         }
 
diff --git a/cognee/tests/test_deletion.py b/cognee/tests/test_deletion.py
index 54c69ad12..99450084d 100644
--- a/cognee/tests/test_deletion.py
+++ b/cognee/tests/test_deletion.py
@@ -12,15 +12,21 @@ async def main():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-    first_file = os.path.join(
+    pdf_document = os.path.join(
         pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
     )
 
-    second_file = os.path.join(
+    txt_document = os.path.join(
         pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
     )
 
-    third_content = """
+    audio_document = os.path.join(pathlib.Path(__file__).parent, "test_data/text_to_speech.mp3")
+
+    image_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.png")
+
+    unstructured_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.pptx")
+
+    text_document_as_literal = """
     1. Audi
     Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars.
 
@@ -42,7 +48,16 @@ async def main():
     ################### HARD DELETE
 
     # Add documents and get dataset information
-    add_result = await cognee.add([first_file, second_file, third_content])
+    add_result = await cognee.add(
+        [
+            pdf_document,
+            txt_document,
+            text_document_as_literal,
+            unstructured_document,
+            audio_document,
+            image_document,
+        ]
+    )
     dataset_id = add_result.dataset_id
 
     await cognee.cognify()
@@ -68,7 +83,16 @@ async def main():
     ################### SOFT DELETE
 
     # Add documents and get dataset information
-    add_result = await cognee.add([first_file, second_file, third_content])
+    add_result = await cognee.add(
+        [
+            pdf_document,
+            txt_document,
+            text_document_as_literal,
+            unstructured_document,
+            audio_document,
+            image_document,
+        ]
+    )
     dataset_id = add_result.dataset_id
 
     await cognee.cognify()