feature: Adds doctype handling to delete (audio, image, unstructured) (#1239)

## Description

feature: Adds doctype handling to delete (audio, image, unstructured)

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
2025-08-12 19:01:06 +02:00 · 2025-08-12 19:01:06 +02:00 · fd9aaf57b1
commit fd9aaf57b1
parent 0c42d19505
3 changed files with 35 additions and 11 deletions
--- a/cognee/infrastructure/databases/graph/kuzu/adapter.py
+++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py
@ -1550,7 +1550,7 @@ class KuzuAdapter(GraphDBInterface):
        """
        query = """
        MATCH (doc:Node)
-        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.id = $data_id
+        WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument' OR doc.type = 'AudioDocument' OR doc.type = 'ImageDocument' OR doc.type = 'UnstructuredDocument') AND doc.id = $data_id

        OPTIONAL MATCH (doc)<-[e1:EDGE]-(chunk:Node)
        WHERE e1.relationship_name = 'is_part_of' AND chunk.type = 'DocumentChunk'
@ -1561,7 +1561,7 @@ class KuzuAdapter(GraphDBInterface):
            MATCH (entity)<-[e3:EDGE]-(otherChunk:Node)-[e4:EDGE]->(otherDoc:Node)
            WHERE e3.relationship_name = 'contains'
            AND e4.relationship_name = 'is_part_of'
-            AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument')
+            AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument' OR otherDoc.type = 'AudioDocument' OR otherDoc.type = 'ImageDocument' OR otherDoc.type = 'UnstructuredDocument')
            AND otherDoc.id <> doc.id
        }

@ -1577,7 +1577,7 @@ class KuzuAdapter(GraphDBInterface):
            AND e9.relationship_name = 'is_part_of'
            AND otherEntity.type = 'Entity'
            AND otherChunk.type = 'DocumentChunk'
-            AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument')
+            AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument' OR otherDoc.type = 'AudioDocument' OR otherDoc.type = 'ImageDocument' OR otherDoc.type = 'UnstructuredDocument')
            AND otherDoc.id <> doc.id
        }

--- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
+++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@ -1270,21 +1270,21 @@ class Neo4jAdapter(GraphDBInterface):
        """
        query = """
        MATCH (doc)
-        WHERE (doc:TextDocument OR doc:PdfDocument)
+        WHERE (doc:TextDocument OR doc:PdfDocument OR doc:UnstructuredDocument OR doc:AudioDocument or doc:ImageDocument)
        AND doc.id = $data_id

        OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk)
        OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity)
        WHERE NOT EXISTS {
            MATCH (entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc)
-            WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument)
+            WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument)
            AND otherDoc.id <> doc.id
        }
        OPTIONAL MATCH (chunk)<-[:made_from]-(made_node:TextSummary)
        OPTIONAL MATCH (entity)-[:is_a]->(type:EntityType)
        WHERE NOT EXISTS {
            MATCH (type)<-[:is_a]-(otherEntity:Entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc)
-            WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument)
+            WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument)
            AND otherDoc.id <> doc.id
        }

--- a/cognee/tests/test_deletion.py
+++ b/cognee/tests/test_deletion.py
@ -12,15 +12,21 @@ async def main():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

-    first_file = os.path.join(
+    pdf_document = os.path.join(
        pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
    )

-    second_file = os.path.join(
+    txt_document = os.path.join(
        pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
    )

-    third_content = """
+    audio_document = os.path.join(pathlib.Path(__file__).parent, "test_data/text_to_speech.mp3")
+
+    image_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.png")
+
+    unstructured_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.pptx")
+
+    text_document_as_literal = """
    1. Audi
    Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars.

@ -42,7 +48,16 @@ async def main():
    ################### HARD DELETE

    # Add documents and get dataset information
-    add_result = await cognee.add([first_file, second_file, third_content])
+    add_result = await cognee.add(
+        [
+            pdf_document,
+            txt_document,
+            text_document_as_literal,
+            unstructured_document,
+            audio_document,
+            image_document,
+        ]
+    )
    dataset_id = add_result.dataset_id

    await cognee.cognify()
@ -68,7 +83,16 @@ async def main():
    ################### SOFT DELETE

    # Add documents and get dataset information
-    add_result = await cognee.add([first_file, second_file, third_content])
+    add_result = await cognee.add(
+        [
+            pdf_document,
+            txt_document,
+            text_document_as_literal,
+            unstructured_document,
+            audio_document,
+            image_document,
+        ]
+    )
    dataset_id = add_result.dataset_id

    await cognee.cognify()