feature: Adds doctype handling to delete (audio, image, unstructured) (#1239)

<!-- .github/pull_request_template.md -->

## Description
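feature: Adds document-type handling to delete for audio, image, and unstructured documents (AudioDocument, ImageDocument, UnstructuredDocument), alongside the existing TextDocument and PdfDocument handling, in both the Kuzu and Neo4j graph adapters.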

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
hajdul88 2025-08-12 19:01:06 +02:00 committed by GitHub
parent 0c42d19505
commit fd9aaf57b1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 35 additions and 11 deletions

View file

@ -1550,7 +1550,7 @@ class KuzuAdapter(GraphDBInterface):
"""
query = """
MATCH (doc:Node)
WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument') AND doc.id = $data_id
WHERE (doc.type = 'TextDocument' OR doc.type = 'PdfDocument' OR doc.type = 'AudioDocument' OR doc.type = 'ImageDocument' OR doc.type = 'UnstructuredDocument') AND doc.id = $data_id
OPTIONAL MATCH (doc)<-[e1:EDGE]-(chunk:Node)
WHERE e1.relationship_name = 'is_part_of' AND chunk.type = 'DocumentChunk'
@ -1561,7 +1561,7 @@ class KuzuAdapter(GraphDBInterface):
MATCH (entity)<-[e3:EDGE]-(otherChunk:Node)-[e4:EDGE]->(otherDoc:Node)
WHERE e3.relationship_name = 'contains'
AND e4.relationship_name = 'is_part_of'
AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument')
AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument' OR otherDoc.type = 'AudioDocument' OR otherDoc.type = 'ImageDocument' OR otherDoc.type = 'UnstructuredDocument')
AND otherDoc.id <> doc.id
}
@ -1577,7 +1577,7 @@ class KuzuAdapter(GraphDBInterface):
AND e9.relationship_name = 'is_part_of'
AND otherEntity.type = 'Entity'
AND otherChunk.type = 'DocumentChunk'
AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument')
AND (otherDoc.type = 'TextDocument' OR otherDoc.type = 'PdfDocument' OR otherDoc.type = 'AudioDocument' OR otherDoc.type = 'ImageDocument' OR otherDoc.type = 'UnstructuredDocument')
AND otherDoc.id <> doc.id
}

View file

@ -1270,21 +1270,21 @@ class Neo4jAdapter(GraphDBInterface):
"""
query = """
MATCH (doc)
WHERE (doc:TextDocument OR doc:PdfDocument)
WHERE (doc:TextDocument OR doc:PdfDocument OR doc:UnstructuredDocument OR doc:AudioDocument or doc:ImageDocument)
AND doc.id = $data_id
OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk)
OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity)
WHERE NOT EXISTS {
MATCH (entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc)
WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument)
WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument)
AND otherDoc.id <> doc.id
}
OPTIONAL MATCH (chunk)<-[:made_from]-(made_node:TextSummary)
OPTIONAL MATCH (entity)-[:is_a]->(type:EntityType)
WHERE NOT EXISTS {
MATCH (type)<-[:is_a]-(otherEntity:Entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc)
WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument)
WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument)
AND otherDoc.id <> doc.id
}

View file

@ -12,15 +12,21 @@ async def main():
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
first_file = os.path.join(
pdf_document = os.path.join(
pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
)
second_file = os.path.join(
txt_document = os.path.join(
pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
)
third_content = """
audio_document = os.path.join(pathlib.Path(__file__).parent, "test_data/text_to_speech.mp3")
image_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.png")
unstructured_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.pptx")
text_document_as_literal = """
1. Audi
Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars.
@ -42,7 +48,16 @@ async def main():
################### HARD DELETE
# Add documents and get dataset information
add_result = await cognee.add([first_file, second_file, third_content])
add_result = await cognee.add(
[
pdf_document,
txt_document,
text_document_as_literal,
unstructured_document,
audio_document,
image_document,
]
)
dataset_id = add_result.dataset_id
await cognee.cognify()
@ -68,7 +83,16 @@ async def main():
################### SOFT DELETE
# Add documents and get dataset information
add_result = await cognee.add([first_file, second_file, third_content])
add_result = await cognee.add(
[
pdf_document,
txt_document,
text_document_as_literal,
unstructured_document,
audio_document,
image_document,
]
)
dataset_id = add_result.dataset_id
await cognee.cognify()