cognee/cognee/tasks/chunks/remove_disconnected_chunks.py
2025-01-17 10:30:34 +01:00

38 lines
1.4 KiB
Python

from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
async def remove_disconnected_chunks(data_chunks: list[DocumentChunk]) -> list[DocumentChunk]:
"""
Removes disconnected or obsolete chunks from the graph database.
Notes:
- Obsolete chunks are defined as chunks with no "next_chunk" predecessor.
- Fully disconnected nodes are identified and deleted separately.
- This function assumes that the graph database is properly initialized and accessible.
"""
graph_engine = await get_graph_engine()
document_ids = set((data_chunk.document_id for data_chunk in data_chunks))
obsolete_chunk_ids = []
for document_id in document_ids:
chunks = await graph_engine.get_successors(document_id, edge_label="has_chunk")
for chunk in chunks:
previous_chunks = await graph_engine.get_predecessors(
chunk["uuid"], edge_label="next_chunk"
)
if len(previous_chunks) == 0:
obsolete_chunk_ids.append(chunk["uuid"])
if len(obsolete_chunk_ids) > 0:
await graph_engine.delete_nodes(obsolete_chunk_ids)
disconnected_nodes = await graph_engine.get_disconnected_nodes()
if len(disconnected_nodes) > 0:
await graph_engine.delete_nodes(disconnected_nodes)
return data_chunks