From 6a4d31356bb613e5cf74e7972445f804796ee6d4 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 18:55:47 +0530 Subject: [PATCH] fix: using graph projection instead of conditions --- .../retrieval/utils/access_tracking.py | 156 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 759 ++++++++---------- 2 files changed, 418 insertions(+), 497 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 12a66f8bc..935c47157 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,118 +4,116 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger from sqlalchemy import update +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Detect database provider and use appropriate queries - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - if provider == "kuzu": - await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neo4j": - await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == 
"neptune": - await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return + try: + # Update nodes using graph projection (database-agnostic approach) + await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) # Find origin documents and update SQL - doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) + doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - + except Exception as e: logger.error(f"Failed to update timestamps: {e}") raise -async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): - """Kuzu-specific node updates""" - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - -async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): - """Neo4j-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - -async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): - """Neptune-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - -async def _find_origin_documents(graph_engine, node_ids, provider): - """Find origin documents with provider-specific queries""" - if provider == "kuzu": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 
'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neo4j": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neptune": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ +async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): + """Update nodes using graph projection - works with any graph database""" + # Project the graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] + ) - result = await graph_engine.query(query, {"node_ids": node_ids}) - return list(set([row[0] for row in result if row and row[0]])) if result else [] + # Update each node's last_accessed_at property + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Update the node in the database + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await 
graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents_via_projection(graph_engine, node_ids): + """Find origin documents using graph projection instead of DB queries""" + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type"], + edge_properties_to_project=["relationship_name"] + ) + + # Find origin documents by traversing the in-memory graph + doc_ids = set() + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node and node.get_attribute("type") == "DocumentChunk": + # Traverse edges to find connected documents + for edge in node.get_skeleton_edges(): + # Get the neighbor node + neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else edge.get_source_node() + if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: + doc_ids.add(neighbor.id) + + return list(doc_ids) async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" @@ -124,6 +122,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index b89c939a8..c70b97a00 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,448 +1,371 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries)) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. 
-""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. 
+ + Parameters + ---------- + minutes_threshold : int + Minutes since last access after which data is considered unused + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." + ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." 
+ ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes using projection (database-agnostic) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes using graph projection + unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes (provider-agnostic deletion) + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + 
"summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). Parameters ---------- - minutes_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
- ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." - ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + 
Data.last_accessed.is_(None) + ) + ) - # Detect database provider and find unused nodes - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes with provider-specific logic - deleted_counts = await _delete_unused_nodes(unused_nodes, provider) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
+ "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() +async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: + """ + Find unused nodes using graph projection - database-agnostic approach. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type", "last_accessed_at"], + edge_properties_to_project=[] + ) - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + # Get all nodes from the projected graph + all_nodes = memory_fragment.get_nodes() + + for node in all_nodes: + node_type = node.get_attribute("type") + if node_type not in unused_nodes: + continue + + # Check last_accessed_at property + last_accessed = node.get_attribute("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node.id) + logger.debug( + f"Found unused {node_type}", + node_id=node.id, + last_accessed=last_accessed ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion (using graph projection for consistency) + if any(unused_nodes.values()): + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] ) - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Count edges from the in-memory graph + edge_count = len(node.get_skeleton_edges()) + deleted_counts["associations"] += edge_count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches (database-agnostic) + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document 
using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None, - provider: str = "kuzu" -) -> Dict[str, list]: - """ - Find unused nodes with provider-specific queries. 
- - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch - user_id : UUID, optional - Filter by user ID if provided - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() - - if provider == "kuzu": - return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) - elif provider == "neo4j": - return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) - elif provider == "neptune": - return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - -async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): - """Kuzu-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for node_id, node_type, props_json in results: - if node_type not in unused_nodes: - continue - - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): - """Neo4j-specific unused node detection""" - query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for row in results: 
- node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): - """Neptune-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for row in results: - node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - if provider == "kuzu": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neo4j": - result = await graph_engine.query( - "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neptune": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - - if result and len(result) > 0: - count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] - deleted_counts["associations"] += count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") 
- - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts