From 3372679f7bb40c01ffd9e337ead27fe9f8981d54 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 29 Oct 2025 20:12:14 +0530 Subject: [PATCH 01/25] feat: adding last_accessed_at field to the models and updating the retrievers to update the timestamp --- .../modules/chunking/models/DocumentChunk.py | 7 +++ cognee/modules/engine/models/Entity.py | 7 ++- cognee/modules/retrieval/chunks_retriever.py | 55 +++++++---------- .../modules/retrieval/summaries_retriever.py | 28 ++++----- .../retrieval/utils/access_tracking.py | 61 +++++++++++++++++++ cognee/tasks/summarization/models.py | 8 ++- 6 files changed, 115 insertions(+), 51 deletions(-) create mode 100644 cognee/modules/retrieval/utils/access_tracking.py diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 9f8c57486..c4c6a2ed3 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,5 +1,7 @@ from typing import List, Union +from pydantic import BaseModel, Field +from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.data.processing.document_types import Document from cognee.modules.engine.models import Entity @@ -22,6 +24,7 @@ class DocumentChunk(DataPoint): - cut_type: The type of cut that defined this chunk. - is_part_of: The document to which this chunk belongs. - contains: A list of entities or events contained within the chunk (default is None). + - last_accessed_at: The timestamp of the last time the chunk was accessed. - metadata: A dictionary to hold meta information related to the chunk, including index fields. """ @@ -32,5 +35,9 @@ class DocumentChunk(DataPoint): cut_type: str is_part_of: Document contains: List[Union[Entity, Event]] = None + + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 36da2e344..3e48ea02a 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,11 +1,14 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.models.EntityType import EntityType from typing import Optional - +from datetime import datetime, timezone +from pydantic import BaseModel, Field class Entity(DataPoint): name: str is_a: Optional[EntityType] = None description: str - + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index 94b9d3fb9..74634b71e 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -1,10 +1,11 @@ from typing import Any, Optional - +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError +from datetime import datetime, timezone logger = get_logger("ChunksRetriever") @@ -27,38 +28,26 @@ class ChunksRetriever(BaseRetriever): ): 
self.top_k = top_k - async def get_context(self, query: str) -> Any: - """ - Retrieves document chunks context based on the query. - - Searches for document chunks relevant to the specified query using a vector engine. - Raises a NoDataError if no data is found in the system. - - Parameters: - ----------- - - - query (str): The query string to search for relevant document chunks. - - Returns: - -------- - - - Any: A list of document chunk payloads retrieved from the search. - """ - logger.info( - f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'" - ) - - vector_engine = get_vector_engine() - - try: - found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) - logger.info(f"Found {len(found_chunks)} chunks from vector search") - except CollectionNotFoundError as error: - logger.error("DocumentChunk_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - chunk_payloads = [result.payload for result in found_chunks] - logger.info(f"Returning {len(chunk_payloads)} chunk payloads") + async def get_context(self, query: str) -> Any: + """Retrieves document chunks context based on the query.""" + logger.info( + f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'" + ) + + vector_engine = get_vector_engine() + + try: + found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) + logger.info(f"Found {len(found_chunks)} chunks from vector search") + + # NEW: Update access timestamps + await update_node_access_timestamps(found_chunks, "DocumentChunk") + except CollectionNotFoundError as error: + logger.error("DocumentChunk_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + chunk_payloads = [result.payload for result in found_chunks] + logger.info(f"Returning {len(chunk_payloads)} chunk payloads") return chunk_payloads async def get_completion( diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 87b224946..7f996274e 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -4,6 +4,7 @@ from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError logger = get_logger("SummariesRetriever") @@ -47,20 +48,19 @@ class SummariesRetriever(BaseRetriever): f"Starting summary retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" ) - vector_engine = get_vector_engine() - - try: - summaries_results = await vector_engine.search( - "TextSummary_text", query, limit=self.top_k - ) - logger.info(f"Found {len(summaries_results)} summaries from vector search") - except CollectionNotFoundError as error: - logger.error("TextSummary_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - summary_payloads = [summary.payload for summary in summaries_results] - logger.info(f"Returning {len(summary_payloads)} summary payloads") - return summary_payloads + vector_engine = get_vector_engine() + + try: + summaries_results = await vector_engine.search( + "TextSummary_text", query, limit=self.top_k + ) + + await update_node_access_timestamps(summaries_results, "TextSummary") + + except CollectionNotFoundError as error: + raise NoDataError("No data found in the system, please add data first.") from error + + return [summary.payload for summary in summaries_results] async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None, **kwargs diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py new file mode 100644 index 000000000..ca5ed88cd --- /dev/null +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -0,0 +1,61 @@ + +"""Utilities for tracking data access in retrievers.""" + +import json +from datetime import datetime, timezone +from typing import List, Any + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.shared.logging_utils import get_logger + +logger = get_logger(__name__) + + +async def update_node_access_timestamps(items: List[Any], node_type: str): + """ + Update last_accessed_at for nodes in Kuzu graph database. 
+ + Parameters + ---------- + items : List[Any] + List of items with payload containing 'id' field (from vector search results) + node_type : str + Type of node to update (e.g., 'DocumentChunk', 'Entity', 'TextSummary') + """ + if not items: + return + + graph_engine = await get_graph_engine() + # Convert to milliseconds since epoch (matching the field format) + timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) + + for item in items: + # Extract ID from payload (vector search results have this structure) + item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") + if not item_id: + continue + + try: + # Get current node properties from Kuzu's Node table + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) WHERE n.type = $node_type RETURN n.properties as props", + {"id": str(item_id), "node_type": node_type} + ) + + if result and len(result) > 0 and result[0][0]: + # Parse existing properties JSON + props = json.loads(result[0][0]) if result[0][0] else {} + # Update last_accessed_at with millisecond timestamp + props["last_accessed_at"] = timestamp_ms + + # Write back to graph database + await graph_engine.query( + "MATCH (n:Node {id: $id}) WHERE n.type = $node_type SET n.properties = $props", + {"id": str(item_id), "node_type": node_type, "props": json.dumps(props)} + ) + except Exception as e: + logger.warning(f"Failed to update timestamp for {node_type} {item_id}: {e}") + continue + + logger.debug(f"Updated access timestamps for {len(items)} {node_type} nodes") + diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 75ed82d50..46f9a8d8b 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,5 +1,7 @@ -from typing import Union +from pydantic import BaseModel, Field +from typing import Union +from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import DocumentChunk from cognee.shared.CodeGraphEntities import CodeFile, CodePart @@ -17,7 +19,9 @@ class TextSummary(DataPoint): text: str made_from: DocumentChunk - + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["text"]} From 3f27c5592b58af29369125362510e96b72c56cbc Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 29 Oct 2025 20:17:27 +0530 Subject: [PATCH 02/25] feat: adding last_accessed_at field to the models and updating the retrievers to update the timestamp --- cognee/modules/retrieval/chunks_retriever.py | 48 +++++++++++-------- .../modules/retrieval/summaries_retriever.py | 28 ++++++----- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index 74634b71e..f821fc902 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -29,26 +29,34 @@ class ChunksRetriever(BaseRetriever): self.top_k = top_k async def get_context(self, query: str) -> Any: - """Retrieves document chunks context based on the query.""" - logger.info( - f"Starting chunk retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'"
-        )
-
-        vector_engine = get_vector_engine()
-
-        try:
-            found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k)
-            logger.info(f"Found {len(found_chunks)} chunks from vector search")
-
-            # NEW: Update access timestamps
-            await update_node_access_timestamps(found_chunks, "DocumentChunk")
-        except CollectionNotFoundError as error:
-            logger.error("DocumentChunk_text collection not found in vector database")
-            raise NoDataError("No data found in the system, please add data first.") from error
-
-        chunk_payloads = [result.payload for result in found_chunks]
-        logger.info(f"Returning {len(chunk_payloads)} chunk payloads")
-        return chunk_payloads
+        """
+        Retrieves document chunks context based on the query.
+        Searches for document chunks relevant to the specified query using a vector engine.
+        Raises a NoDataError if no data is found in the system.
+        Parameters:
+        -----------
+        - query (str): The query string to search for relevant document chunks.
+        Returns:
+        --------
+        - Any: A list of document chunk payloads retrieved from the search.
+        """
+        logger.info(
+            f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'"
+        )
+
+        vector_engine = get_vector_engine()
+
+        try:
+            found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k)
+            logger.info(f"Found {len(found_chunks)} chunks from vector search")
+            await update_node_access_timestamps(found_chunks, "DocumentChunk")
+
+        except CollectionNotFoundError as error:
+            logger.error("DocumentChunk_text collection not found in vector database")
+            raise NoDataError("No data found in the system, please add data first.") from error
+
+        chunk_payloads = [result.payload for result in found_chunks]
+        logger.info(f"Returning {len(chunk_payloads)} chunk payloads")
+        return chunk_payloads

     async def get_completion(
         self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None

diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py
index 7f996274e..9ac8b096d 100644
--- a/cognee/modules/retrieval/summaries_retriever.py
+++ b/cognee/modules/retrieval/summaries_retriever.py
@@ -48,19 +48,23 @@ class SummariesRetriever(BaseRetriever):
             f"Starting summary retrieval for query: '{query[:100]}{'...'
if len(query) > 100 else ''}'" ) - vector_engine = get_vector_engine() - - try: - summaries_results = await vector_engine.search( - "TextSummary_text", query, limit=self.top_k - ) - + vector_engine = get_vector_engine() + + try: + summaries_results = await vector_engine.search( + "TextSummary_text", query, limit=self.top_k + ) + logger.info(f"Found {len(summaries_results)} summaries from vector search") + await update_node_access_timestamps(summaries_results, "TextSummary") - - except CollectionNotFoundError as error: - raise NoDataError("No data found in the system, please add data first.") from error - - return [summary.payload for summary in summaries_results] + + except CollectionNotFoundError as error: + logger.error("TextSummary_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + summary_payloads = [summary.payload for summary in summaries_results] + logger.info(f"Returning {len(summary_payloads)} summary payloads") + return summary_payloads async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None, **kwargs From 5f6f0502c832d129749b453121c6f5be565044bc Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 00:00:18 +0530 Subject: [PATCH 03/25] fix: removing last_acessed_at from individual model and adding it to DataPoint --- cognee/infrastructure/engine/models/DataPoint.py | 3 +++ cognee/modules/chunking/models/DocumentChunk.py | 5 ----- cognee/modules/engine/models/Entity.py | 3 --- cognee/tasks/summarization/models.py | 3 --- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 812380eaa..3178713c8 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -43,6 +43,9 @@ class DataPoint(BaseModel): updated_at: int = Field( default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) ) + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) ontology_valid: bool = False version: int = 1 # Default version topological_rank: Optional[int] = 0 diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index c4c6a2ed3..601454802 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -35,9 +35,4 @@ class DocumentChunk(DataPoint): cut_type: str is_part_of: Document contains: List[Union[Entity, Event]] = None - - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) - metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 3e48ea02a..4083cd2e6 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -8,7 +8,4 @@ class Entity(DataPoint): name: str is_a: Optional[EntityType] = None description: str - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 46f9a8d8b..8cee2ade3 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -19,9 +19,6 @@ class TextSummary(DataPoint): 
text: str made_from: DocumentChunk - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) metadata: dict = {"index_fields": ["text"]} From 6f06e4a5eb1143ddcb2ad08132486630b8a2deae Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 00:17:13 +0530 Subject: [PATCH 04/25] fix: removing node_type and try except --- cognee/modules/retrieval/chunks_retriever.py | 2 +- .../modules/retrieval/summaries_retriever.py | 2 +- .../retrieval/utils/access_tracking.py | 55 ++++++++++--------- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index f821fc902..be1f95811 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -49,7 +49,7 @@ class ChunksRetriever(BaseRetriever): try: found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) logger.info(f"Found {len(found_chunks)} chunks from vector search") - await update_node_access_timestamps(found_chunks, "DocumentChunk") + await update_node_access_timestamps(found_chunks) except CollectionNotFoundError as error: logger.error("DocumentChunk_text collection not found in vector database") diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 9ac8b096d..0df750d22 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -56,7 +56,7 @@ class SummariesRetriever(BaseRetriever): ) logger.info(f"Found {len(summaries_results)} summaries from vector search") - await update_node_access_timestamps(summaries_results, "TextSummary") + await update_node_access_timestamps(summaries_results) except CollectionNotFoundError as error: logger.error("TextSummary_text collection not found in vector database") diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index ca5ed88cd..79afd25db 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -1,4 +1,4 @@ - + """Utilities for tracking data access in retrievers.""" import json @@ -11,51 +11,54 @@ from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) -async def update_node_access_timestamps(items: List[Any], node_type: str): +async def update_node_access_timestamps(items: List[Any]): """ Update last_accessed_at for nodes in Kuzu graph database. + Automatically determines node type from the graph database. 
Parameters ---------- items : List[Any] List of items with payload containing 'id' field (from vector search results) - node_type : str - Type of node to update (e.g., 'DocumentChunk', 'Entity', 'TextSummary') """ if not items: return graph_engine = await get_graph_engine() - # Convert to milliseconds since epoch (matching the field format) timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) for item in items: - # Extract ID from payload (vector search results have this structure) + # Extract ID from payload item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if not item_id: continue - try: - # Get current node properties from Kuzu's Node table - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) WHERE n.type = $node_type RETURN n.properties as props", - {"id": str(item_id), "node_type": node_type} + # try: + # Query to get both node type and properties in one call + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.type as node_type, n.properties as props", + {"id": str(item_id)} + ) + + if result and len(result) > 0 and result[0]: + node_type = result[0][0] # First column: node_type + props_json = result[0][1] # Second column: properties + + # Parse existing properties JSON + props = json.loads(props_json) if props_json else {} + # Update last_accessed_at with millisecond timestamp + props["last_accessed_at"] = timestamp_ms + + # Write back to graph database + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": str(item_id), "props": json.dumps(props)} ) - if result and len(result) > 0 and result[0][0]: - # Parse existing properties JSON - props = json.loads(result[0][0]) if result[0][0] else {} - # Update last_accessed_at with millisecond timestamp - props["last_accessed_at"] = timestamp_ms + logger.debug(f"Updated access timestamp for {node_type} node {item_id}") - # Write back to graph database - await graph_engine.query( - "MATCH (n:Node {id: $id}) WHERE n.type = $node_type SET n.properties = $props", - {"id": str(item_id), "node_type": node_type, "props": json.dumps(props)} - ) - except Exception as e: - logger.warning(f"Failed to update timestamp for {node_type} {item_id}: {e}") - continue + # except Exception as e: + # logger.error(f"Failed to update timestamp for node {item_id}: {e}") + # continue - logger.debug(f"Updated access timestamps for {len(items)} {node_type} nodes") - + logger.debug(f"Updated access timestamps for {len(items)} nodes") From f1afd1f0a2a5433dc341c485b08ce33d1bc16252 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 15:49:34 +0530 Subject: [PATCH 05/25] feat: adding cleanup function and adding update_node_acess_timestamps in completion retriever and graph_completion retriever --- .../modules/retrieval/completion_retriever.py | 3 +- .../retrieval/graph_completion_retriever.py | 13 +- cognee/tasks/cleanup/cleanup_unused_data.py | 232 ++++++++++++++++++ 3 files changed, 246 insertions(+), 2 deletions(-) create mode 100644 cognee/tasks/cleanup/cleanup_unused_data.py diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index bb568924d..fc8ef747f 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -8,6 +8,7 @@ from cognee.modules.retrieval.utils.session_cache import ( save_conversation_history, get_conversation_history, ) +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps 
from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError @@ -65,7 +66,7 @@ class CompletionRetriever(BaseRetriever): if len(found_chunks) == 0: return "" - + await update_node_access_timestamps(found_chunks) # Combine all chunks text returned from vector search (number of chunks is determined by top_k chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks] combined_context = "\n".join(chunks_payload) diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index b7ab4edae..ac7e45e3c 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -16,6 +16,7 @@ from cognee.modules.retrieval.utils.session_cache import ( ) from cognee.shared.logging_utils import get_logger from cognee.modules.retrieval.utils.extract_uuid_from_node import extract_uuid_from_node +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.modules.retrieval.utils.models import CogneeUserInteraction from cognee.modules.engine.models.node_set import NodeSet from cognee.infrastructure.databases.graph import get_graph_engine @@ -138,7 +139,17 @@ class GraphCompletionRetriever(BaseGraphRetriever): return [] # context = await self.resolve_edges_to_text(triplets) - + entity_nodes = [] + seen_ids = set() + for triplet in triplets: + if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node1.id)}) + seen_ids.add(triplet.node1.id) + if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node2.id)}) + seen_ids.add(triplet.node2.id) + + await update_node_access_timestamps(entity_nodes) return triplets async def get_completion( diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py new file mode 100644 index 000000000..e97692bb4 --- /dev/null +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -0,0 +1,232 @@ +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.shared.logging_utils import get_logger + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: int = 30, + dry_run: bool = True, + user_id: Optional[UUID] = None +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. 
+ + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused (default: 30) + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None + ) + + # Calculate cutoff timestamp in milliseconds + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None +) -> Dict[str, list]: + """ + Query Kuzu for nodes with old last_accessed_at timestamps. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Query all nodes with their properties + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = { + "DocumentChunk": [], + "Entity": [], + "TextSummary": [] + } + + for node_id, node_type, props_json in results: + # Only process tracked node types + if node_type not in unused_nodes: + continue + + # Parse properties JSON + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + # Check if node is unused (never accessed or accessed before cutoff) + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + # TODO: Add user_id filtering when user ownership is implemented + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. + + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + if result and len(result) > 0: + deleted_counts["associations"] += result[0][0] + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + # Delete from vector collection + if await vector_engine.has_collection(collection_name): + for node_id in node_ids: + try: + await vector_engine.delete(collection_name, {"id": str(node_id)}) + except Exception as e: + logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + + return deleted_counts From 5080e8f8a5c20d092b917b66eb52a577fe899231 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 3 Nov 2025 00:59:04 +0530 Subject: [PATCH 
06/25] feat: generalizing getting entities from triplets

---
 cognee/modules/graph/utils/__init__.py              |  1 +
 .../graph/utils/get_entity_nodes_from_triplets.py   | 13 +++++++++++++
 .../modules/retrieval/graph_completion_retriever.py | 12 +++---------
 3 files changed, 17 insertions(+), 9 deletions(-)
 create mode 100644 cognee/modules/graph/utils/get_entity_nodes_from_triplets.py

diff --git a/cognee/modules/graph/utils/__init__.py b/cognee/modules/graph/utils/__init__.py
index ebc648495..4c0b29d47 100644
--- a/cognee/modules/graph/utils/__init__.py
+++ b/cognee/modules/graph/utils/__init__.py
@@ -5,3 +5,4 @@ from .retrieve_existing_edges import retrieve_existing_edges
 from .convert_node_to_data_point import convert_node_to_data_point
 from .deduplicate_nodes_and_edges import deduplicate_nodes_and_edges
 from .resolve_edges_to_text import resolve_edges_to_text
+from .get_entity_nodes_from_triplets import get_entity_nodes_from_triplets

diff --git a/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py b/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py
new file mode 100644
index 000000000..598a36854
--- /dev/null
+++ b/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py
@@ -0,0 +1,13 @@
+
+def get_entity_nodes_from_triplets(triplets):
+    entity_nodes = []
+    seen_ids = set()
+    for triplet in triplets:
+        if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids:
+            entity_nodes.append({"id": str(triplet.node1.id)})
+            seen_ids.add(triplet.node1.id)
+        if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids:
+            entity_nodes.append({"id": str(triplet.node2.id)})
+            seen_ids.add(triplet.node2.id)
+
+    return entity_nodes

diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py
index ac7e45e3c..122cc943f 100644
--- a/cognee/modules/retrieval/graph_completion_retriever.py
+++ b/cognee/modules/retrieval/graph_completion_retriever.py
@@ -22,6 +22,7 @@ from cognee.modules.engine.models.node_set import NodeSet
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.context_global_variables import session_user
 from cognee.infrastructure.databases.cache.config import CacheConfig
+from cognee.modules.graph.utils import get_entity_nodes_from_triplets

 logger = get_logger("GraphCompletionRetriever")

@@ -139,15 +140,8 @@ class GraphCompletionRetriever(BaseGraphRetriever):
             return []

         # context = await self.resolve_edges_to_text(triplets)
-        entity_nodes = []
-        seen_ids = set()
-        for triplet in triplets:
-            if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids:
-                entity_nodes.append({"id": str(triplet.node1.id)})
-                seen_ids.add(triplet.node1.id)
-            if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids:
-                entity_nodes.append({"id": str(triplet.node2.id)})
-                seen_ids.add(triplet.node2.id)
+
+        entity_nodes = get_entity_nodes_from_triplets(triplets)

         await update_node_access_timestamps(entity_nodes)
         return triplets

     async def get_completion(

From d34fd9237bf41c6b421bd556541b50ea68246e45 Mon Sep 17 00:00:00 2001
From: chinu0609
Date: Tue, 4 Nov 2025 22:04:32 +0530
Subject: [PATCH 07/25] feat: adding last_accessed in the Data model

---
 .../e1ec1dcb50b6_add_last_accessed_to_data.py |  30 ++++++
 cognee/modules/data/models/Data.py            |   1 +
 .../retrieval/utils/access_tracking.py        | 102 ++++++++++++------
 3 files changed, 100 insertions(+), 33 deletions(-)
 create mode 100644 alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py

diff --git
a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py new file mode 100644 index 000000000..0ccefa63b --- /dev/null +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -0,0 +1,30 @@ +"""add_last_accessed_to_data + +Revision ID: e1ec1dcb50b6 +Revises: 211ab850ef3d +Create Date: 2025-11-04 21:45:52.642322 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'e1ec1dcb50b6' +down_revision: Union[str, None] = '211ab850ef3d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + # Optionally initialize with created_at values for existing records + op.execute("UPDATE data SET last_accessed = created_at") + + +def downgrade() -> None: + op.drop_column('data', 'last_accessed') diff --git a/cognee/modules/data/models/Data.py b/cognee/modules/data/models/Data.py index ef228f2e1..27ab7481e 100644 --- a/cognee/modules/data/models/Data.py +++ b/cognee/modules/data/models/Data.py @@ -36,6 +36,7 @@ class Data(Base): data_size = Column(Integer, nullable=True) # File size in bytes created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) + last_accessed = Column(DateTime(timezone=True), nullable=True) datasets = relationship( "Dataset", diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 79afd25db..621e09e27 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -1,20 +1,27 @@ - """Utilities for tracking data access in retrievers.""" import json from datetime import datetime, timezone from typing import List, Any +from uuid import UUID from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger +from sqlalchemy import update logger = get_logger(__name__) async def update_node_access_timestamps(items: List[Any]): """ - Update last_accessed_at for nodes in Kuzu graph database. - Automatically determines node type from the graph database. + Update last_accessed_at for nodes in graph database and corresponding Data records in SQL. + + This function: + 1. Updates last_accessed_at in the graph database nodes (in properties JSON) + 2. Traverses to find origin TextDocument nodes + 3. 
Updates last_accessed in the SQL Data table for those documents Parameters ---------- @@ -26,39 +33,68 @@ async def update_node_access_timestamps(items: List[Any]): graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) + timestamp_dt = datetime.now(timezone.utc) + # Extract node IDs + node_ids = [] for item in items: - # Extract ID from payload item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") - if not item_id: - continue - - # try: - # Query to get both node type and properties in one call - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.type as node_type, n.properties as props", - {"id": str(item_id)} - ) - - if result and len(result) > 0 and result[0]: - node_type = result[0][0] # First column: node_type - props_json = result[0][1] # Second column: properties - - # Parse existing properties JSON - props = json.loads(props_json) if props_json else {} - # Update last_accessed_at with millisecond timestamp - props["last_accessed_at"] = timestamp_ms - - # Write back to graph database - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": str(item_id), "props": json.dumps(props)} + if item_id: + node_ids.append(str(item_id)) + + if not node_ids: + return + + try: + # Step 1: Batch update graph nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} ) - logger.debug(f"Updated access timestamp for {node_type} node {item_id}") + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms - # except Exception as e: - # logger.error(f"Failed to update timestamp for node {item_id}: {e}") - # continue - - logger.debug(f"Updated access timestamps for {len(items)} nodes") + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + + logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") + + # Step 2: Find origin TextDocument nodes + origin_query = """ + UNWIND $node_ids AS node_id + MATCH (n:Node {id: node_id}) + OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) + WHERE (e.relationship_name = 'contains' OR e.relationship_name = 'made_from') + AND chunk.type = 'DocumentChunk' + OPTIONAL MATCH (chunk)-[e2:EDGE]->(doc:Node) + WHERE e2.relationship_name = 'is_part_of' + AND doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] + RETURN DISTINCT doc.id as doc_id + """ + + result = await graph_engine.query(origin_query, {"node_ids": node_ids}) + + # Extract document IDs + doc_ids = [row[0] for row in result if row and row[0]] if result else [] + + # Step 3: Update SQL Data table + if doc_ids: + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + stmt = update(Data).where( + Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) + ).values(last_accessed=timestamp_dt) + + await session.execute(stmt) + await session.commit() + + logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL") + + except Exception as e: + logger.error(f"Failed to update timestamps: {e}") + raise From 3c0e915812a4ffb8662419647572c6229ed963a9 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 12:25:51 +0530 Subject: [PATCH 08/25] fix: removing hard relations --- .../modules/retrieval/utils/access_tracking.py | 16 +++++++--------- 1 file changed, 7 
insertions(+), 9 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 621e09e27..36c0b7f50 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -20,7 +20,7 @@ async def update_node_access_timestamps(items: List[Any]): This function: 1. Updates last_accessed_at in the graph database nodes (in properties JSON) - 2. Traverses to find origin TextDocument nodes + 2. Traverses to find origin TextDocument nodes (without hardcoded relationship names) 3. Updates last_accessed in the SQL Data table for those documents Parameters @@ -64,23 +64,21 @@ async def update_node_access_timestamps(items: List[Any]): logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") - # Step 2: Find origin TextDocument nodes + # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) origin_query = """ UNWIND $node_ids AS node_id MATCH (n:Node {id: node_id}) OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) - WHERE (e.relationship_name = 'contains' OR e.relationship_name = 'made_from') - AND chunk.type = 'DocumentChunk' - OPTIONAL MATCH (chunk)-[e2:EDGE]->(doc:Node) - WHERE e2.relationship_name = 'is_part_of' - AND doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] + WHERE chunk.type = 'DocumentChunk' + OPTIONAL MATCH (chunk)-[e2:EDGE]-(doc:Node) + WHERE doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] RETURN DISTINCT doc.id as doc_id """ result = await graph_engine.query(origin_query, {"node_ids": node_ids}) - # Extract document IDs - doc_ids = [row[0] for row in result if row and row[0]] if result else [] + # Extract and deduplicate document IDs + doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else [] # Step 3: Update SQL Data table if doc_ids: From 9041a804ecc2d0be1903c2de0ac875f32fcc553c Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 18:32:49 +0530 Subject: [PATCH 09/25] fix: add text_doc flag --- cognee/tasks/cleanup/cleanup_unused_data.py | 520 ++++++++++++-------- 1 file changed, 312 insertions(+), 208 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index e97692bb4..c9c711fe2 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,232 +1,336 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID - -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.shared.logging_utils import get_logger - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. 
+""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: int = 30, + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused (default: 30) + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes directly from graph + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: int = 30, - dry_run: bool = True, +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. 
+ SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused (default: 30) + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) - user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - - Returns - ------- - Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None - ) - - # Calculate cutoff timestamp in milliseconds - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") - - # Find unused nodes - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) - - return { - "status": "completed", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None -) -> Dict[str, list]: - """ - Query Kuzu for nodes with old last_accessed_at timestamps. 
- - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + If True, only report what would be deleted user_id : UUID, optional Filter by user ID if provided Returns ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs + Dict[str, Any] + Cleanup results """ - graph_engine = await get_graph_engine() + db_engine = get_relational_engine() - # Query all nodes with their properties - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = { - "DocumentChunk": [], - "Entity": [], - "TextSummary": [] - } - - for node_id, node_type, props_json in results: - # Only process tracked node types - if node_type not in unused_nodes: - continue - - # Parse properties JSON - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - # Check if node is unused (never accessed or accessed before cutoff) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - # TODO: Add user_id filtering when user ownership is implemented - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) ) - if result and len(result) > 0: - deleted_counts["associations"] += result[0][0] + ) + + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + + result = await session.execute(query) + unused_data = result.all() - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) + logger.info(f"Found {len(unused_data)} unused documents in SQL") - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } + if dry_run: + return { + "status": "dry_run", + "unused_count": len(unused_data), + 
"deleted_count": { + "data_items": 0, + "documents": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: try: - # Delete from vector collection - if await vector_engine.has_collection(collection_name): - for node_id in node_ids: - try: - await vector_engine.delete(collection_name, {"id": str(node_id)}) - except Exception as e: - logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") + logger.error(f"Failed to delete document {data.id}: {e}") + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None +) -> Dict[str, list]: + """ + Query Kuzu for nodes with old last_accessed_at timestamps. + + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Query all nodes with their properties + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = { + "DocumentChunk": [], + "Entity": [], + "TextSummary": [] + } + + for node_id, node_type, props_json in results: + # Only process tracked node types + if node_type not in unused_nodes: + continue + + # Parse properties JSON + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + # Check if node is unused (never accessed or accessed before cutoff) + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + if result and len(result) > 0: + deleted_counts["associations"] += result[0][0] + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + # Delete from vector collection + if await vector_engine.has_collection(collection_name): + for node_id in node_ids: + try: + await vector_engine.delete(collection_name, {"id": str(node_id)}) + except Exception as e: + logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From ff263c0132b170b3c03961606db56c2a174d2b90 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 18:40:58 +0530 Subject: [PATCH 10/25] fix: add column check in migration --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index 0ccefa63b..267e11fb2 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -17,14 +17,30 @@ down_revision: Union[str, None] = '211ab850ef3d' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + def upgrade() -> None: - op.add_column('data', - sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) - ) - # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = created_at") + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if not last_accessed_column: + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + # Optionally initialize with created_at values for existing records + op.execute("UPDATE data SET last_accessed = created_at") def 
downgrade() -> None: - op.drop_column('data', 'last_accessed') + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if last_accessed_column: + op.drop_column('data', 'last_accessed') From c5f0c4af87ff13bf8e3cbe0f4e9163ece44c3094 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 20:22:17 +0530 Subject: [PATCH 11/25] fix: add text_doc flag --- cognee/modules/retrieval/utils/access_tracking.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 36c0b7f50..65d597a93 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -67,12 +67,9 @@ async def update_node_access_timestamps(items: List[Any]): # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) origin_query = """ UNWIND $node_ids AS node_id - MATCH (n:Node {id: node_id}) - OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) - WHERE chunk.type = 'DocumentChunk' - OPTIONAL MATCH (chunk)-[e2:EDGE]-(doc:Node) - WHERE doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] - RETURN DISTINCT doc.id as doc_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id """ result = await graph_engine.query(origin_query, {"node_ids": node_ids}) From fdf037b3d0117bd29f0c541ed027895c070678df Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Thu, 6 Nov 2025 23:00:56 +0530 Subject: [PATCH 12/25] fix: min to days --- cognee/tasks/cleanup/cleanup_unused_data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index c9c711fe2..4df622a2c 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -23,7 +23,7 @@ logger = get_logger(__name__) async def cleanup_unused_data( - minutes_threshold: int = 30, + days_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None, text_doc: bool = False @@ -33,8 +33,8 @@ async def cleanup_unused_data( Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused (default: 30) + days_threshold : int + days since last access to consider data unused dry_run : bool If True, only report what would be deleted without actually deleting (default: True) user_id : UUID, optional @@ -50,14 +50,14 @@ async def cleanup_unused_data( """ logger.info( "Starting cleanup task", - minutes_threshold=minutes_threshold, + days_threshold=days_threshold, dry_run=dry_run, user_id=str(user_id) if user_id else None, text_doc=text_doc ) # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold) if text_doc: # SQL-based approach: Find unused TextDocuments and use cognee.delete() From 84c8e07ddd980af7c11b89c7e510b38e5c44f119 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 7 Nov 2025 12:03:17 +0530 Subject: [PATCH 13/25] fix: remove uneccessary imports --- cognee/modules/chunking/models/DocumentChunk.py | 2 -- cognee/modules/engine/models/Entity.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/cognee/modules/chunking/models/DocumentChunk.py 
b/cognee/modules/chunking/models/DocumentChunk.py index a9fb08a9e..e2b216a9b 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,7 +1,5 @@ from typing import List, Union -from pydantic import BaseModel, Field -from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine.models.Edge import Edge from cognee.modules.data.processing.document_types import Document diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 4083cd2e6..a34a6503c 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,8 +1,6 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.models.EntityType import EntityType from typing import Optional -from datetime import datetime, timezone -from pydantic import BaseModel, Field class Entity(DataPoint): name: str From 84bd2f38f7513c244ed1040937a1e5a5297cec2e Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 7 Nov 2025 12:12:46 +0530 Subject: [PATCH 14/25] fix: remove uneccessary imports --- cognee/tasks/summarization/models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 8cee2ade3..8420cfaa5 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,7 +1,5 @@ -from pydantic import BaseModel, Field from typing import Union -from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import DocumentChunk from cognee.shared.CodeGraphEntities import CodeFile, CodePart From d351c9a009d12a8a8a4869afa7aee38c61482e21 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 10 Nov 2025 21:58:01 +0530 Subject: [PATCH 15/25] fix: return chunk payload --- cognee/modules/retrieval/chunks_retriever.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index be1f95811..b7a90238a 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -57,6 +57,7 @@ class ChunksRetriever(BaseRetriever): chunk_payloads = [result.payload for result in found_chunks] logger.info(f"Returning {len(chunk_payloads)} chunk payloads") + return chunk_payloads async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None From 7bd7079aac9fcb003bcc20e118bc65d066e9029c Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 18 Nov 2025 22:17:23 +0530 Subject: [PATCH 16/25] fix: vecto_engine.delte_data_points --- cognee/tasks/cleanup/cleanup_unused_data.py | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 4df622a2c..fd4b68204 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -315,22 +315,21 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: "TextSummary": "TextSummary_text" } - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - # Delete from vector collection - if await 
vector_engine.has_collection(collection_name): - for node_id in node_ids: - try: - await vector_engine.delete(collection_name, {"id": str(node_id)}) - except Exception as e: - logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") return deleted_counts From 5fac3b40b94e4c81a7d9828ca9d2d84ab5e82bc1 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 18 Nov 2025 22:26:59 +0530 Subject: [PATCH 17/25] fix: test file for cleanup unused data --- cognee/tests/test_cleanup_unused_data.py | 244 +++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 cognee/tests/test_cleanup_unused_data.py diff --git a/cognee/tests/test_cleanup_unused_data.py b/cognee/tests/test_cleanup_unused_data.py new file mode 100644 index 000000000..c21b9f5ea --- /dev/null +++ b/cognee/tests/test_cleanup_unused_data.py @@ -0,0 +1,244 @@ +import os +import pathlib +import cognee +from datetime import datetime, timezone, timedelta +from uuid import UUID +from sqlalchemy import select, update +from cognee.modules.data.models import Data, DatasetData +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType + +logger = get_logger() + + +async def test_textdocument_cleanup_with_sql(): + """ + End-to-end test for TextDocument cleanup based on last_accessed timestamps. + + Tests: + 1. Add and cognify a document + 2. Perform search to populate last_accessed timestamp + 3. Verify last_accessed is set in SQL Data table + 4. Manually age the timestamp beyond cleanup threshold + 5. Run cleanup with text_doc=True + 6. Verify document was deleted from all databases (relational, graph, and vector) + """ + # Setup test directories + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") + ).resolve() + ) + + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) + + # Initialize database + from cognee.modules.engine.operations.setup import setup + + # Clean slate + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") + + # Step 1: Add and cognify a test document + dataset_name = "test_cleanup_dataset" + test_text = """ + Machine learning is a subset of artificial intelligence that enables systems to learn + and improve from experience without being explicitly programmed. Deep learning uses + neural networks with multiple layers to process data. 
+ """ + + await setup() + user = await get_default_user() + await cognee.add([test_text], dataset_name=dataset_name, user=user) + + cognify_result = await cognee.cognify([dataset_name], user=user) + + # Extract dataset_id from cognify result (ds_id is already a UUID) + dataset_id = None + for ds_id, pipeline_result in cognify_result.items(): + dataset_id = ds_id # Don't wrap in UUID() - it's already a UUID object + break + + assert dataset_id is not None, "Failed to get dataset_id from cognify result" + logger.info(f"โœ… Document added and cognified. Dataset ID: {dataset_id}") + + # Step 2: Perform search to trigger last_accessed update + logger.info("Triggering search to update last_accessed...") + search_results = await cognee.search( + query_type=SearchType.CHUNKS, + query_text="machine learning", + datasets=[dataset_name], + user=user + ) + logger.info(f"โœ… Search completed, found {len(search_results)} results") + + # Step 3: Verify last_accessed was set in SQL Data table + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Get the Data record for this dataset + result = await session.execute( + select(Data, DatasetData) + .join(DatasetData, Data.id == DatasetData.data_id) + .where(DatasetData.dataset_id == dataset_id) + ) + data_records = result.all() + assert len(data_records) > 0, "No Data records found for the dataset" + data_record = data_records[0][0] + data_id = data_record.id + + # Verify last_accessed is set (should be set by search operation) + assert data_record.last_accessed is not None, ( + "last_accessed should be set after search operation" + ) + + original_last_accessed = data_record.last_accessed + logger.info(f"โœ… last_accessed verified: {original_last_accessed}") + + # Step 4: Manually age the timestamp to be older than cleanup threshold + days_threshold = 30 + aged_timestamp = datetime.now(timezone.utc) - timedelta(days=days_threshold + 10) + + async with db_engine.get_async_session() as session: + stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) + await session.execute(stmt) + await session.commit() + + # Query in a NEW session to avoid cached values + async with db_engine.get_async_session() as session: + result = await session.execute(select(Data).where(Data.id == data_id)) + updated_data = result.scalar_one_or_none() + + # Make both timezone-aware for comparison + retrieved_timestamp = updated_data.last_accessed + if retrieved_timestamp.tzinfo is None: + # If database returned naive datetime, make it UTC-aware + retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + + assert retrieved_timestamp == aged_timestamp, ( + f"Timestamp should be updated to aged value. 
" + f"Expected: {aged_timestamp}, Got: {retrieved_timestamp}" + ) + + # Step 5: Test cleanup with text_doc=True + from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data + + # First do a dry run + logger.info("Testing dry run with text_doc=True...") + dry_run_result = await cleanup_unused_data( + days_threshold=30, + dry_run=True, + user_id=user.id, + text_doc=True + ) + + assert dry_run_result['status'] == 'dry_run', "Status should be 'dry_run'" + assert dry_run_result['unused_count'] > 0, ( + "Should find at least one unused document" + ) + logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") + + # Now run actual cleanup + logger.info("Executing cleanup with text_doc=True...") + cleanup_result = await cleanup_unused_data( + days_threshold=30, + dry_run=False, + user_id=user.id, + text_doc=True + ) + + assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" + assert cleanup_result["deleted_count"]["documents"] > 0, ( + "At least one document should be deleted" + ) + logger.info(f"โœ… Cleanup completed. Deleted {cleanup_result['deleted_count']['documents']} documents") + + # Step 6: Verify the document was actually deleted from SQL + async with db_engine.get_async_session() as session: + deleted_data = ( + await session.execute(select(Data).where(Data.id == data_id)) + ).scalar_one_or_none() + + assert deleted_data is None, ( + "Data record should be deleted after cleanup" + ) + logger.info("โœ… Confirmed: Data record was deleted from SQL database") + + # Verify the dataset-data link was also removed + async with db_engine.get_async_session() as session: + dataset_data_link = ( + await session.execute( + select(DatasetData).where( + DatasetData.data_id == data_id, + DatasetData.dataset_id == dataset_id + ) + ) + ).scalar_one_or_none() + + assert dataset_data_link is None, ( + "DatasetData link should be deleted after cleanup" + ) + logger.info("โœ… Confirmed: DatasetData link was deleted") + + # Verify graph nodes were cleaned up + from cognee.infrastructure.databases.graph import get_graph_engine + + graph_engine = await get_graph_engine() + + # Try to find the TextDocument node - it should not exist + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n", + {"id": str(data_id)} + ) + + assert len(result) == 0, ( + "TextDocument node should be deleted from graph database" + ) + logger.info("โœ… Confirmed: TextDocument node was deleted from graph database") + + # Verify vector database was cleaned up + from cognee.infrastructure.databases.vector import get_vector_engine + + vector_engine = get_vector_engine() + + # Check each collection that should have been cleaned up + vector_collections = [ + "DocumentChunk_text", + "Entity_name", + "TextSummary_text" + ] + + for collection_name in vector_collections: + if await vector_engine.has_collection(collection_name): + # Try to retrieve the deleted data points + try: + results = await vector_engine.retrieve(collection_name, [str(data_id)]) + assert len(results) == 0, ( + f"Data points should be deleted from {collection_name} collection" + ) + logger.info(f"โœ… Confirmed: {collection_name} collection is clean") + except Exception as e: + # Collection might be empty or not exist, which is fine + logger.info(f"โœ… Confirmed: {collection_name} collection is empty or doesn't exist") + pass + + logger.info("โœ… Confirmed: Vector database entries were deleted") + + logger.info("๐ŸŽ‰ All cleanup tests passed!") + + return True + + +if __name__ == 
"__main__": + import asyncio + success = asyncio.run(test_textdocument_cleanup_with_sql()) + exit(0 if success else 1) From 43290af1b23d24d6ab8b5d57c243abe1cee8787e Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 19 Nov 2025 21:00:16 +0530 Subject: [PATCH 18/25] fix: set last_acessed to current timestamp --- alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index 267e11fb2..a16c99e9f 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -34,7 +34,7 @@ def upgrade() -> None: sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) ) # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = created_at") + op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") def downgrade() -> None: From b52c1a1e25e6edffe112462836ab315b36bec567 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 24 Nov 2025 12:50:39 +0530 Subject: [PATCH 19/25] fix: flag to enable and disable last_accessed --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 88 ++++++++++--------- .../retrieval/utils/access_tracking.py | 7 +- cognee/tasks/cleanup/cleanup_unused_data.py | 40 ++++++++- 3 files changed, 90 insertions(+), 45 deletions(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index a16c99e9f..f1a36ae59 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -1,46 +1,52 @@ -"""add_last_accessed_to_data - -Revision ID: e1ec1dcb50b6 -Revises: 211ab850ef3d -Create Date: 2025-11-04 21:45:52.642322 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = 'e1ec1dcb50b6' -down_revision: Union[str, None] = '211ab850ef3d' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - -def _get_column(inspector, table, name, schema=None): - for col in inspector.get_columns(table, schema=schema): - if col["name"] == name: - return col - return None +"""add_last_accessed_to_data + +Revision ID: e1ec1dcb50b6 +Revises: 211ab850ef3d +Create Date: 2025-11-04 21:45:52.642322 + +""" +import os +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa -def upgrade() -> None: - conn = op.get_bind() - insp = sa.inspect(conn) - - last_accessed_column = _get_column(insp, "data", "last_accessed") - if not last_accessed_column: - op.add_column('data', - sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) - ) - # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") +# revision identifiers, used by Alembic. 
+revision: str = 'e1ec1dcb50b6' +down_revision: Union[str, None] = '211ab850ef3d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None -def downgrade() -> None: - conn = op.get_bind() - insp = sa.inspect(conn) - - last_accessed_column = _get_column(insp, "data", "last_accessed") - if last_accessed_column: +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if not last_accessed_column: + # Always create the column for schema consistency + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + + # Only initialize existing records if feature is enabled + enable_last_accessed = os.getenv("ENABLE_LAST_ACCESSED", "false").lower() == "true" + if enable_last_accessed: + op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") + + +def downgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if last_accessed_column: op.drop_column('data', 'last_accessed') diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 65d597a93..6df0284ec 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID - +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -27,7 +27,10 @@ async def update_node_access_timestamps(items: List[Any]): ---------- items : List[Any] List of items with payload containing 'id' field (from vector search results) - """ + """ + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + return + if not items: return diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index fd4b68204..175452a0a 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -10,7 +10,7 @@ import json from datetime import datetime, timezone, timedelta from typing import Optional, Dict, Any from uuid import UUID - +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.relational import get_relational_engine @@ -47,7 +47,43 @@ async def cleanup_unused_data( ------- Dict[str, Any] Cleanup results with status, counts, and timestamp - """ + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
+ ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + logger.info( "Starting cleanup task", days_threshold=days_threshold, From 5cb6510205742e7a5abf2afe23d2527b229931d0 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 24 Nov 2025 13:12:46 +0530 Subject: [PATCH 20/25] fix: import --- cognee/tasks/cleanup/cleanup_unused_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 175452a0a..a90d96b5c 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -18,6 +18,7 @@ from cognee.modules.data.models import Data, DatasetData from cognee.shared.logging_utils import get_logger from sqlalchemy import select, or_ import cognee +import sqlalchemy as sa logger = get_logger(__name__) From 12ce80005ceccafac38a63da458e6df376776b61 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 26 Nov 2025 17:32:50 +0530 Subject: [PATCH 21/25] fix: generalized queries --- .../retrieval/utils/access_tracking.py | 147 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 778 ++++++++++-------- 2 files changed, 516 insertions(+), 409 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 6df0284ec..12a66f8bc 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -13,24 +13,10 @@ from sqlalchemy import update logger = get_logger(__name__) - async def update_node_access_timestamps(items: List[Any]): - """ - Update last_accessed_at for nodes in graph database and corresponding Data records in SQL. - - This function: - 1. Updates last_accessed_at in the graph database nodes (in properties JSON) - 2. Traverses to find origin TextDocument nodes (without hardcoded relationship names) - 3. 
Updates last_accessed in the SQL Data table for those documents - - Parameters - ---------- - items : List[Any] - List of items with payload containing 'id' field (from vector search results) - """ if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return @@ -49,50 +35,95 @@ async def update_node_access_timestamps(items: List[Any]): return try: - # Step 1: Batch update graph nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) + # Detect database provider and use appropriate queries + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) + elif provider == "neo4j": + await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) + elif provider == "neptune": + await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) + else: + logger.warning(f"Unsupported graph provider: {provider}") + return - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - - logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") - - # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) - origin_query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - - result = await graph_engine.query(origin_query, {"node_ids": node_ids}) - - # Extract and deduplicate document IDs - doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else [] - - # Step 3: Update SQL Data table + # Find origin documents and update SQL + doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) if doc_ids: - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - stmt = update(Data).where( - Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) - ).values(last_accessed=timestamp_dt) - - await session.execute(stmt) - await session.commit() - - logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL") - + await _update_sql_records(doc_ids, timestamp_dt) + except Exception as e: logger.error(f"Failed to update timestamps: {e}") - raise + raise + +async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): + """Kuzu-specific node updates""" + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + +async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): + """Neo4j-specific node updates""" + for node_id in node_ids: + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): + """Neptune-specific node updates""" + for node_id in node_ids: + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET 
n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents(graph_engine, node_ids, provider): + """Find origin documents with provider-specific queries""" + if provider == "kuzu": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + elif provider == "neo4j": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + elif provider == "neptune": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + + result = await graph_engine.query(query, {"node_ids": node_ids}) + return list(set([row[0] for row in result if row and row[0]])) if result else [] + +async def _update_sql_records(doc_ids, timestamp_dt): + """Update SQL Data table (same for all providers)""" + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + stmt = update(Data).where( + Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) + ).values(last_accessed=timestamp_dt) + + await session.execute(stmt) + await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index a90d96b5c..b89c939a8 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,372 +1,448 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries)) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. 
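+
+Illustrative usage (a sketch only; it assumes the signature defined below in this
+module and that the ENABLE_LAST_ACCESSED environment variable is set to "true"):
+
+    # Preview what would be removed after 60 idle minutes, without deleting anything.
+    report = await cleanup_unused_data(minutes_threshold=60, dry_run=True)
+    # report["status"] is one of "skipped", "dry_run", or "completed".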
+""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + days since last access to consider data unused + dry_run : bool + If True, only report what would be delete without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." + ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." 
+ ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes directly from graph + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Detect database provider and find unused nodes + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes with provider-specific logic + deleted_counts = await _delete_unused_nodes(unused_nodes, provider) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - days_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). Parameters ---------- - days_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) - user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) - - Returns - ------- - Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
- ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." - ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - days_threshold=days_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) - - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") - - # Find unused nodes - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) - - return { - "status": "completed", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
- - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) - ) - ) - - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) - - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: - try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") - except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None -) -> Dict[str, list]: - """ - Query Kuzu for nodes with old last_accessed_at timestamps. 
- - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + If True, only report what would be deleted user_id : UUID, optional Filter by user ID if provided Returns ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs + Dict[str, Any] + Cleanup results """ - graph_engine = await get_graph_engine() + db_engine = get_relational_engine() - # Query all nodes with their properties - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = { - "DocumentChunk": [], - "Entity": [], - "TextSummary": [] - } - - for node_id, node_type, props_json in results: - # Only process tracked node types - if node_type not in unused_nodes: - continue - - # Parse properties JSON - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - # Check if node is unused (never accessed or accessed before cutoff) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) ) - if result and len(result) > 0: - deleted_counts["associations"] += result[0][0] - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue + ) - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) + result = await session.execute(query) + unused_data = result.all() - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: + return { + "status": "dry_run", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": 0, + "documents": 0 + }, + "cleanup_date": 
datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None, + provider: str = "kuzu" +) -> Dict[str, list]: + """ + Find unused nodes with provider-specific queries. + + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + provider : str + Graph database provider (kuzu, neo4j, neptune) + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + if provider == "kuzu": + return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) + elif provider == "neo4j": + return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) + elif provider == "neptune": + return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) + else: + logger.warning(f"Unsupported graph provider: {provider}") + return {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + +async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): + """Kuzu-specific unused node detection""" + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for node_id, node_type, props_json in results: + if node_type not in unused_nodes: + continue + + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): + """Neo4j-specific unused node detection""" + query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" + results = await 
graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for row in results: + node_id = row["n"]["id"] + node_type = row["n"]["type"] + last_accessed = row["n"].get("last_accessed_at") + + if node_type not in unused_nodes: + continue + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + + return unused_nodes + + +async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): + """Neptune-specific unused node detection""" + query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for row in results: + node_id = row["n"]["id"] + node_type = row["n"]["type"] + last_accessed = row["n"].get("last_accessed_at") + + if node_type not in unused_nodes: + continue + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. + + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + provider : str + Graph database provider (kuzu, neo4j, neptune) + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + if provider == "kuzu": + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + elif provider == "neo4j": + result = await graph_engine.query( + "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + elif provider == "neptune": + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + + if result and len(result) > 0: + count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] + deleted_counts["associations"] += count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for 
node_id in node_ids] + ) + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 6a4d31356bb613e5cf74e7972445f804796ee6d4 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 18:55:47 +0530 Subject: [PATCH 22/25] fix: using graph projection instead of conditions --- .../retrieval/utils/access_tracking.py | 156 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 759 ++++++++---------- 2 files changed, 418 insertions(+), 497 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 12a66f8bc..935c47157 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,118 +4,116 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger from sqlalchemy import update +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Detect database provider and use appropriate queries - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - if provider == "kuzu": - await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neo4j": - await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neptune": - await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return + try: + # Update nodes using graph projection ( database-agnostic approach + await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) # Find origin documents and update SQL - doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) + doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - + except Exception as e: logger.error(f"Failed to update timestamps: {e}") raise -async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): - """Kuzu-specific node updates""" - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - -async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): - """Neo4j-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": 
node_id, "timestamp": timestamp_ms} - ) - -async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): - """Neptune-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - -async def _find_origin_documents(graph_engine, node_ids, provider): - """Find origin documents with provider-specific queries""" - if provider == "kuzu": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neo4j": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neptune": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ +async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): + """Update nodes using graph projection - works with any graph database""" + # Project the graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] + ) - result = await graph_engine.query(query, {"node_ids": node_ids}) - return list(set([row[0] for row in result if row and row[0]])) if result else [] + # Update each node's last_accessed_at property + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Update the node in the database + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents_via_projection(graph_engine, node_ids): + """Find origin documents using graph projection instead of DB queries""" + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type"], + edge_properties_to_project=["relationship_name"] + ) + + # Find origin documents by traversing the in-memory graph + doc_ids = set() + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node and node.get_attribute("type") == "DocumentChunk": + # Traverse edges to find connected documents + for edge in node.get_skeleton_edges(): + # Get the neighbor node + neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else 
edge.get_source_node() + if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: + doc_ids.add(neighbor.id) + + return list(doc_ids) async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" @@ -124,6 +122,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index b89c939a8..c70b97a00 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,448 +1,371 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries)) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries)) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + days since last access to consider data unused + dry_run : bool + If True, only report what would be delete without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
+ ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes using projection (database-agnostic) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes using graph projection + unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes (provider-agnostic deletion) + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
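
For orientation, a minimal usage sketch of the cleanup task defined above, assuming ENABLE_LAST_ACCESSED is exported, that retrievers have already populated last-accessed timestamps, and that the 30-minute threshold is purely illustrative:

import asyncio
import os

from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data


async def run_cleanup_example():
    # Tracking must be enabled, otherwise the task returns {"status": "skipped"}.
    os.environ["ENABLE_LAST_ACCESSED"] = "true"

    # Dry run: report what would be removed without touching any database.
    report = await cleanup_unused_data(minutes_threshold=30, dry_run=True)
    print(report["status"], report["unused_count"], report.get("preview"))

    # Destructive run: only proceed once the dry-run numbers look reasonable.
    if report["unused_count"] > 0:
        result = await cleanup_unused_data(minutes_threshold=30, dry_run=False)
        print(result["deleted_count"])


if __name__ == "__main__":
    asyncio.run(run_cleanup_example())
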
Parameters ---------- - minutes_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." - ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." - ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) + ) + ) - # Detect database provider and find unused nodes - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": 
len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes with provider-specific logic - deleted_counts = await _delete_unused_nodes(unused_nodes, provider) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() +async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: + """ + Find unused nodes using graph projection - database-agnostic approach. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type", "last_accessed_at"], + edge_properties_to_project=[] + ) - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + # Get all nodes from the projected graph + all_nodes = memory_fragment.get_nodes() + + for node in all_nodes: + node_type = node.get_attribute("type") + if node_type not in unused_nodes: + continue + + # Check last_accessed_at property + last_accessed = node.get_attribute("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node.id) + logger.debug( + f"Found unused {node_type}", + node_id=node.id, + last_accessed=last_accessed ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. + + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion (using graph projection for consistency) + if any(unused_nodes.values()): + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] ) - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Count edges from the in-memory graph + edge_count = len(node.get_skeleton_edges()) + deleted_counts["associations"] += edge_count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches (database-agnostic) + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using 
cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None, - provider: str = "kuzu" -) -> Dict[str, list]: - """ - Find unused nodes with provider-specific queries. - - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch - user_id : UUID, optional - Filter by user ID if provided - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() - - if provider == "kuzu": - return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) - elif provider == "neo4j": - return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) - elif provider == "neptune": - return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - -async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): - """Kuzu-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for node_id, node_type, props_json in results: - if node_type not in unused_nodes: - continue - - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): - """Neo4j-specific unused node detection""" - query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": 
[], "TextSummary": []} - - for row in results: - node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): - """Neptune-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for row in results: - node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - if provider == "kuzu": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neo4j": - result = await graph_engine.query( - "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neptune": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - - if result and len(result) > 0: - count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] - deleted_counts["associations"] += count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error 
deleting from vector collection {collection_name}: {e}") - + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 5f00abf3e4f3b913ae67391d487104ea3b9ae872 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 22:25:03 +0530 Subject: [PATCH 23/25] fix: fallback and document deletion --- .../retrieval/utils/access_tracking.py | 73 +++++++++++-------- cognee/tasks/cleanup/cleanup_unused_data.py | 41 +++++++---- 2 files changed, 68 insertions(+), 46 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 935c47157..c7b06ee17 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -36,16 +36,22 @@ async def update_node_access_timestamps(items: List[Any]): return try: - # Update nodes using graph projection ( database-agnostic approach + # Try to update nodes in graph database (may fail for unsupported DBs) await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) + except Exception as e: + logger.warning( + f"Failed to update node timestamps in graph database: {e}. " + "Will update document-level timestamps in SQL instead." + ) - # Find origin documents and update SQL + # Always try to find origin documents and update SQL + # This ensures document-level tracking works even if graph updates fail + try: doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - except Exception as e: - logger.error(f"Failed to update timestamps: {e}") + logger.error(f"Failed to update SQL timestamps: {e}") raise async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): @@ -59,37 +65,42 @@ async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): ) # Update each node's last_accessed_at property + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + for node_id in node_ids: node = memory_fragment.get_node(node_id) if node: - # Update the node in the database - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - - if provider == "kuzu": - # Kuzu stores properties as JSON - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} + try: + # Update the node in the database + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} ) - elif provider == "neo4j": - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - elif provider == "neptune": - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET 
n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + except Exception as e: + # Log but continue with other nodes + logger.debug(f"Failed to update node {node_id}: {e}") + continue async def _find_origin_documents_via_projection(graph_engine, node_ids): """Find origin documents using graph projection instead of DB queries""" diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index c70b97a00..3894635dd 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,9 +1,9 @@ """ Task for automatically deleting unused data from the memify pipeline. -This task identifies and removes data (chunks, entities, summaries)) that hasn't +This task identifies and removes entire documents that haven't been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. +efficiency and storage optimization through whole-document removal. """ import json @@ -28,22 +28,26 @@ async def cleanup_unused_data( minutes_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None, - text_doc: bool = False + text_doc: bool = True, # Changed default to True for document-level cleanup + node_level: bool = False # New parameter for explicit node-level cleanup ) -> Dict[str, Any]: """ Identify and remove unused data from the memify pipeline. - + Parameters ---------- minutes_threshold : int - days since last access to consider data unused + Minutes since last access to consider data unused dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted without actually deleting (default: True) user_id : UUID, optional Limit cleanup to specific user's data (default: None) text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion + node_level : bool + If True, perform chaotic node-level deletion of unused chunks, entities, and summaries + (default: False - deprecated in favor of document-level cleanup) Returns ------- @@ -91,17 +95,19 @@ async def cleanup_unused_data( minutes_threshold=minutes_threshold, dry_run=dry_run, user_id=str(user_id) if user_id else None, - text_doc=text_doc + text_doc=text_doc, + node_level=node_level ) # Calculate cutoff timestamp cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes using projection (database-agnostic) + if node_level: + # Deprecated: Node-level approach (chaotic) + logger.warning( + "Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. " + "Consider using document-level cleanup (default) instead." 
+ ) cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") @@ -147,6 +153,9 @@ async def cleanup_unused_data( }, "cleanup_date": datetime.now(timezone.utc).isoformat() } + else: + # Default: Document-level approach (recommended) + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) async def _cleanup_via_sql( @@ -243,6 +252,7 @@ async def _cleanup_via_sql( async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: """ Find unused nodes using graph projection - database-agnostic approach. + NOTE: This function is deprecated as it leads to fragmented knowledge graphs. Parameters ---------- @@ -291,6 +301,7 @@ async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[st async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: """ Delete unused nodes from graph and vector databases. + NOTE: This function is deprecated as it leads to fragmented knowledge graphs. Parameters ---------- @@ -325,7 +336,7 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: if not node_ids: continue - # Count edges connected to these nodes + # Count edges from the in-memory graph for node_id in node_ids: node = memory_fragment.get_node(node_id) if node: From 829a6f0d04bcfec6e9c9f94219a29d6ab5cd909d Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 10 Dec 2025 22:41:01 +0530 Subject: [PATCH 24/25] fix: only document level deletion --- .../retrieval/utils/access_tracking.py | 80 +-- cognee/tasks/cleanup/cleanup_unused_data.py | 521 ++++++------------ cognee/tests/test_cleanup_unused_data.py | 388 ++++++------- 3 files changed, 333 insertions(+), 656 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index c7b06ee17..54fd043b9 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -14,38 +14,28 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) + async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() - timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Try to update nodes in graph database (may fail for unsupported DBs) - await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) - except Exception as e: - logger.warning( - f"Failed to update node timestamps in graph database: {e}. " - "Will update document-level timestamps in SQL instead." 
- ) - - # Always try to find origin documents and update SQL - # This ensures document-level tracking works even if graph updates fail + + # Focus on document-level tracking via projection try: doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: @@ -54,53 +44,6 @@ async def update_node_access_timestamps(items: List[Any]): logger.error(f"Failed to update SQL timestamps: {e}") raise -async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): - """Update nodes using graph projection - works with any graph database""" - # Project the graph with necessary properties - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id"], - edge_properties_to_project=[] - ) - - # Update each node's last_accessed_at property - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - - for node_id in node_ids: - node = memory_fragment.get_node(node_id) - if node: - try: - # Update the node in the database - if provider == "kuzu": - # Kuzu stores properties as JSON - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - elif provider == "neo4j": - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - elif provider == "neptune": - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - except Exception as e: - # Log but continue with other nodes - logger.debug(f"Failed to update node {node_id}: {e}") - continue async def _find_origin_documents_via_projection(graph_engine, node_ids): """Find origin documents using graph projection instead of DB queries""" @@ -111,7 +54,7 @@ async def _find_origin_documents_via_projection(graph_engine, node_ids): node_properties_to_project=["id", "type"], edge_properties_to_project=["relationship_name"] ) - + # Find origin documents by traversing the in-memory graph doc_ids = set() for node_id in node_ids: @@ -123,9 +66,10 @@ async def _find_origin_documents_via_projection(graph_engine, node_ids): neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else edge.get_source_node() if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: doc_ids.add(neighbor.id) - + return list(doc_ids) + async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" db_engine = get_relational_engine() @@ -133,6 +77,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 3894635dd..34cde1b6f 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,382 +1,187 @@ -""" -Task for automatically deleting unused data from the memify pipeline. +""" +Task for automatically deleting unused data from the memify pipeline. 
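
With this revision the unit of tracking and deletion is the whole document, so the authoritative state lives in the relational Data table rather than on individual graph nodes. A read-only sketch for inspecting which documents the task below would treat as stale; it reuses the same or_(last_accessed < cutoff, last_accessed IS NULL) filter as _cleanup_via_sql, assumes the Data model exposes id and last_accessed as used elsewhere in this patch, and uses an illustrative threshold:

from datetime import datetime, timedelta, timezone

from sqlalchemy import or_, select

from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data


async def list_stale_documents(minutes_threshold: int = 30):
    cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold)
    db_engine = get_relational_engine()

    async with db_engine.get_async_session() as session:
        result = await session.execute(
            select(Data.id, Data.last_accessed).where(
                or_(Data.last_accessed < cutoff_date, Data.last_accessed.is_(None))
            )
        )
        # Each row pairs a document id with its last access time (None = never accessed).
        return result.all()
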
+ +This task identifies and removes entire documents that haven't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization through whole-document removal. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." + ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) -This task identifies and removes entire documents that haven't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization through whole-document removal. 
-""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa -from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph - -logger = get_logger(__name__) + # Document-level approach (recommended) + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = True, # Changed default to True for document-level cleanup - node_level: bool = False # New parameter for explicit node-level cleanup +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. - + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). + Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion - node_level : bool - If True, perform chaotic node-level deletion of unused chunks, entities, and summaries - (default: False - deprecated in favor of document-level cleanup) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." - ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." 
- ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc, - node_level=node_level - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if node_level: - # Deprecated: Node-level approach (chaotic) - logger.warning( - "Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. " - "Consider using document-level cleanup (default) instead." - ) - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) + ) + ) - # Find unused nodes using graph projection - unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes (provider-agnostic deletion) - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - else: - # Default: Document-level approach (recommended) - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
- - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) - ) - ) - - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) - - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: - try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") - except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: - """ - Find unused nodes using graph projection - database-agnostic approach. - NOTE: This function is deprecated as it leads to fragmented knowledge graphs. 
+ "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") - # Project the entire graph with necessary properties - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id", "type", "last_accessed_at"], - edge_properties_to_project=[] - ) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - # Get all nodes from the projected graph - all_nodes = memory_fragment.get_nodes() - - for node in all_nodes: - node_type = node.get_attribute("type") - if node_type not in unused_nodes: - continue - - # Check last_accessed_at property - last_accessed = node.get_attribute("last_accessed_at") + logger.info("Cleanup completed", deleted_count=deleted_count) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node.id) - logger.debug( - f"Found unused {node_type}", - node_id=node.id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - NOTE: This function is deprecated as it leads to fragmented knowledge graphs. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion (using graph projection for consistency) - if any(unused_nodes.values()): - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id"], - edge_properties_to_project=[] - ) - - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges from the in-memory graph - for node_id in node_ids: - node = memory_fragment.get_node(node_id) - if node: - # Count edges from the in-memory graph - edge_count = len(node.get_skeleton_edges()) - deleted_counts["associations"] += edge_count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches (database-agnostic) - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - - return deleted_counts + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } diff --git a/cognee/tests/test_cleanup_unused_data.py b/cognee/tests/test_cleanup_unused_data.py index c21b9f5ea..e738dcba0 100644 --- a/cognee/tests/test_cleanup_unused_data.py +++ b/cognee/tests/test_cleanup_unused_data.py @@ -1,244 +1,172 @@ -import os -import pathlib -import cognee -from datetime import datetime, timezone, timedelta -from uuid import UUID -from sqlalchemy import select, update -from cognee.modules.data.models import Data, DatasetData -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.users.methods import get_default_user -from cognee.shared.logging_utils import get_logger -from cognee.modules.search.types import SearchType - -logger = get_logger() - - -async def test_textdocument_cleanup_with_sql(): - """ - End-to-end test for TextDocument cleanup based on last_accessed timestamps. 
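+"""
+Test module for document-level cleanup based on last_accessed timestamps.
+
+Summary of the flow implemented below: enable last-accessed tracking via the
+ENABLE_LAST_ACCESSED environment variable, add and cognify a small document,
+run a CHUNKS search so the Data row's last_accessed column is populated,
+manually age that timestamp past the cleanup threshold, then call
+cleanup_unused_data (dry run first, then a real run) and verify that the
+Data record has been deleted.
+"""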
+import os +import pathlib +import cognee +from datetime import datetime, timezone, timedelta +from uuid import UUID +from sqlalchemy import select, update +from cognee.modules.data.models import Data, DatasetData +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType - Tests: - 1. Add and cognify a document - 2. Perform search to populate last_accessed timestamp - 3. Verify last_accessed is set in SQL Data table - 4. Manually age the timestamp beyond cleanup threshold - 5. Run cleanup with text_doc=True - 6. Verify document was deleted from all databases (relational, graph, and vector) - """ - # Setup test directories - data_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") - ).resolve() - ) - cognee_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") - ).resolve() - ) +logger = get_logger() - cognee.config.data_root_directory(data_directory_path) - cognee.config.system_root_directory(cognee_directory_path) - # Initialize database - from cognee.modules.engine.operations.setup import setup +async def test_textdocument_cleanup_with_sql(): + """ + End-to-end test for TextDocument cleanup based on last_accessed timestamps. + """ + # Enable last accessed tracking BEFORE any cognee operations + os.environ["ENABLE_LAST_ACCESSED"] = "true" - # Clean slate - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") - - # Step 1: Add and cognify a test document - dataset_name = "test_cleanup_dataset" - test_text = """ - Machine learning is a subset of artificial intelligence that enables systems to learn - and improve from experience without being explicitly programmed. Deep learning uses - neural networks with multiple layers to process data. - """ - - await setup() - user = await get_default_user() - await cognee.add([test_text], dataset_name=dataset_name, user=user) - - cognify_result = await cognee.cognify([dataset_name], user=user) - - # Extract dataset_id from cognify result (ds_id is already a UUID) - dataset_id = None - for ds_id, pipeline_result in cognify_result.items(): - dataset_id = ds_id # Don't wrap in UUID() - it's already a UUID object - break - - assert dataset_id is not None, "Failed to get dataset_id from cognify result" - logger.info(f"โœ… Document added and cognified. 
Dataset ID: {dataset_id}") - - # Step 2: Perform search to trigger last_accessed update - logger.info("Triggering search to update last_accessed...") - search_results = await cognee.search( - query_type=SearchType.CHUNKS, - query_text="machine learning", - datasets=[dataset_name], - user=user - ) - logger.info(f"โœ… Search completed, found {len(search_results)} results") - - # Step 3: Verify last_accessed was set in SQL Data table - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Get the Data record for this dataset - result = await session.execute( - select(Data, DatasetData) - .join(DatasetData, Data.id == DatasetData.data_id) - .where(DatasetData.dataset_id == dataset_id) - ) - data_records = result.all() - assert len(data_records) > 0, "No Data records found for the dataset" - data_record = data_records[0][0] - data_id = data_record.id + # Setup test directories + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") + ).resolve() + ) - # Verify last_accessed is set (should be set by search operation) - assert data_record.last_accessed is not None, ( - "last_accessed should be set after search operation" - ) + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) - original_last_accessed = data_record.last_accessed - logger.info(f"โœ… last_accessed verified: {original_last_accessed}") - - # Step 4: Manually age the timestamp to be older than cleanup threshold - days_threshold = 30 - aged_timestamp = datetime.now(timezone.utc) - timedelta(days=days_threshold + 10) - - async with db_engine.get_async_session() as session: - stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) - await session.execute(stmt) - await session.commit() - - # Query in a NEW session to avoid cached values - async with db_engine.get_async_session() as session: - result = await session.execute(select(Data).where(Data.id == data_id)) - updated_data = result.scalar_one_or_none() + # Initialize database + from cognee.modules.engine.operations.setup import setup - # Make both timezone-aware for comparison - retrieved_timestamp = updated_data.last_accessed - if retrieved_timestamp.tzinfo is None: - # If database returned naive datetime, make it UTC-aware - retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + # Clean slate + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) - assert retrieved_timestamp == aged_timestamp, ( - f"Timestamp should be updated to aged value. 
" - f"Expected: {aged_timestamp}, Got: {retrieved_timestamp}" - ) - - # Step 5: Test cleanup with text_doc=True - from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data - - # First do a dry run - logger.info("Testing dry run with text_doc=True...") - dry_run_result = await cleanup_unused_data( - days_threshold=30, - dry_run=True, - user_id=user.id, - text_doc=True - ) - - assert dry_run_result['status'] == 'dry_run', "Status should be 'dry_run'" - assert dry_run_result['unused_count'] > 0, ( - "Should find at least one unused document" - ) - logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") - - # Now run actual cleanup - logger.info("Executing cleanup with text_doc=True...") - cleanup_result = await cleanup_unused_data( - days_threshold=30, - dry_run=False, - user_id=user.id, - text_doc=True - ) - - assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" - assert cleanup_result["deleted_count"]["documents"] > 0, ( - "At least one document should be deleted" - ) - logger.info(f"โœ… Cleanup completed. Deleted {cleanup_result['deleted_count']['documents']} documents") - - # Step 6: Verify the document was actually deleted from SQL - async with db_engine.get_async_session() as session: - deleted_data = ( - await session.execute(select(Data).where(Data.id == data_id)) - ).scalar_one_or_none() + logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") - assert deleted_data is None, ( - "Data record should be deleted after cleanup" - ) - logger.info("โœ… Confirmed: Data record was deleted from SQL database") - - # Verify the dataset-data link was also removed - async with db_engine.get_async_session() as session: - dataset_data_link = ( - await session.execute( - select(DatasetData).where( - DatasetData.data_id == data_id, - DatasetData.dataset_id == dataset_id - ) - ) - ).scalar_one_or_none() + # Step 1: Add and cognify a test document + dataset_name = "test_cleanup_dataset" + test_text = """ + Machine learning is a subset of artificial intelligence that enables systems to learn + and improve from experience without being explicitly programmed. Deep learning uses + neural networks with multiple layers to process data. + """ - assert dataset_data_link is None, ( - "DatasetData link should be deleted after cleanup" - ) - logger.info("โœ… Confirmed: DatasetData link was deleted") + await setup() + user = await get_default_user() + await cognee.add([test_text], dataset_name=dataset_name, user=user) + + cognify_result = await cognee.cognify([dataset_name], user=user) + + # Extract dataset_id from cognify result + dataset_id = None + for ds_id, pipeline_result in cognify_result.items(): + dataset_id = ds_id + break + + assert dataset_id is not None, "Failed to get dataset_id from cognify result" + logger.info(f"โœ… Document added and cognified. 
Dataset ID: {dataset_id}") + + # Step 2: Perform search to trigger last_accessed update + logger.info("Triggering search to update last_accessed...") + search_results = await cognee.search( + query_type=SearchType.CHUNKS, + query_text="machine learning", + datasets=[dataset_name], + user=user + ) + logger.info(f"โœ… Search completed, found {len(search_results)} results") + assert len(search_results) > 0, "Search should return results" + + # Step 3: Verify last_accessed was set and get data_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + result = await session.execute( + select(Data, DatasetData) + .join(DatasetData, Data.id == DatasetData.data_id) + .where(DatasetData.dataset_id == dataset_id) + ) + data_records = result.all() + assert len(data_records) > 0, "No Data records found for the dataset" + data_record = data_records[0][0] + data_id = data_record.id + + # Verify last_accessed is set + assert data_record.last_accessed is not None, ( + "last_accessed should be set after search operation" + ) + + original_last_accessed = data_record.last_accessed + logger.info(f"โœ… last_accessed verified: {original_last_accessed}") + + # Step 4: Manually age the timestamp + minutes_threshold = 30 + aged_timestamp = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold + 10) + + async with db_engine.get_async_session() as session: + stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) + await session.execute(stmt) + await session.commit() + + # Verify timestamp was updated + async with db_engine.get_async_session() as session: + result = await session.execute(select(Data).where(Data.id == data_id)) + updated_data = result.scalar_one_or_none() + assert updated_data is not None, "Data record should exist" + retrieved_timestamp = updated_data.last_accessed + if retrieved_timestamp.tzinfo is None: + retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + assert retrieved_timestamp == aged_timestamp, ( + f"Timestamp should be updated to aged value" + ) + + # Step 5: Test cleanup (document-level is now the default) + from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data + + # First do a dry run + logger.info("Testing dry run...") + dry_run_result = await cleanup_unused_data( + minutes_threshold=10, + dry_run=True, + user_id=user.id + ) + + # Debug: Print the actual result + logger.info(f"Dry run result: {dry_run_result}") - # Verify graph nodes were cleaned up - from cognee.infrastructure.databases.graph import get_graph_engine + assert dry_run_result['status'] == 'dry_run', f"Status should be 'dry_run', got: {dry_run_result['status']}" + assert dry_run_result['unused_count'] > 0, ( + "Should find at least one unused document" + ) + logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") + + # Now run actual cleanup + logger.info("Executing cleanup...") + cleanup_result = await cleanup_unused_data( + minutes_threshold=30, + dry_run=False, + user_id=user.id + ) + + assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" + assert cleanup_result["deleted_count"]["documents"] > 0, ( + "At least one document should be deleted" + ) + logger.info(f"โœ… Cleanup completed. 
Deleted {cleanup_result['deleted_count']['documents']} documents") + + # Step 6: Verify deletion + async with db_engine.get_async_session() as session: + deleted_data = ( + await session.execute(select(Data).where(Data.id == data_id)) + ).scalar_one_or_none() + assert deleted_data is None, "Data record should be deleted" + logger.info("โœ… Confirmed: Data record was deleted") + + logger.info("๐ŸŽ‰ All cleanup tests passed!") + return True - graph_engine = await get_graph_engine() - # Try to find the TextDocument node - it should not exist - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n", - {"id": str(data_id)} - ) - - assert len(result) == 0, ( - "TextDocument node should be deleted from graph database" - ) - logger.info("โœ… Confirmed: TextDocument node was deleted from graph database") - - # Verify vector database was cleaned up - from cognee.infrastructure.databases.vector import get_vector_engine - - vector_engine = get_vector_engine() - - # Check each collection that should have been cleaned up - vector_collections = [ - "DocumentChunk_text", - "Entity_name", - "TextSummary_text" - ] - - for collection_name in vector_collections: - if await vector_engine.has_collection(collection_name): - # Try to retrieve the deleted data points - try: - results = await vector_engine.retrieve(collection_name, [str(data_id)]) - assert len(results) == 0, ( - f"Data points should be deleted from {collection_name} collection" - ) - logger.info(f"โœ… Confirmed: {collection_name} collection is clean") - except Exception as e: - # Collection might be empty or not exist, which is fine - logger.info(f"โœ… Confirmed: {collection_name} collection is empty or doesn't exist") - pass - - logger.info("โœ… Confirmed: Vector database entries were deleted") - - logger.info("๐ŸŽ‰ All cleanup tests passed!") - - return True - - -if __name__ == "__main__": - import asyncio - success = asyncio.run(test_textdocument_cleanup_with_sql()) +if __name__ == "__main__": + import asyncio + success = asyncio.run(test_textdocument_cleanup_with_sql()) exit(0 if success else 1) From 2485c3f5f0c2b25572213fe7638467859679c8d2 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Thu, 11 Dec 2025 12:48:06 +0530 Subject: [PATCH 25/25] fix: only document level deletion --- cognee/infrastructure/engine/models/DataPoint.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 3178713c8..812380eaa 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -43,9 +43,6 @@ class DataPoint(BaseModel): updated_at: int = Field( default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) ) - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) ontology_valid: bool = False version: int = 1 # Default version topological_rank: Optional[int] = 0
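+    # NOTE: last_accessed_at is intentionally no longer tracked on the shared
+    # DataPoint base model; cleanup decisions are made at the document level
+    # via the relational Data.last_accessed column (see cleanup_unused_data
+    # and cognee/tests/test_cleanup_unused_data.py).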