From 3372679f7bb40c01ffd9e337ead27fe9f8981d54 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 29 Oct 2025 20:12:14 +0530 Subject: [PATCH 01/25] feat: adding last_accessed_at field to the models and updating the retrievers to update the timestamp --- .../modules/chunking/models/DocumentChunk.py | 7 +++ cognee/modules/engine/models/Entity.py | 7 ++- cognee/modules/retrieval/chunks_retriever.py | 55 +++++++---------- .../modules/retrieval/summaries_retriever.py | 28 ++++----- .../retrieval/utils/access_tracking.py | 61 +++++++++++++++++++ cognee/tasks/summarization/models.py | 8 ++- 6 files changed, 115 insertions(+), 51 deletions(-) create mode 100644 cognee/modules/retrieval/utils/access_tracking.py diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 9f8c57486..c4c6a2ed3 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,5 +1,7 @@ from typing import List, Union +from pydantic import BaseModel, Field +from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.data.processing.document_types import Document from cognee.modules.engine.models import Entity @@ -22,6 +24,7 @@ class DocumentChunk(DataPoint): - cut_type: The type of cut that defined this chunk. - is_part_of: The document to which this chunk belongs. - contains: A list of entities or events contained within the chunk (default is None). + - last_accessed_at: The timestamp of the last time the chunk was accessed. - metadata: A dictionary to hold meta information related to the chunk, including index fields. """ @@ -32,5 +35,9 @@ class DocumentChunk(DataPoint): cut_type: str is_part_of: Document contains: List[Union[Entity, Event]] = None + + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 36da2e344..3e48ea02a 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,11 +1,14 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.models.EntityType import EntityType from typing import Optional - +from datetime import datetime, timezone +from pydantic import BaseModel, Field class Entity(DataPoint): name: str is_a: Optional[EntityType] = None description: str - + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index 94b9d3fb9..74634b71e 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -1,10 +1,11 @@ from typing import Any, Optional - +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError +from datetime import datetime, timezone logger = get_logger("ChunksRetriever") @@ -27,38 +28,26 @@ class ChunksRetriever(BaseRetriever): ): 
self.top_k = top_k - async def get_context(self, query: str) -> Any: - """ - Retrieves document chunks context based on the query. - - Searches for document chunks relevant to the specified query using a vector engine. - Raises a NoDataError if no data is found in the system. - - Parameters: - ----------- - - - query (str): The query string to search for relevant document chunks. - - Returns: - -------- - - - Any: A list of document chunk payloads retrieved from the search. - """ - logger.info( - f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'" - ) - - vector_engine = get_vector_engine() - - try: - found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) - logger.info(f"Found {len(found_chunks)} chunks from vector search") - except CollectionNotFoundError as error: - logger.error("DocumentChunk_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - chunk_payloads = [result.payload for result in found_chunks] - logger.info(f"Returning {len(chunk_payloads)} chunk payloads") + async def get_context(self, query: str) -> Any: + """Retrieves document chunks context based on the query.""" + logger.info( + f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'" + ) + + vector_engine = get_vector_engine() + + try: + found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) + logger.info(f"Found {len(found_chunks)} chunks from vector search") + + # NEW: Update access timestamps + await update_node_access_timestamps(found_chunks, "DocumentChunk") + except CollectionNotFoundError as error: + logger.error("DocumentChunk_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + chunk_payloads = [result.payload for result in found_chunks] + logger.info(f"Returning {len(chunk_payloads)} chunk payloads") return chunk_payloads async def get_completion( diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 87b224946..7f996274e 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -4,6 +4,7 @@ from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError logger = get_logger("SummariesRetriever") @@ -47,20 +48,19 @@ class SummariesRetriever(BaseRetriever): f"Starting summary retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" ) - vector_engine = get_vector_engine() - - try: - summaries_results = await vector_engine.search( - "TextSummary_text", query, limit=self.top_k - ) - logger.info(f"Found {len(summaries_results)} summaries from vector search") - except CollectionNotFoundError as error: - logger.error("TextSummary_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - summary_payloads = [summary.payload for summary in summaries_results] - logger.info(f"Returning {len(summary_payloads)} summary payloads") - return summary_payloads + vector_engine = get_vector_engine() + + try: + summaries_results = await vector_engine.search( + "TextSummary_text", query, limit=self.top_k + ) + + await update_node_access_timestamps(summaries_results, "TextSummary") + + except CollectionNotFoundError as error: + raise NoDataError("No data found in the system, please add data first.") from error + + return [summary.payload for summary in summaries_results] async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None, **kwargs diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py new file mode 100644 index 000000000..ca5ed88cd --- /dev/null +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -0,0 +1,61 @@ + +"""Utilities for tracking data access in retrievers.""" + +import json +from datetime import datetime, timezone +from typing import List, Any + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.shared.logging_utils import get_logger + +logger = get_logger(__name__) + + +async def update_node_access_timestamps(items: List[Any], node_type: str): + """ + Update last_accessed_at for nodes in Kuzu graph database. 
+ + Parameters + ---------- + items : List[Any] + List of items with payload containing 'id' field (from vector search results) + node_type : str + Type of node to update (e.g., 'DocumentChunk', 'Entity', 'TextSummary') + """ + if not items: + return + + graph_engine = await get_graph_engine() + # Convert to milliseconds since epoch (matching the field format) + timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) + + for item in items: + # Extract ID from payload (vector search results have this structure) + item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") + if not item_id: + continue + + try: + # Get current node properties from Kuzu's Node table + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) WHERE n.type = $node_type RETURN n.properties as props", + {"id": str(item_id), "node_type": node_type} + ) + + if result and len(result) > 0 and result[0][0]: + # Parse existing properties JSON + props = json.loads(result[0][0]) if result[0][0] else {} + # Update last_accessed_at with millisecond timestamp + props["last_accessed_at"] = timestamp_ms + + # Write back to graph database + await graph_engine.query( + "MATCH (n:Node {id: $id}) WHERE n.type = $node_type SET n.properties = $props", + {"id": str(item_id), "node_type": node_type, "props": json.dumps(props)} + ) + except Exception as e: + logger.warning(f"Failed to update timestamp for {node_type} {item_id}: {e}") + continue + + logger.debug(f"Updated access timestamps for {len(items)} {node_type} nodes") + diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 75ed82d50..46f9a8d8b 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,5 +1,7 @@ -from typing import Union +from pydantic import BaseModel, Field +from typing import Union +from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import DocumentChunk from cognee.shared.CodeGraphEntities import CodeFile, CodePart @@ -17,7 +19,9 @@ class TextSummary(DataPoint): text: str made_from: DocumentChunk - + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["text"]} From 3f27c5592b58af29369125362510e96b72c56cbc Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 29 Oct 2025 20:17:27 +0530 Subject: [PATCH 02/25] feat: adding last_accessed_at field to the models and updating the retrievers to update the timestamp --- cognee/modules/retrieval/chunks_retriever.py | 48 +++++++++++-------- .../modules/retrieval/summaries_retriever.py | 28 ++++++----- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index 74634b71e..f821fc902 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -29,26 +29,34 @@ class ChunksRetriever(BaseRetriever): self.top_k = top_k async def get_context(self, query: str) -> Any: - """Retrieves document chunks context based on the query.""" - logger.info( - f"Starting chunk retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'"
-        )
-
-        vector_engine = get_vector_engine()
-
-        try:
-            found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k)
-            logger.info(f"Found {len(found_chunks)} chunks from vector search")
-
-            # NEW: Update access timestamps
-            await update_node_access_timestamps(found_chunks, "DocumentChunk")
-        except CollectionNotFoundError as error:
-            logger.error("DocumentChunk_text collection not found in vector database")
-            raise NoDataError("No data found in the system, please add data first.") from error
-
-        chunk_payloads = [result.payload for result in found_chunks]
-        logger.info(f"Returning {len(chunk_payloads)} chunk payloads")
-        return chunk_payloads
+        """
+        Retrieves document chunks context based on the query.
+        Searches for document chunks relevant to the specified query using a vector engine.
+        Raises a NoDataError if no data is found in the system.
+        Parameters:
+        -----------
+        - query (str): The query string to search for relevant document chunks.
+        Returns:
+        --------
+        - Any: A list of document chunk payloads retrieved from the search.
+        """
+        logger.info(
+            f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'"
+        )
+
+        vector_engine = get_vector_engine()
+
+        try:
+            found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k)
+            logger.info(f"Found {len(found_chunks)} chunks from vector search")
+            await update_node_access_timestamps(found_chunks, "DocumentChunk")
+
+        except CollectionNotFoundError as error:
+            logger.error("DocumentChunk_text collection not found in vector database")
+            raise NoDataError("No data found in the system, please add data first.") from error
+
+        chunk_payloads = [result.payload for result in found_chunks]
+        logger.info(f"Returning {len(chunk_payloads)} chunk payloads")
+        return chunk_payloads

     async def get_completion(
         self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None

diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py
index 7f996274e..9ac8b096d 100644
--- a/cognee/modules/retrieval/summaries_retriever.py
+++ b/cognee/modules/retrieval/summaries_retriever.py
@@ -48,19 +48,23 @@ class SummariesRetriever(BaseRetriever):
             f"Starting summary retrieval for query: '{query[:100]}{'...'
if len(query) > 100 else ''}'" ) - vector_engine = get_vector_engine() - - try: - summaries_results = await vector_engine.search( - "TextSummary_text", query, limit=self.top_k - ) - + vector_engine = get_vector_engine() + + try: + summaries_results = await vector_engine.search( + "TextSummary_text", query, limit=self.top_k + ) + logger.info(f"Found {len(summaries_results)} summaries from vector search") + await update_node_access_timestamps(summaries_results, "TextSummary") - - except CollectionNotFoundError as error: - raise NoDataError("No data found in the system, please add data first.") from error - - return [summary.payload for summary in summaries_results] + + except CollectionNotFoundError as error: + logger.error("TextSummary_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + summary_payloads = [summary.payload for summary in summaries_results] + logger.info(f"Returning {len(summary_payloads)} summary payloads") + return summary_payloads async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None, **kwargs From 5f6f0502c832d129749b453121c6f5be565044bc Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 00:00:18 +0530 Subject: [PATCH 03/25] fix: removing last_acessed_at from individual model and adding it to DataPoint --- cognee/infrastructure/engine/models/DataPoint.py | 3 +++ cognee/modules/chunking/models/DocumentChunk.py | 5 ----- cognee/modules/engine/models/Entity.py | 3 --- cognee/tasks/summarization/models.py | 3 --- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 812380eaa..3178713c8 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -43,6 +43,9 @@ class DataPoint(BaseModel): updated_at: int = Field( default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) ) + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) ontology_valid: bool = False version: int = 1 # Default version topological_rank: Optional[int] = 0 diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index c4c6a2ed3..601454802 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -35,9 +35,4 @@ class DocumentChunk(DataPoint): cut_type: str is_part_of: Document contains: List[Union[Entity, Event]] = None - - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) - metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 3e48ea02a..4083cd2e6 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -8,7 +8,4 @@ class Entity(DataPoint): name: str is_a: Optional[EntityType] = None description: str - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 46f9a8d8b..8cee2ade3 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -19,9 +19,6 @@ class TextSummary(DataPoint): 
text: str made_from: DocumentChunk - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) metadata: dict = {"index_fields": ["text"]} From 6f06e4a5eb1143ddcb2ad08132486630b8a2deae Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 00:17:13 +0530 Subject: [PATCH 04/25] fix: removing node_type and try except --- cognee/modules/retrieval/chunks_retriever.py | 2 +- .../modules/retrieval/summaries_retriever.py | 2 +- .../retrieval/utils/access_tracking.py | 55 ++++++++++--------- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index f821fc902..be1f95811 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -49,7 +49,7 @@ class ChunksRetriever(BaseRetriever): try: found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) logger.info(f"Found {len(found_chunks)} chunks from vector search") - await update_node_access_timestamps(found_chunks, "DocumentChunk") + await update_node_access_timestamps(found_chunks) except CollectionNotFoundError as error: logger.error("DocumentChunk_text collection not found in vector database") diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 9ac8b096d..0df750d22 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -56,7 +56,7 @@ class SummariesRetriever(BaseRetriever): ) logger.info(f"Found {len(summaries_results)} summaries from vector search") - await update_node_access_timestamps(summaries_results, "TextSummary") + await update_node_access_timestamps(summaries_results) except CollectionNotFoundError as error: logger.error("TextSummary_text collection not found in vector database") diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index ca5ed88cd..79afd25db 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -1,4 +1,4 @@ - + """Utilities for tracking data access in retrievers.""" import json @@ -11,51 +11,54 @@ from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) -async def update_node_access_timestamps(items: List[Any], node_type: str): +async def update_node_access_timestamps(items: List[Any]): """ Update last_accessed_at for nodes in Kuzu graph database. + Automatically determines node type from the graph database. 
Parameters ---------- items : List[Any] List of items with payload containing 'id' field (from vector search results) - node_type : str - Type of node to update (e.g., 'DocumentChunk', 'Entity', 'TextSummary') """ if not items: return graph_engine = await get_graph_engine() - # Convert to milliseconds since epoch (matching the field format) timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) for item in items: - # Extract ID from payload (vector search results have this structure) + # Extract ID from payload item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if not item_id: continue - try: - # Get current node properties from Kuzu's Node table - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) WHERE n.type = $node_type RETURN n.properties as props", - {"id": str(item_id), "node_type": node_type} + # try: + # Query to get both node type and properties in one call + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.type as node_type, n.properties as props", + {"id": str(item_id)} + ) + + if result and len(result) > 0 and result[0]: + node_type = result[0][0] # First column: node_type + props_json = result[0][1] # Second column: properties + + # Parse existing properties JSON + props = json.loads(props_json) if props_json else {} + # Update last_accessed_at with millisecond timestamp + props["last_accessed_at"] = timestamp_ms + + # Write back to graph database + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": str(item_id), "props": json.dumps(props)} ) - if result and len(result) > 0 and result[0][0]: - # Parse existing properties JSON - props = json.loads(result[0][0]) if result[0][0] else {} - # Update last_accessed_at with millisecond timestamp - props["last_accessed_at"] = timestamp_ms + logger.debug(f"Updated access timestamp for {node_type} node {item_id}") - # Write back to graph database - await graph_engine.query( - "MATCH (n:Node {id: $id}) WHERE n.type = $node_type SET n.properties = $props", - {"id": str(item_id), "node_type": node_type, "props": json.dumps(props)} - ) - except Exception as e: - logger.warning(f"Failed to update timestamp for {node_type} {item_id}: {e}") - continue + # except Exception as e: + # logger.error(f"Failed to update timestamp for node {item_id}: {e}") + # continue - logger.debug(f"Updated access timestamps for {len(items)} {node_type} nodes") - + logger.debug(f"Updated access timestamps for {len(items)} nodes") From f1afd1f0a2a5433dc341c485b08ce33d1bc16252 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 15:49:34 +0530 Subject: [PATCH 05/25] feat: adding cleanup function and adding update_node_acess_timestamps in completion retriever and graph_completion retriever --- .../modules/retrieval/completion_retriever.py | 3 +- .../retrieval/graph_completion_retriever.py | 13 +- cognee/tasks/cleanup/cleanup_unused_data.py | 232 ++++++++++++++++++ 3 files changed, 246 insertions(+), 2 deletions(-) create mode 100644 cognee/tasks/cleanup/cleanup_unused_data.py diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index bb568924d..fc8ef747f 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -8,6 +8,7 @@ from cognee.modules.retrieval.utils.session_cache import ( save_conversation_history, get_conversation_history, ) +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps 
from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError @@ -65,7 +66,7 @@ class CompletionRetriever(BaseRetriever): if len(found_chunks) == 0: return "" - + await update_node_access_timestamps(found_chunks) # Combine all chunks text returned from vector search (number of chunks is determined by top_k chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks] combined_context = "\n".join(chunks_payload) diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index b7ab4edae..ac7e45e3c 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -16,6 +16,7 @@ from cognee.modules.retrieval.utils.session_cache import ( ) from cognee.shared.logging_utils import get_logger from cognee.modules.retrieval.utils.extract_uuid_from_node import extract_uuid_from_node +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.modules.retrieval.utils.models import CogneeUserInteraction from cognee.modules.engine.models.node_set import NodeSet from cognee.infrastructure.databases.graph import get_graph_engine @@ -138,7 +139,17 @@ class GraphCompletionRetriever(BaseGraphRetriever): return [] # context = await self.resolve_edges_to_text(triplets) - + entity_nodes = [] + seen_ids = set() + for triplet in triplets: + if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node1.id)}) + seen_ids.add(triplet.node1.id) + if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node2.id)}) + seen_ids.add(triplet.node2.id) + + await update_node_access_timestamps(entity_nodes) return triplets async def get_completion( diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py new file mode 100644 index 000000000..e97692bb4 --- /dev/null +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -0,0 +1,232 @@ +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.shared.logging_utils import get_logger + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: int = 30, + dry_run: bool = True, + user_id: Optional[UUID] = None +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. 
+ + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused (default: 30) + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None + ) + + # Calculate cutoff timestamp in milliseconds + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None +) -> Dict[str, list]: + """ + Query Kuzu for nodes with old last_accessed_at timestamps. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Query all nodes with their properties + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = { + "DocumentChunk": [], + "Entity": [], + "TextSummary": [] + } + + for node_id, node_type, props_json in results: + # Only process tracked node types + if node_type not in unused_nodes: + continue + + # Parse properties JSON + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + # Check if node is unused (never accessed or accessed before cutoff) + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + # TODO: Add user_id filtering when user ownership is implemented + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. + + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + if result and len(result) > 0: + deleted_counts["associations"] += result[0][0] + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + # Delete from vector collection + if await vector_engine.has_collection(collection_name): + for node_id in node_ids: + try: + await vector_engine.delete(collection_name, {"id": str(node_id)}) + except Exception as e: + logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + + return deleted_counts From 5080e8f8a5c20d092b917b66eb52a577fe899231 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 3 Nov 2025 00:59:04 +0530 Subject: [PATCH 
06/25] feat: generalizing getting entities from triplets

---
 cognee/modules/graph/utils/__init__.py              |  1 +
 .../graph/utils/get_entity_nodes_from_triplets.py   | 13 +++++++++++++
 .../modules/retrieval/graph_completion_retriever.py | 12 +++---------
 3 files changed, 17 insertions(+), 9 deletions(-)
 create mode 100644 cognee/modules/graph/utils/get_entity_nodes_from_triplets.py

diff --git a/cognee/modules/graph/utils/__init__.py b/cognee/modules/graph/utils/__init__.py
index ebc648495..4c0b29d47 100644
--- a/cognee/modules/graph/utils/__init__.py
+++ b/cognee/modules/graph/utils/__init__.py
@@ -5,3 +5,4 @@ from .retrieve_existing_edges import retrieve_existing_edges
 from .convert_node_to_data_point import convert_node_to_data_point
 from .deduplicate_nodes_and_edges import deduplicate_nodes_and_edges
 from .resolve_edges_to_text import resolve_edges_to_text
+from .get_entity_nodes_from_triplets import get_entity_nodes_from_triplets

diff --git a/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py b/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py
new file mode 100644
index 000000000..598a36854
--- /dev/null
+++ b/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py
@@ -0,0 +1,13 @@
+
+def get_entity_nodes_from_triplets(triplets):
+    entity_nodes = []
+    seen_ids = set()
+    for triplet in triplets:
+        if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids:
+            entity_nodes.append({"id": str(triplet.node1.id)})
+            seen_ids.add(triplet.node1.id)
+        if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids:
+            entity_nodes.append({"id": str(triplet.node2.id)})
+            seen_ids.add(triplet.node2.id)
+
+    return entity_nodes

diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py
index ac7e45e3c..122cc943f 100644
--- a/cognee/modules/retrieval/graph_completion_retriever.py
+++ b/cognee/modules/retrieval/graph_completion_retriever.py
@@ -22,6 +22,7 @@ from cognee.modules.engine.models.node_set import NodeSet
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.context_global_variables import session_user
 from cognee.infrastructure.databases.cache.config import CacheConfig
+from cognee.modules.graph.utils import get_entity_nodes_from_triplets

 logger = get_logger("GraphCompletionRetriever")

@@ -139,15 +140,8 @@ class GraphCompletionRetriever(BaseGraphRetriever):
             return []

         # context = await self.resolve_edges_to_text(triplets)
-        entity_nodes = []
-        seen_ids = set()
-        for triplet in triplets:
-            if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids:
-                entity_nodes.append({"id": str(triplet.node1.id)})
-                seen_ids.add(triplet.node1.id)
-            if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids:
-                entity_nodes.append({"id": str(triplet.node2.id)})
-                seen_ids.add(triplet.node2.id)
+
+        entity_nodes = get_entity_nodes_from_triplets(triplets)

         await update_node_access_timestamps(entity_nodes)
         return triplets

     async def get_completion(

From d34fd9237bf41c6b421bd556541b50ea68246e45 Mon Sep 17 00:00:00 2001
From: chinu0609
Date: Tue, 4 Nov 2025 22:04:32 +0530
Subject: [PATCH 07/25] feat: adding last_accessed in the Data model

---
 .../e1ec1dcb50b6_add_last_accessed_to_data.py |  30 ++++++
 cognee/modules/data/models/Data.py            |   1 +
 .../retrieval/utils/access_tracking.py        | 102 ++++++++++++------
 3 files changed, 100 insertions(+), 33 deletions(-)
 create mode 100644 alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py

diff --git
a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py new file mode 100644 index 000000000..0ccefa63b --- /dev/null +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -0,0 +1,30 @@ +"""add_last_accessed_to_data + +Revision ID: e1ec1dcb50b6 +Revises: 211ab850ef3d +Create Date: 2025-11-04 21:45:52.642322 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'e1ec1dcb50b6' +down_revision: Union[str, None] = '211ab850ef3d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + # Optionally initialize with created_at values for existing records + op.execute("UPDATE data SET last_accessed = created_at") + + +def downgrade() -> None: + op.drop_column('data', 'last_accessed') diff --git a/cognee/modules/data/models/Data.py b/cognee/modules/data/models/Data.py index ef228f2e1..27ab7481e 100644 --- a/cognee/modules/data/models/Data.py +++ b/cognee/modules/data/models/Data.py @@ -36,6 +36,7 @@ class Data(Base): data_size = Column(Integer, nullable=True) # File size in bytes created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) + last_accessed = Column(DateTime(timezone=True), nullable=True) datasets = relationship( "Dataset", diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 79afd25db..621e09e27 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -1,20 +1,27 @@ - """Utilities for tracking data access in retrievers.""" import json from datetime import datetime, timezone from typing import List, Any +from uuid import UUID from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger +from sqlalchemy import update logger = get_logger(__name__) async def update_node_access_timestamps(items: List[Any]): """ - Update last_accessed_at for nodes in Kuzu graph database. - Automatically determines node type from the graph database. + Update last_accessed_at for nodes in graph database and corresponding Data records in SQL. + + This function: + 1. Updates last_accessed_at in the graph database nodes (in properties JSON) + 2. Traverses to find origin TextDocument nodes + 3. 
Updates last_accessed in the SQL Data table for those documents Parameters ---------- @@ -26,39 +33,68 @@ async def update_node_access_timestamps(items: List[Any]): graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) + timestamp_dt = datetime.now(timezone.utc) + # Extract node IDs + node_ids = [] for item in items: - # Extract ID from payload item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") - if not item_id: - continue - - # try: - # Query to get both node type and properties in one call - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.type as node_type, n.properties as props", - {"id": str(item_id)} - ) - - if result and len(result) > 0 and result[0]: - node_type = result[0][0] # First column: node_type - props_json = result[0][1] # Second column: properties - - # Parse existing properties JSON - props = json.loads(props_json) if props_json else {} - # Update last_accessed_at with millisecond timestamp - props["last_accessed_at"] = timestamp_ms - - # Write back to graph database - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": str(item_id), "props": json.dumps(props)} + if item_id: + node_ids.append(str(item_id)) + + if not node_ids: + return + + try: + # Step 1: Batch update graph nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} ) - logger.debug(f"Updated access timestamp for {node_type} node {item_id}") + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms - # except Exception as e: - # logger.error(f"Failed to update timestamp for node {item_id}: {e}") - # continue - - logger.debug(f"Updated access timestamps for {len(items)} nodes") + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + + logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") + + # Step 2: Find origin TextDocument nodes + origin_query = """ + UNWIND $node_ids AS node_id + MATCH (n:Node {id: node_id}) + OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) + WHERE (e.relationship_name = 'contains' OR e.relationship_name = 'made_from') + AND chunk.type = 'DocumentChunk' + OPTIONAL MATCH (chunk)-[e2:EDGE]->(doc:Node) + WHERE e2.relationship_name = 'is_part_of' + AND doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] + RETURN DISTINCT doc.id as doc_id + """ + + result = await graph_engine.query(origin_query, {"node_ids": node_ids}) + + # Extract document IDs + doc_ids = [row[0] for row in result if row and row[0]] if result else [] + + # Step 3: Update SQL Data table + if doc_ids: + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + stmt = update(Data).where( + Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) + ).values(last_accessed=timestamp_dt) + + await session.execute(stmt) + await session.commit() + + logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL") + + except Exception as e: + logger.error(f"Failed to update timestamps: {e}") + raise From 3c0e915812a4ffb8662419647572c6229ed963a9 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 12:25:51 +0530 Subject: [PATCH 08/25] fix: removing hard relations --- .../modules/retrieval/utils/access_tracking.py | 16 +++++++--------- 1 file changed, 7 
insertions(+), 9 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 621e09e27..36c0b7f50 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -20,7 +20,7 @@ async def update_node_access_timestamps(items: List[Any]): This function: 1. Updates last_accessed_at in the graph database nodes (in properties JSON) - 2. Traverses to find origin TextDocument nodes + 2. Traverses to find origin TextDocument nodes (without hardcoded relationship names) 3. Updates last_accessed in the SQL Data table for those documents Parameters @@ -64,23 +64,21 @@ async def update_node_access_timestamps(items: List[Any]): logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") - # Step 2: Find origin TextDocument nodes + # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) origin_query = """ UNWIND $node_ids AS node_id MATCH (n:Node {id: node_id}) OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) - WHERE (e.relationship_name = 'contains' OR e.relationship_name = 'made_from') - AND chunk.type = 'DocumentChunk' - OPTIONAL MATCH (chunk)-[e2:EDGE]->(doc:Node) - WHERE e2.relationship_name = 'is_part_of' - AND doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] + WHERE chunk.type = 'DocumentChunk' + OPTIONAL MATCH (chunk)-[e2:EDGE]-(doc:Node) + WHERE doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] RETURN DISTINCT doc.id as doc_id """ result = await graph_engine.query(origin_query, {"node_ids": node_ids}) - # Extract document IDs - doc_ids = [row[0] for row in result if row and row[0]] if result else [] + # Extract and deduplicate document IDs + doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else [] # Step 3: Update SQL Data table if doc_ids: From 9041a804ecc2d0be1903c2de0ac875f32fcc553c Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 18:32:49 +0530 Subject: [PATCH 09/25] fix: add text_doc flag --- cognee/tasks/cleanup/cleanup_unused_data.py | 520 ++++++++++++-------- 1 file changed, 312 insertions(+), 208 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index e97692bb4..c9c711fe2 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,232 +1,336 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID - -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.shared.logging_utils import get_logger - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. 
+""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: int = 30, + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused (default: 30) + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes directly from graph + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: int = 30, - dry_run: bool = True, +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. 
+ SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused (default: 30) + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) - user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - - Returns - ------- - Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None - ) - - # Calculate cutoff timestamp in milliseconds - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") - - # Find unused nodes - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) - - return { - "status": "completed", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None -) -> Dict[str, list]: - """ - Query Kuzu for nodes with old last_accessed_at timestamps. 
- - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + If True, only report what would be deleted user_id : UUID, optional Filter by user ID if provided Returns ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs + Dict[str, Any] + Cleanup results """ - graph_engine = await get_graph_engine() + db_engine = get_relational_engine() - # Query all nodes with their properties - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = { - "DocumentChunk": [], - "Entity": [], - "TextSummary": [] - } - - for node_id, node_type, props_json in results: - # Only process tracked node types - if node_type not in unused_nodes: - continue - - # Parse properties JSON - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - # Check if node is unused (never accessed or accessed before cutoff) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - # TODO: Add user_id filtering when user ownership is implemented - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) ) - if result and len(result) > 0: - deleted_counts["associations"] += result[0][0] + ) + + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + + result = await session.execute(query) + unused_data = result.all() - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) + logger.info(f"Found {len(unused_data)} unused documents in SQL") - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } + if dry_run: + return { + "status": "dry_run", + "unused_count": len(unused_data), + 
"deleted_count": { + "data_items": 0, + "documents": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: try: - # Delete from vector collection - if await vector_engine.has_collection(collection_name): - for node_id in node_ids: - try: - await vector_engine.delete(collection_name, {"id": str(node_id)}) - except Exception as e: - logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") + logger.error(f"Failed to delete document {data.id}: {e}") + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None +) -> Dict[str, list]: + """ + Query Kuzu for nodes with old last_accessed_at timestamps. + + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Query all nodes with their properties + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = { + "DocumentChunk": [], + "Entity": [], + "TextSummary": [] + } + + for node_id, node_type, props_json in results: + # Only process tracked node types + if node_type not in unused_nodes: + continue + + # Parse properties JSON + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + # Check if node is unused (never accessed or accessed before cutoff) + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + if result and len(result) > 0: + deleted_counts["associations"] += result[0][0] + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + # Delete from vector collection + if await vector_engine.has_collection(collection_name): + for node_id in node_ids: + try: + await vector_engine.delete(collection_name, {"id": str(node_id)}) + except Exception as e: + logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From ff263c0132b170b3c03961606db56c2a174d2b90 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 18:40:58 +0530 Subject: [PATCH 10/25] fix: add column check in migration --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index 0ccefa63b..267e11fb2 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -17,14 +17,30 @@ down_revision: Union[str, None] = '211ab850ef3d' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + def upgrade() -> None: - op.add_column('data', - sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) - ) - # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = created_at") + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if not last_accessed_column: + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + # Optionally initialize with created_at values for existing records + op.execute("UPDATE data SET last_accessed = created_at") def 
downgrade() -> None: - op.drop_column('data', 'last_accessed') + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if last_accessed_column: + op.drop_column('data', 'last_accessed') From c5f0c4af87ff13bf8e3cbe0f4e9163ece44c3094 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 20:22:17 +0530 Subject: [PATCH 11/25] fix: add text_doc flag --- cognee/modules/retrieval/utils/access_tracking.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 36c0b7f50..65d597a93 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -67,12 +67,9 @@ async def update_node_access_timestamps(items: List[Any]): # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) origin_query = """ UNWIND $node_ids AS node_id - MATCH (n:Node {id: node_id}) - OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) - WHERE chunk.type = 'DocumentChunk' - OPTIONAL MATCH (chunk)-[e2:EDGE]-(doc:Node) - WHERE doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] - RETURN DISTINCT doc.id as doc_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id """ result = await graph_engine.query(origin_query, {"node_ids": node_ids}) From fdf037b3d0117bd29f0c541ed027895c070678df Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Thu, 6 Nov 2025 23:00:56 +0530 Subject: [PATCH 12/25] fix: min to days --- cognee/tasks/cleanup/cleanup_unused_data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index c9c711fe2..4df622a2c 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -23,7 +23,7 @@ logger = get_logger(__name__) async def cleanup_unused_data( - minutes_threshold: int = 30, + days_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None, text_doc: bool = False @@ -33,8 +33,8 @@ async def cleanup_unused_data( Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused (default: 30) + days_threshold : int + days since last access to consider data unused dry_run : bool If True, only report what would be deleted without actually deleting (default: True) user_id : UUID, optional @@ -50,14 +50,14 @@ async def cleanup_unused_data( """ logger.info( "Starting cleanup task", - minutes_threshold=minutes_threshold, + days_threshold=days_threshold, dry_run=dry_run, user_id=str(user_id) if user_id else None, text_doc=text_doc ) # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold) if text_doc: # SQL-based approach: Find unused TextDocuments and use cognee.delete() From 84c8e07ddd980af7c11b89c7e510b38e5c44f119 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 7 Nov 2025 12:03:17 +0530 Subject: [PATCH 13/25] fix: remove uneccessary imports --- cognee/modules/chunking/models/DocumentChunk.py | 2 -- cognee/modules/engine/models/Entity.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/cognee/modules/chunking/models/DocumentChunk.py 
b/cognee/modules/chunking/models/DocumentChunk.py index a9fb08a9e..e2b216a9b 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,7 +1,5 @@ from typing import List, Union -from pydantic import BaseModel, Field -from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine.models.Edge import Edge from cognee.modules.data.processing.document_types import Document diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 4083cd2e6..a34a6503c 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,8 +1,6 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.models.EntityType import EntityType from typing import Optional -from datetime import datetime, timezone -from pydantic import BaseModel, Field class Entity(DataPoint): name: str From 84bd2f38f7513c244ed1040937a1e5a5297cec2e Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 7 Nov 2025 12:12:46 +0530 Subject: [PATCH 14/25] fix: remove uneccessary imports --- cognee/tasks/summarization/models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 8cee2ade3..8420cfaa5 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,7 +1,5 @@ -from pydantic import BaseModel, Field from typing import Union -from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import DocumentChunk from cognee.shared.CodeGraphEntities import CodeFile, CodePart From d351c9a009d12a8a8a4869afa7aee38c61482e21 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 10 Nov 2025 21:58:01 +0530 Subject: [PATCH 15/25] fix: return chunk payload --- cognee/modules/retrieval/chunks_retriever.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index be1f95811..b7a90238a 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -57,6 +57,7 @@ class ChunksRetriever(BaseRetriever): chunk_payloads = [result.payload for result in found_chunks] logger.info(f"Returning {len(chunk_payloads)} chunk payloads") + return chunk_payloads async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None From 7bd7079aac9fcb003bcc20e118bc65d066e9029c Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 18 Nov 2025 22:17:23 +0530 Subject: [PATCH 16/25] fix: vecto_engine.delte_data_points --- cognee/tasks/cleanup/cleanup_unused_data.py | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 4df622a2c..fd4b68204 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -315,22 +315,21 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: "TextSummary": "TextSummary_text" } - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - # Delete from vector collection - if await 
vector_engine.has_collection(collection_name): - for node_id in node_ids: - try: - await vector_engine.delete(collection_name, {"id": str(node_id)}) - except Exception as e: - logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") return deleted_counts From 5fac3b40b94e4c81a7d9828ca9d2d84ab5e82bc1 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 18 Nov 2025 22:26:59 +0530 Subject: [PATCH 17/25] fix: test file for cleanup unused data --- cognee/tests/test_cleanup_unused_data.py | 244 +++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 cognee/tests/test_cleanup_unused_data.py diff --git a/cognee/tests/test_cleanup_unused_data.py b/cognee/tests/test_cleanup_unused_data.py new file mode 100644 index 000000000..c21b9f5ea --- /dev/null +++ b/cognee/tests/test_cleanup_unused_data.py @@ -0,0 +1,244 @@ +import os +import pathlib +import cognee +from datetime import datetime, timezone, timedelta +from uuid import UUID +from sqlalchemy import select, update +from cognee.modules.data.models import Data, DatasetData +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType + +logger = get_logger() + + +async def test_textdocument_cleanup_with_sql(): + """ + End-to-end test for TextDocument cleanup based on last_accessed timestamps. + + Tests: + 1. Add and cognify a document + 2. Perform search to populate last_accessed timestamp + 3. Verify last_accessed is set in SQL Data table + 4. Manually age the timestamp beyond cleanup threshold + 5. Run cleanup with text_doc=True + 6. Verify document was deleted from all databases (relational, graph, and vector) + """ + # Setup test directories + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") + ).resolve() + ) + + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) + + # Initialize database + from cognee.modules.engine.operations.setup import setup + + # Clean slate + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") + + # Step 1: Add and cognify a test document + dataset_name = "test_cleanup_dataset" + test_text = """ + Machine learning is a subset of artificial intelligence that enables systems to learn + and improve from experience without being explicitly programmed. Deep learning uses + neural networks with multiple layers to process data. 
+ """ + + await setup() + user = await get_default_user() + await cognee.add([test_text], dataset_name=dataset_name, user=user) + + cognify_result = await cognee.cognify([dataset_name], user=user) + + # Extract dataset_id from cognify result (ds_id is already a UUID) + dataset_id = None + for ds_id, pipeline_result in cognify_result.items(): + dataset_id = ds_id # Don't wrap in UUID() - it's already a UUID object + break + + assert dataset_id is not None, "Failed to get dataset_id from cognify result" + logger.info(f"โœ… Document added and cognified. Dataset ID: {dataset_id}") + + # Step 2: Perform search to trigger last_accessed update + logger.info("Triggering search to update last_accessed...") + search_results = await cognee.search( + query_type=SearchType.CHUNKS, + query_text="machine learning", + datasets=[dataset_name], + user=user + ) + logger.info(f"โœ… Search completed, found {len(search_results)} results") + + # Step 3: Verify last_accessed was set in SQL Data table + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Get the Data record for this dataset + result = await session.execute( + select(Data, DatasetData) + .join(DatasetData, Data.id == DatasetData.data_id) + .where(DatasetData.dataset_id == dataset_id) + ) + data_records = result.all() + assert len(data_records) > 0, "No Data records found for the dataset" + data_record = data_records[0][0] + data_id = data_record.id + + # Verify last_accessed is set (should be set by search operation) + assert data_record.last_accessed is not None, ( + "last_accessed should be set after search operation" + ) + + original_last_accessed = data_record.last_accessed + logger.info(f"โœ… last_accessed verified: {original_last_accessed}") + + # Step 4: Manually age the timestamp to be older than cleanup threshold + days_threshold = 30 + aged_timestamp = datetime.now(timezone.utc) - timedelta(days=days_threshold + 10) + + async with db_engine.get_async_session() as session: + stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) + await session.execute(stmt) + await session.commit() + + # Query in a NEW session to avoid cached values + async with db_engine.get_async_session() as session: + result = await session.execute(select(Data).where(Data.id == data_id)) + updated_data = result.scalar_one_or_none() + + # Make both timezone-aware for comparison + retrieved_timestamp = updated_data.last_accessed + if retrieved_timestamp.tzinfo is None: + # If database returned naive datetime, make it UTC-aware + retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + + assert retrieved_timestamp == aged_timestamp, ( + f"Timestamp should be updated to aged value. 
" + f"Expected: {aged_timestamp}, Got: {retrieved_timestamp}" + ) + + # Step 5: Test cleanup with text_doc=True + from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data + + # First do a dry run + logger.info("Testing dry run with text_doc=True...") + dry_run_result = await cleanup_unused_data( + days_threshold=30, + dry_run=True, + user_id=user.id, + text_doc=True + ) + + assert dry_run_result['status'] == 'dry_run', "Status should be 'dry_run'" + assert dry_run_result['unused_count'] > 0, ( + "Should find at least one unused document" + ) + logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") + + # Now run actual cleanup + logger.info("Executing cleanup with text_doc=True...") + cleanup_result = await cleanup_unused_data( + days_threshold=30, + dry_run=False, + user_id=user.id, + text_doc=True + ) + + assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" + assert cleanup_result["deleted_count"]["documents"] > 0, ( + "At least one document should be deleted" + ) + logger.info(f"โœ… Cleanup completed. Deleted {cleanup_result['deleted_count']['documents']} documents") + + # Step 6: Verify the document was actually deleted from SQL + async with db_engine.get_async_session() as session: + deleted_data = ( + await session.execute(select(Data).where(Data.id == data_id)) + ).scalar_one_or_none() + + assert deleted_data is None, ( + "Data record should be deleted after cleanup" + ) + logger.info("โœ… Confirmed: Data record was deleted from SQL database") + + # Verify the dataset-data link was also removed + async with db_engine.get_async_session() as session: + dataset_data_link = ( + await session.execute( + select(DatasetData).where( + DatasetData.data_id == data_id, + DatasetData.dataset_id == dataset_id + ) + ) + ).scalar_one_or_none() + + assert dataset_data_link is None, ( + "DatasetData link should be deleted after cleanup" + ) + logger.info("โœ… Confirmed: DatasetData link was deleted") + + # Verify graph nodes were cleaned up + from cognee.infrastructure.databases.graph import get_graph_engine + + graph_engine = await get_graph_engine() + + # Try to find the TextDocument node - it should not exist + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n", + {"id": str(data_id)} + ) + + assert len(result) == 0, ( + "TextDocument node should be deleted from graph database" + ) + logger.info("โœ… Confirmed: TextDocument node was deleted from graph database") + + # Verify vector database was cleaned up + from cognee.infrastructure.databases.vector import get_vector_engine + + vector_engine = get_vector_engine() + + # Check each collection that should have been cleaned up + vector_collections = [ + "DocumentChunk_text", + "Entity_name", + "TextSummary_text" + ] + + for collection_name in vector_collections: + if await vector_engine.has_collection(collection_name): + # Try to retrieve the deleted data points + try: + results = await vector_engine.retrieve(collection_name, [str(data_id)]) + assert len(results) == 0, ( + f"Data points should be deleted from {collection_name} collection" + ) + logger.info(f"โœ… Confirmed: {collection_name} collection is clean") + except Exception as e: + # Collection might be empty or not exist, which is fine + logger.info(f"โœ… Confirmed: {collection_name} collection is empty or doesn't exist") + pass + + logger.info("โœ… Confirmed: Vector database entries were deleted") + + logger.info("๐ŸŽ‰ All cleanup tests passed!") + + return True + + +if __name__ == 
"__main__": + import asyncio + success = asyncio.run(test_textdocument_cleanup_with_sql()) + exit(0 if success else 1) From 43290af1b23d24d6ab8b5d57c243abe1cee8787e Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 19 Nov 2025 21:00:16 +0530 Subject: [PATCH 18/25] fix: set last_acessed to current timestamp --- alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index 267e11fb2..a16c99e9f 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -34,7 +34,7 @@ def upgrade() -> None: sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) ) # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = created_at") + op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") def downgrade() -> None: From b52c1a1e25e6edffe112462836ab315b36bec567 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 24 Nov 2025 12:50:39 +0530 Subject: [PATCH 19/25] fix: flag to enable and disable last_accessed --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 88 ++++++++++--------- .../retrieval/utils/access_tracking.py | 7 +- cognee/tasks/cleanup/cleanup_unused_data.py | 40 ++++++++- 3 files changed, 90 insertions(+), 45 deletions(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index a16c99e9f..f1a36ae59 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -1,46 +1,52 @@ -"""add_last_accessed_to_data - -Revision ID: e1ec1dcb50b6 -Revises: 211ab850ef3d -Create Date: 2025-11-04 21:45:52.642322 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = 'e1ec1dcb50b6' -down_revision: Union[str, None] = '211ab850ef3d' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - -def _get_column(inspector, table, name, schema=None): - for col in inspector.get_columns(table, schema=schema): - if col["name"] == name: - return col - return None +"""add_last_accessed_to_data + +Revision ID: e1ec1dcb50b6 +Revises: 211ab850ef3d +Create Date: 2025-11-04 21:45:52.642322 + +""" +import os +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa -def upgrade() -> None: - conn = op.get_bind() - insp = sa.inspect(conn) - - last_accessed_column = _get_column(insp, "data", "last_accessed") - if not last_accessed_column: - op.add_column('data', - sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) - ) - # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") +# revision identifiers, used by Alembic. 
+revision: str = 'e1ec1dcb50b6' +down_revision: Union[str, None] = '211ab850ef3d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None -def downgrade() -> None: - conn = op.get_bind() - insp = sa.inspect(conn) - - last_accessed_column = _get_column(insp, "data", "last_accessed") - if last_accessed_column: +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if not last_accessed_column: + # Always create the column for schema consistency + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + + # Only initialize existing records if feature is enabled + enable_last_accessed = os.getenv("ENABLE_LAST_ACCESSED", "false").lower() == "true" + if enable_last_accessed: + op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") + + +def downgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if last_accessed_column: op.drop_column('data', 'last_accessed') diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 65d597a93..6df0284ec 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID - +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -27,7 +27,10 @@ async def update_node_access_timestamps(items: List[Any]): ---------- items : List[Any] List of items with payload containing 'id' field (from vector search results) - """ + """ + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + return + if not items: return diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index fd4b68204..175452a0a 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -10,7 +10,7 @@ import json from datetime import datetime, timezone, timedelta from typing import Optional, Dict, Any from uuid import UUID - +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.relational import get_relational_engine @@ -47,7 +47,43 @@ async def cleanup_unused_data( ------- Dict[str, Any] Cleanup results with status, counts, and timestamp - """ + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
+ ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + logger.info( "Starting cleanup task", days_threshold=days_threshold, From 5cb6510205742e7a5abf2afe23d2527b229931d0 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 24 Nov 2025 13:12:46 +0530 Subject: [PATCH 20/25] fix: import --- cognee/tasks/cleanup/cleanup_unused_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 175452a0a..a90d96b5c 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -18,6 +18,7 @@ from cognee.modules.data.models import Data, DatasetData from cognee.shared.logging_utils import get_logger from sqlalchemy import select, or_ import cognee +import sqlalchemy as sa logger = get_logger(__name__) From 12ce80005ceccafac38a63da458e6df376776b61 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 26 Nov 2025 17:32:50 +0530 Subject: [PATCH 21/25] fix: generalized queries --- .../retrieval/utils/access_tracking.py | 147 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 778 ++++++++++-------- 2 files changed, 516 insertions(+), 409 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 6df0284ec..12a66f8bc 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -13,24 +13,10 @@ from sqlalchemy import update logger = get_logger(__name__) - async def update_node_access_timestamps(items: List[Any]): - """ - Update last_accessed_at for nodes in graph database and corresponding Data records in SQL. - - This function: - 1. Updates last_accessed_at in the graph database nodes (in properties JSON) - 2. Traverses to find origin TextDocument nodes (without hardcoded relationship names) - 3. 
Updates last_accessed in the SQL Data table for those documents - - Parameters - ---------- - items : List[Any] - List of items with payload containing 'id' field (from vector search results) - """ if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return @@ -49,50 +35,95 @@ async def update_node_access_timestamps(items: List[Any]): return try: - # Step 1: Batch update graph nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) + # Detect database provider and use appropriate queries + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) + elif provider == "neo4j": + await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) + elif provider == "neptune": + await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) + else: + logger.warning(f"Unsupported graph provider: {provider}") + return - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - - logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") - - # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) - origin_query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - - result = await graph_engine.query(origin_query, {"node_ids": node_ids}) - - # Extract and deduplicate document IDs - doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else [] - - # Step 3: Update SQL Data table + # Find origin documents and update SQL + doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) if doc_ids: - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - stmt = update(Data).where( - Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) - ).values(last_accessed=timestamp_dt) - - await session.execute(stmt) - await session.commit() - - logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL") - + await _update_sql_records(doc_ids, timestamp_dt) + except Exception as e: logger.error(f"Failed to update timestamps: {e}") - raise + raise + +async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): + """Kuzu-specific node updates""" + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + +async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): + """Neo4j-specific node updates""" + for node_id in node_ids: + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): + """Neptune-specific node updates""" + for node_id in node_ids: + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET 
n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents(graph_engine, node_ids, provider): + """Find origin documents with provider-specific queries""" + if provider == "kuzu": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + elif provider == "neo4j": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + elif provider == "neptune": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + + result = await graph_engine.query(query, {"node_ids": node_ids}) + return list(set([row[0] for row in result if row and row[0]])) if result else [] + +async def _update_sql_records(doc_ids, timestamp_dt): + """Update SQL Data table (same for all providers)""" + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + stmt = update(Data).where( + Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) + ).values(last_accessed=timestamp_dt) + + await session.execute(stmt) + await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index a90d96b5c..b89c939a8 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,372 +1,448 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries)) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. 
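+
+Illustrative usage (a sketch only; it assumes the signature defined below in this
+module and that the ENABLE_LAST_ACCESSED environment variable is set to "true"):
+
+    # Preview what would be removed after 60 idle minutes, without deleting anything.
+    report = await cleanup_unused_data(minutes_threshold=60, dry_run=True)
+    # report["status"] is one of "skipped", "dry_run", or "completed".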
+""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + days since last access to consider data unused + dry_run : bool + If True, only report what would be delete without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." + ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." 
+ ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes directly from graph + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Detect database provider and find unused nodes + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes with provider-specific logic + deleted_counts = await _delete_unused_nodes(unused_nodes, provider) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - days_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). Parameters ---------- - days_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) - user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) - - Returns - ------- - Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
- ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." - ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - days_threshold=days_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) - - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") - - # Find unused nodes - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) - - return { - "status": "completed", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
- - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) - ) - ) - - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) - - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: - try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") - except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None -) -> Dict[str, list]: - """ - Query Kuzu for nodes with old last_accessed_at timestamps. 
- - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + If True, only report what would be deleted user_id : UUID, optional Filter by user ID if provided Returns ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs + Dict[str, Any] + Cleanup results """ - graph_engine = await get_graph_engine() + db_engine = get_relational_engine() - # Query all nodes with their properties - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = { - "DocumentChunk": [], - "Entity": [], - "TextSummary": [] - } - - for node_id, node_type, props_json in results: - # Only process tracked node types - if node_type not in unused_nodes: - continue - - # Parse properties JSON - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - # Check if node is unused (never accessed or accessed before cutoff) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) ) - if result and len(result) > 0: - deleted_counts["associations"] += result[0][0] - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue + ) - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) + result = await session.execute(query) + unused_data = result.all() - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: + return { + "status": "dry_run", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": 0, + "documents": 0 + }, + "cleanup_date": 
datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None, + provider: str = "kuzu" +) -> Dict[str, list]: + """ + Find unused nodes with provider-specific queries. + + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + provider : str + Graph database provider (kuzu, neo4j, neptune) + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + if provider == "kuzu": + return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) + elif provider == "neo4j": + return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) + elif provider == "neptune": + return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) + else: + logger.warning(f"Unsupported graph provider: {provider}") + return {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + +async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): + """Kuzu-specific unused node detection""" + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for node_id, node_type, props_json in results: + if node_type not in unused_nodes: + continue + + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): + """Neo4j-specific unused node detection""" + query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" + results = await 
graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for row in results: + node_id = row["n"]["id"] + node_type = row["n"]["type"] + last_accessed = row["n"].get("last_accessed_at") + + if node_type not in unused_nodes: + continue + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + + return unused_nodes + + +async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): + """Neptune-specific unused node detection""" + query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for row in results: + node_id = row["n"]["id"] + node_type = row["n"]["type"] + last_accessed = row["n"].get("last_accessed_at") + + if node_type not in unused_nodes: + continue + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. + + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + provider : str + Graph database provider (kuzu, neo4j, neptune) + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + if provider == "kuzu": + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + elif provider == "neo4j": + result = await graph_engine.query( + "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + elif provider == "neptune": + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + + if result and len(result) > 0: + count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] + deleted_counts["associations"] += count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for 
node_id in node_ids] + ) + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 6a4d31356bb613e5cf74e7972445f804796ee6d4 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 18:55:47 +0530 Subject: [PATCH 22/25] fix: using graph projection instead of conditions --- .../retrieval/utils/access_tracking.py | 156 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 759 ++++++++---------- 2 files changed, 418 insertions(+), 497 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 12a66f8bc..935c47157 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,118 +4,116 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger from sqlalchemy import update +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Detect database provider and use appropriate queries - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - if provider == "kuzu": - await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neo4j": - await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neptune": - await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return + try: + # Update nodes using graph projection ( database-agnostic approach + await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) # Find origin documents and update SQL - doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) + doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - + except Exception as e: logger.error(f"Failed to update timestamps: {e}") raise -async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): - """Kuzu-specific node updates""" - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - -async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): - """Neo4j-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": 
node_id, "timestamp": timestamp_ms} - ) - -async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): - """Neptune-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - -async def _find_origin_documents(graph_engine, node_ids, provider): - """Find origin documents with provider-specific queries""" - if provider == "kuzu": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neo4j": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neptune": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ +async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): + """Update nodes using graph projection - works with any graph database""" + # Project the graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] + ) - result = await graph_engine.query(query, {"node_ids": node_ids}) - return list(set([row[0] for row in result if row and row[0]])) if result else [] + # Update each node's last_accessed_at property + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Update the node in the database + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents_via_projection(graph_engine, node_ids): + """Find origin documents using graph projection instead of DB queries""" + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type"], + edge_properties_to_project=["relationship_name"] + ) + + # Find origin documents by traversing the in-memory graph + doc_ids = set() + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node and node.get_attribute("type") == "DocumentChunk": + # Traverse edges to find connected documents + for edge in node.get_skeleton_edges(): + # Get the neighbor node + neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else 
edge.get_source_node() + if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: + doc_ids.add(neighbor.id) + + return list(doc_ids) async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" @@ -124,6 +122,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index b89c939a8..c70b97a00 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,448 +1,371 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries)) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries)) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + days since last access to consider data unused + dry_run : bool + If True, only report what would be delete without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
+ ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes using projection (database-agnostic) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes using graph projection + unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes (provider-agnostic deletion) + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
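
For orientation, a minimal usage sketch of the cleanup task defined above, assuming ENABLE_LAST_ACCESSED is exported, that retrievers have already populated last-accessed timestamps, and that the 30-minute threshold is purely illustrative:

import asyncio
import os

from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data


async def run_cleanup_example():
    # Tracking must be enabled, otherwise the task returns {"status": "skipped"}.
    os.environ["ENABLE_LAST_ACCESSED"] = "true"

    # Dry run: report what would be removed without touching any database.
    report = await cleanup_unused_data(minutes_threshold=30, dry_run=True)
    print(report["status"], report["unused_count"], report.get("preview"))

    # Destructive run: only proceed once the dry-run numbers look reasonable.
    if report["unused_count"] > 0:
        result = await cleanup_unused_data(minutes_threshold=30, dry_run=False)
        print(result["deleted_count"])


if __name__ == "__main__":
    asyncio.run(run_cleanup_example())
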
Parameters ---------- - minutes_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." - ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." - ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) + ) + ) - # Detect database provider and find unused nodes - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": 
len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes with provider-specific logic - deleted_counts = await _delete_unused_nodes(unused_nodes, provider) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() +async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: + """ + Find unused nodes using graph projection - database-agnostic approach. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type", "last_accessed_at"], + edge_properties_to_project=[] + ) - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + # Get all nodes from the projected graph + all_nodes = memory_fragment.get_nodes() + + for node in all_nodes: + node_type = node.get_attribute("type") + if node_type not in unused_nodes: + continue + + # Check last_accessed_at property + last_accessed = node.get_attribute("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node.id) + logger.debug( + f"Found unused {node_type}", + node_id=node.id, + last_accessed=last_accessed ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. + + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion (using graph projection for consistency) + if any(unused_nodes.values()): + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] ) - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Count edges from the in-memory graph + edge_count = len(node.get_skeleton_edges()) + deleted_counts["associations"] += edge_count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches (database-agnostic) + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using 
cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None, - provider: str = "kuzu" -) -> Dict[str, list]: - """ - Find unused nodes with provider-specific queries. - - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch - user_id : UUID, optional - Filter by user ID if provided - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() - - if provider == "kuzu": - return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) - elif provider == "neo4j": - return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) - elif provider == "neptune": - return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - -async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): - """Kuzu-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for node_id, node_type, props_json in results: - if node_type not in unused_nodes: - continue - - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): - """Neo4j-specific unused node detection""" - query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": 
[], "TextSummary": []} - - for row in results: - node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): - """Neptune-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for row in results: - node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - if provider == "kuzu": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neo4j": - result = await graph_engine.query( - "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neptune": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - - if result and len(result) > 0: - count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] - deleted_counts["associations"] += count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error 
deleting from vector collection {collection_name}: {e}") - + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 5f00abf3e4f3b913ae67391d487104ea3b9ae872 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 22:25:03 +0530 Subject: [PATCH 23/25] fix: fallback and document deletion --- .../retrieval/utils/access_tracking.py | 73 +++++++++++-------- cognee/tasks/cleanup/cleanup_unused_data.py | 41 +++++++---- 2 files changed, 68 insertions(+), 46 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 935c47157..c7b06ee17 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -36,16 +36,22 @@ async def update_node_access_timestamps(items: List[Any]): return try: - # Update nodes using graph projection ( database-agnostic approach + # Try to update nodes in graph database (may fail for unsupported DBs) await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) + except Exception as e: + logger.warning( + f"Failed to update node timestamps in graph database: {e}. " + "Will update document-level timestamps in SQL instead." + ) - # Find origin documents and update SQL + # Always try to find origin documents and update SQL + # This ensures document-level tracking works even if graph updates fail + try: doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - except Exception as e: - logger.error(f"Failed to update timestamps: {e}") + logger.error(f"Failed to update SQL timestamps: {e}") raise async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): @@ -59,37 +65,42 @@ async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): ) # Update each node's last_accessed_at property + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + for node_id in node_ids: node = memory_fragment.get_node(node_id) if node: - # Update the node in the database - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - - if provider == "kuzu": - # Kuzu stores properties as JSON - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} + try: + # Update the node in the database + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} ) - elif provider == "neo4j": - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - elif provider == "neptune": - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET 
n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + except Exception as e: + # Log but continue with other nodes + logger.debug(f"Failed to update node {node_id}: {e}") + continue async def _find_origin_documents_via_projection(graph_engine, node_ids): """Find origin documents using graph projection instead of DB queries""" diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index c70b97a00..3894635dd 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,9 +1,9 @@ """ Task for automatically deleting unused data from the memify pipeline. -This task identifies and removes data (chunks, entities, summaries)) that hasn't +This task identifies and removes entire documents that haven't been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. +efficiency and storage optimization through whole-document removal. """ import json @@ -28,22 +28,26 @@ async def cleanup_unused_data( minutes_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None, - text_doc: bool = False + text_doc: bool = True, # Changed default to True for document-level cleanup + node_level: bool = False # New parameter for explicit node-level cleanup ) -> Dict[str, Any]: """ Identify and remove unused data from the memify pipeline. - + Parameters ---------- minutes_threshold : int - days since last access to consider data unused + Minutes since last access to consider data unused dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted without actually deleting (default: True) user_id : UUID, optional Limit cleanup to specific user's data (default: None) text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion + node_level : bool + If True, perform chaotic node-level deletion of unused chunks, entities, and summaries + (default: False - deprecated in favor of document-level cleanup) Returns ------- @@ -91,17 +95,19 @@ async def cleanup_unused_data( minutes_threshold=minutes_threshold, dry_run=dry_run, user_id=str(user_id) if user_id else None, - text_doc=text_doc + text_doc=text_doc, + node_level=node_level ) # Calculate cutoff timestamp cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes using projection (database-agnostic) + if node_level: + # Deprecated: Node-level approach (chaotic) + logger.warning( + "Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. " + "Consider using document-level cleanup (default) instead." 
+ ) cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") @@ -147,6 +153,9 @@ async def cleanup_unused_data( }, "cleanup_date": datetime.now(timezone.utc).isoformat() } + else: + # Default: Document-level approach (recommended) + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) async def _cleanup_via_sql( @@ -243,6 +252,7 @@ async def _cleanup_via_sql( async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: """ Find unused nodes using graph projection - database-agnostic approach. + NOTE: This function is deprecated as it leads to fragmented knowledge graphs. Parameters ---------- @@ -291,6 +301,7 @@ async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[st async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: """ Delete unused nodes from graph and vector databases. + NOTE: This function is deprecated as it leads to fragmented knowledge graphs. Parameters ---------- @@ -325,7 +336,7 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: if not node_ids: continue - # Count edges connected to these nodes + # Count edges from the in-memory graph for node_id in node_ids: node = memory_fragment.get_node(node_id) if node: From 829a6f0d04bcfec6e9c9f94219a29d6ab5cd909d Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 10 Dec 2025 22:41:01 +0530 Subject: [PATCH 24/25] fix: only document level deletion --- .../retrieval/utils/access_tracking.py | 80 +-- cognee/tasks/cleanup/cleanup_unused_data.py | 521 ++++++------------ cognee/tests/test_cleanup_unused_data.py | 388 ++++++------- 3 files changed, 333 insertions(+), 656 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index c7b06ee17..54fd043b9 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -14,38 +14,28 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) + async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() - timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Try to update nodes in graph database (may fail for unsupported DBs) - await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) - except Exception as e: - logger.warning( - f"Failed to update node timestamps in graph database: {e}. " - "Will update document-level timestamps in SQL instead." 
- ) - - # Always try to find origin documents and update SQL - # This ensures document-level tracking works even if graph updates fail + + # Focus on document-level tracking via projection try: doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: @@ -54,53 +44,6 @@ async def update_node_access_timestamps(items: List[Any]): logger.error(f"Failed to update SQL timestamps: {e}") raise -async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): - """Update nodes using graph projection - works with any graph database""" - # Project the graph with necessary properties - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id"], - edge_properties_to_project=[] - ) - - # Update each node's last_accessed_at property - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - - for node_id in node_ids: - node = memory_fragment.get_node(node_id) - if node: - try: - # Update the node in the database - if provider == "kuzu": - # Kuzu stores properties as JSON - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - elif provider == "neo4j": - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - elif provider == "neptune": - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - except Exception as e: - # Log but continue with other nodes - logger.debug(f"Failed to update node {node_id}: {e}") - continue async def _find_origin_documents_via_projection(graph_engine, node_ids): """Find origin documents using graph projection instead of DB queries""" @@ -111,7 +54,7 @@ async def _find_origin_documents_via_projection(graph_engine, node_ids): node_properties_to_project=["id", "type"], edge_properties_to_project=["relationship_name"] ) - + # Find origin documents by traversing the in-memory graph doc_ids = set() for node_id in node_ids: @@ -123,9 +66,10 @@ async def _find_origin_documents_via_projection(graph_engine, node_ids): neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else edge.get_source_node() if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: doc_ids.add(neighbor.id) - + return list(doc_ids) + async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" db_engine = get_relational_engine() @@ -133,6 +77,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 3894635dd..34cde1b6f 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,382 +1,187 @@ -""" -Task for automatically deleting unused data from the memify pipeline. +""" +Task for automatically deleting unused data from the memify pipeline. 
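
With this revision the unit of tracking and deletion is the whole document, so the authoritative state lives in the relational Data table rather than on individual graph nodes. A read-only sketch for inspecting which documents the task below would treat as stale; it reuses the same or_(last_accessed < cutoff, last_accessed IS NULL) filter as _cleanup_via_sql, assumes the Data model exposes id and last_accessed as used elsewhere in this patch, and uses an illustrative threshold:

from datetime import datetime, timedelta, timezone

from sqlalchemy import or_, select

from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data


async def list_stale_documents(minutes_threshold: int = 30):
    cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold)
    db_engine = get_relational_engine()

    async with db_engine.get_async_session() as session:
        result = await session.execute(
            select(Data.id, Data.last_accessed).where(
                or_(Data.last_accessed < cutoff_date, Data.last_accessed.is_(None))
            )
        )
        # Each row pairs a document id with its last access time (None = never accessed).
        return result.all()
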
+ +This task identifies and removes entire documents that haven't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization through whole-document removal. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." + ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) -This task identifies and removes entire documents that haven't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization through whole-document removal. 
-""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa -from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph - -logger = get_logger(__name__) + # Document-level approach (recommended) + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = True, # Changed default to True for document-level cleanup - node_level: bool = False # New parameter for explicit node-level cleanup +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. - + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). + Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion - node_level : bool - If True, perform chaotic node-level deletion of unused chunks, entities, and summaries - (default: False - deprecated in favor of document-level cleanup) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." - ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." 
- ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc, - node_level=node_level - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if node_level: - # Deprecated: Node-level approach (chaotic) - logger.warning( - "Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. " - "Consider using document-level cleanup (default) instead." - ) - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) + ) + ) - # Find unused nodes using graph projection - unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes (provider-agnostic deletion) - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - else: - # Default: Document-level approach (recommended) - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
- - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) - ) - ) - - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) - - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: - try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") - except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: - """ - Find unused nodes using graph projection - database-agnostic approach. - NOTE: This function is deprecated as it leads to fragmented knowledge graphs. 
+ "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") - # Project the entire graph with necessary properties - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id", "type", "last_accessed_at"], - edge_properties_to_project=[] - ) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - # Get all nodes from the projected graph - all_nodes = memory_fragment.get_nodes() - - for node in all_nodes: - node_type = node.get_attribute("type") - if node_type not in unused_nodes: - continue - - # Check last_accessed_at property - last_accessed = node.get_attribute("last_accessed_at") + logger.info("Cleanup completed", deleted_count=deleted_count) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node.id) - logger.debug( - f"Found unused {node_type}", - node_id=node.id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - NOTE: This function is deprecated as it leads to fragmented knowledge graphs. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion (using graph projection for consistency) - if any(unused_nodes.values()): - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id"], - edge_properties_to_project=[] - ) - - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges from the in-memory graph - for node_id in node_ids: - node = memory_fragment.get_node(node_id) - if node: - # Count edges from the in-memory graph - edge_count = len(node.get_skeleton_edges()) - deleted_counts["associations"] += edge_count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches (database-agnostic) - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - - return deleted_counts + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } diff --git a/cognee/tests/test_cleanup_unused_data.py b/cognee/tests/test_cleanup_unused_data.py index c21b9f5ea..e738dcba0 100644 --- a/cognee/tests/test_cleanup_unused_data.py +++ b/cognee/tests/test_cleanup_unused_data.py @@ -1,244 +1,172 @@ -import os -import pathlib -import cognee -from datetime import datetime, timezone, timedelta -from uuid import UUID -from sqlalchemy import select, update -from cognee.modules.data.models import Data, DatasetData -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.users.methods import get_default_user -from cognee.shared.logging_utils import get_logger -from cognee.modules.search.types import SearchType - -logger = get_logger() - - -async def test_textdocument_cleanup_with_sql(): - """ - End-to-end test for TextDocument cleanup based on last_accessed timestamps. 
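+"""
+Test module for document-level cleanup based on last_accessed timestamps.
+
+Summary of the flow implemented below: enable last-accessed tracking via the
+ENABLE_LAST_ACCESSED environment variable, add and cognify a small document,
+run a CHUNKS search so the Data row's last_accessed column is populated,
+manually age that timestamp past the cleanup threshold, then call
+cleanup_unused_data (dry run first, then a real run) and verify that the
+Data record has been deleted.
+"""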
+import os +import pathlib +import cognee +from datetime import datetime, timezone, timedelta +from uuid import UUID +from sqlalchemy import select, update +from cognee.modules.data.models import Data, DatasetData +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType - Tests: - 1. Add and cognify a document - 2. Perform search to populate last_accessed timestamp - 3. Verify last_accessed is set in SQL Data table - 4. Manually age the timestamp beyond cleanup threshold - 5. Run cleanup with text_doc=True - 6. Verify document was deleted from all databases (relational, graph, and vector) - """ - # Setup test directories - data_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") - ).resolve() - ) - cognee_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") - ).resolve() - ) +logger = get_logger() - cognee.config.data_root_directory(data_directory_path) - cognee.config.system_root_directory(cognee_directory_path) - # Initialize database - from cognee.modules.engine.operations.setup import setup +async def test_textdocument_cleanup_with_sql(): + """ + End-to-end test for TextDocument cleanup based on last_accessed timestamps. + """ + # Enable last accessed tracking BEFORE any cognee operations + os.environ["ENABLE_LAST_ACCESSED"] = "true" - # Clean slate - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") - - # Step 1: Add and cognify a test document - dataset_name = "test_cleanup_dataset" - test_text = """ - Machine learning is a subset of artificial intelligence that enables systems to learn - and improve from experience without being explicitly programmed. Deep learning uses - neural networks with multiple layers to process data. - """ - - await setup() - user = await get_default_user() - await cognee.add([test_text], dataset_name=dataset_name, user=user) - - cognify_result = await cognee.cognify([dataset_name], user=user) - - # Extract dataset_id from cognify result (ds_id is already a UUID) - dataset_id = None - for ds_id, pipeline_result in cognify_result.items(): - dataset_id = ds_id # Don't wrap in UUID() - it's already a UUID object - break - - assert dataset_id is not None, "Failed to get dataset_id from cognify result" - logger.info(f"โœ… Document added and cognified. 
Dataset ID: {dataset_id}") - - # Step 2: Perform search to trigger last_accessed update - logger.info("Triggering search to update last_accessed...") - search_results = await cognee.search( - query_type=SearchType.CHUNKS, - query_text="machine learning", - datasets=[dataset_name], - user=user - ) - logger.info(f"โœ… Search completed, found {len(search_results)} results") - - # Step 3: Verify last_accessed was set in SQL Data table - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Get the Data record for this dataset - result = await session.execute( - select(Data, DatasetData) - .join(DatasetData, Data.id == DatasetData.data_id) - .where(DatasetData.dataset_id == dataset_id) - ) - data_records = result.all() - assert len(data_records) > 0, "No Data records found for the dataset" - data_record = data_records[0][0] - data_id = data_record.id + # Setup test directories + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") + ).resolve() + ) - # Verify last_accessed is set (should be set by search operation) - assert data_record.last_accessed is not None, ( - "last_accessed should be set after search operation" - ) + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) - original_last_accessed = data_record.last_accessed - logger.info(f"โœ… last_accessed verified: {original_last_accessed}") - - # Step 4: Manually age the timestamp to be older than cleanup threshold - days_threshold = 30 - aged_timestamp = datetime.now(timezone.utc) - timedelta(days=days_threshold + 10) - - async with db_engine.get_async_session() as session: - stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) - await session.execute(stmt) - await session.commit() - - # Query in a NEW session to avoid cached values - async with db_engine.get_async_session() as session: - result = await session.execute(select(Data).where(Data.id == data_id)) - updated_data = result.scalar_one_or_none() + # Initialize database + from cognee.modules.engine.operations.setup import setup - # Make both timezone-aware for comparison - retrieved_timestamp = updated_data.last_accessed - if retrieved_timestamp.tzinfo is None: - # If database returned naive datetime, make it UTC-aware - retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + # Clean slate + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) - assert retrieved_timestamp == aged_timestamp, ( - f"Timestamp should be updated to aged value. 
" - f"Expected: {aged_timestamp}, Got: {retrieved_timestamp}" - ) - - # Step 5: Test cleanup with text_doc=True - from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data - - # First do a dry run - logger.info("Testing dry run with text_doc=True...") - dry_run_result = await cleanup_unused_data( - days_threshold=30, - dry_run=True, - user_id=user.id, - text_doc=True - ) - - assert dry_run_result['status'] == 'dry_run', "Status should be 'dry_run'" - assert dry_run_result['unused_count'] > 0, ( - "Should find at least one unused document" - ) - logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") - - # Now run actual cleanup - logger.info("Executing cleanup with text_doc=True...") - cleanup_result = await cleanup_unused_data( - days_threshold=30, - dry_run=False, - user_id=user.id, - text_doc=True - ) - - assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" - assert cleanup_result["deleted_count"]["documents"] > 0, ( - "At least one document should be deleted" - ) - logger.info(f"โœ… Cleanup completed. Deleted {cleanup_result['deleted_count']['documents']} documents") - - # Step 6: Verify the document was actually deleted from SQL - async with db_engine.get_async_session() as session: - deleted_data = ( - await session.execute(select(Data).where(Data.id == data_id)) - ).scalar_one_or_none() + logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") - assert deleted_data is None, ( - "Data record should be deleted after cleanup" - ) - logger.info("โœ… Confirmed: Data record was deleted from SQL database") - - # Verify the dataset-data link was also removed - async with db_engine.get_async_session() as session: - dataset_data_link = ( - await session.execute( - select(DatasetData).where( - DatasetData.data_id == data_id, - DatasetData.dataset_id == dataset_id - ) - ) - ).scalar_one_or_none() + # Step 1: Add and cognify a test document + dataset_name = "test_cleanup_dataset" + test_text = """ + Machine learning is a subset of artificial intelligence that enables systems to learn + and improve from experience without being explicitly programmed. Deep learning uses + neural networks with multiple layers to process data. + """ - assert dataset_data_link is None, ( - "DatasetData link should be deleted after cleanup" - ) - logger.info("โœ… Confirmed: DatasetData link was deleted") + await setup() + user = await get_default_user() + await cognee.add([test_text], dataset_name=dataset_name, user=user) + + cognify_result = await cognee.cognify([dataset_name], user=user) + + # Extract dataset_id from cognify result + dataset_id = None + for ds_id, pipeline_result in cognify_result.items(): + dataset_id = ds_id + break + + assert dataset_id is not None, "Failed to get dataset_id from cognify result" + logger.info(f"โœ… Document added and cognified. 
Dataset ID: {dataset_id}") + + # Step 2: Perform search to trigger last_accessed update + logger.info("Triggering search to update last_accessed...") + search_results = await cognee.search( + query_type=SearchType.CHUNKS, + query_text="machine learning", + datasets=[dataset_name], + user=user + ) + logger.info(f"โœ… Search completed, found {len(search_results)} results") + assert len(search_results) > 0, "Search should return results" + + # Step 3: Verify last_accessed was set and get data_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + result = await session.execute( + select(Data, DatasetData) + .join(DatasetData, Data.id == DatasetData.data_id) + .where(DatasetData.dataset_id == dataset_id) + ) + data_records = result.all() + assert len(data_records) > 0, "No Data records found for the dataset" + data_record = data_records[0][0] + data_id = data_record.id + + # Verify last_accessed is set + assert data_record.last_accessed is not None, ( + "last_accessed should be set after search operation" + ) + + original_last_accessed = data_record.last_accessed + logger.info(f"โœ… last_accessed verified: {original_last_accessed}") + + # Step 4: Manually age the timestamp + minutes_threshold = 30 + aged_timestamp = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold + 10) + + async with db_engine.get_async_session() as session: + stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) + await session.execute(stmt) + await session.commit() + + # Verify timestamp was updated + async with db_engine.get_async_session() as session: + result = await session.execute(select(Data).where(Data.id == data_id)) + updated_data = result.scalar_one_or_none() + assert updated_data is not None, "Data record should exist" + retrieved_timestamp = updated_data.last_accessed + if retrieved_timestamp.tzinfo is None: + retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + assert retrieved_timestamp == aged_timestamp, ( + f"Timestamp should be updated to aged value" + ) + + # Step 5: Test cleanup (document-level is now the default) + from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data + + # First do a dry run + logger.info("Testing dry run...") + dry_run_result = await cleanup_unused_data( + minutes_threshold=10, + dry_run=True, + user_id=user.id + ) + + # Debug: Print the actual result + logger.info(f"Dry run result: {dry_run_result}") - # Verify graph nodes were cleaned up - from cognee.infrastructure.databases.graph import get_graph_engine + assert dry_run_result['status'] == 'dry_run', f"Status should be 'dry_run', got: {dry_run_result['status']}" + assert dry_run_result['unused_count'] > 0, ( + "Should find at least one unused document" + ) + logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") + + # Now run actual cleanup + logger.info("Executing cleanup...") + cleanup_result = await cleanup_unused_data( + minutes_threshold=30, + dry_run=False, + user_id=user.id + ) + + assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" + assert cleanup_result["deleted_count"]["documents"] > 0, ( + "At least one document should be deleted" + ) + logger.info(f"โœ… Cleanup completed. 
Deleted {cleanup_result['deleted_count']['documents']} documents") + + # Step 6: Verify deletion + async with db_engine.get_async_session() as session: + deleted_data = ( + await session.execute(select(Data).where(Data.id == data_id)) + ).scalar_one_or_none() + assert deleted_data is None, "Data record should be deleted" + logger.info("โœ… Confirmed: Data record was deleted") + + logger.info("๐ŸŽ‰ All cleanup tests passed!") + return True - graph_engine = await get_graph_engine() - # Try to find the TextDocument node - it should not exist - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n", - {"id": str(data_id)} - ) - - assert len(result) == 0, ( - "TextDocument node should be deleted from graph database" - ) - logger.info("โœ… Confirmed: TextDocument node was deleted from graph database") - - # Verify vector database was cleaned up - from cognee.infrastructure.databases.vector import get_vector_engine - - vector_engine = get_vector_engine() - - # Check each collection that should have been cleaned up - vector_collections = [ - "DocumentChunk_text", - "Entity_name", - "TextSummary_text" - ] - - for collection_name in vector_collections: - if await vector_engine.has_collection(collection_name): - # Try to retrieve the deleted data points - try: - results = await vector_engine.retrieve(collection_name, [str(data_id)]) - assert len(results) == 0, ( - f"Data points should be deleted from {collection_name} collection" - ) - logger.info(f"โœ… Confirmed: {collection_name} collection is clean") - except Exception as e: - # Collection might be empty or not exist, which is fine - logger.info(f"โœ… Confirmed: {collection_name} collection is empty or doesn't exist") - pass - - logger.info("โœ… Confirmed: Vector database entries were deleted") - - logger.info("๐ŸŽ‰ All cleanup tests passed!") - - return True - - -if __name__ == "__main__": - import asyncio - success = asyncio.run(test_textdocument_cleanup_with_sql()) +if __name__ == "__main__": + import asyncio + success = asyncio.run(test_textdocument_cleanup_with_sql()) exit(0 if success else 1) From 2485c3f5f0c2b25572213fe7638467859679c8d2 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Thu, 11 Dec 2025 12:48:06 +0530 Subject: [PATCH 25/25] fix: only document level deletion --- cognee/infrastructure/engine/models/DataPoint.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 3178713c8..812380eaa 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -43,9 +43,6 @@ class DataPoint(BaseModel): updated_at: int = Field( default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) ) - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) ontology_valid: bool = False version: int = 1 # Default version topological_rank: Optional[int] = 0
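+    # NOTE: last_accessed_at is intentionally no longer tracked on the shared
+    # DataPoint base model; cleanup decisions are made at the document level
+    # via the relational Data.last_accessed column (see cleanup_unused_data
+    # and cognee/tests/test_cleanup_unused_data.py).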