From 3372679f7bb40c01ffd9e337ead27fe9f8981d54 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 29 Oct 2025 20:12:14 +0530 Subject: [PATCH 01/37] feat: adding last_accessed_at field to the models and updating the retrievers to update the timestamp --- .../modules/chunking/models/DocumentChunk.py | 7 +++ cognee/modules/engine/models/Entity.py | 7 ++- cognee/modules/retrieval/chunks_retriever.py | 55 +++++++---------- .../modules/retrieval/summaries_retriever.py | 28 ++++----- .../retrieval/utils/access_tracking.py | 61 +++++++++++++++++++ cognee/tasks/summarization/models.py | 8 ++- 6 files changed, 115 insertions(+), 51 deletions(-) create mode 100644 cognee/modules/retrieval/utils/access_tracking.py diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 9f8c57486..c4c6a2ed3 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,5 +1,7 @@ from typing import List, Union +from pydantic import BaseModel, Field +from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.data.processing.document_types import Document from cognee.modules.engine.models import Entity @@ -22,6 +24,7 @@ class DocumentChunk(DataPoint): - cut_type: The type of cut that defined this chunk. - is_part_of: The document to which this chunk belongs. - contains: A list of entities or events contained within the chunk (default is None). + - last_accessed_at: The timestamp of the last time the chunk was accessed. - metadata: A dictionary to hold meta information related to the chunk, including index fields. 
""" @@ -32,5 +35,9 @@ class DocumentChunk(DataPoint): cut_type: str is_part_of: Document contains: List[Union[Entity, Event]] = None + + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 36da2e344..3e48ea02a 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,11 +1,14 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.models.EntityType import EntityType from typing import Optional - +from datetime import datetime, timezone +from pydantic import BaseModel, Field class Entity(DataPoint): name: str is_a: Optional[EntityType] = None description: str - + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index 94b9d3fb9..74634b71e 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -1,10 +1,11 @@ from typing import Any, Optional - +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError +from datetime import datetime, timezone logger = get_logger("ChunksRetriever") @@ -27,38 +28,26 @@ class ChunksRetriever(BaseRetriever): ): self.top_k = top_k - async def get_context(self, query: str) -> Any: - """ - Retrieves document chunks context based on 
the query. - - Searches for document chunks relevant to the specified query using a vector engine. - Raises a NoDataError if no data is found in the system. - - Parameters: - ----------- - - - query (str): The query string to search for relevant document chunks. - - Returns: - -------- - - - Any: A list of document chunk payloads retrieved from the search. - """ - logger.info( - f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'" - ) - - vector_engine = get_vector_engine() - - try: - found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) - logger.info(f"Found {len(found_chunks)} chunks from vector search") - except CollectionNotFoundError as error: - logger.error("DocumentChunk_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - chunk_payloads = [result.payload for result in found_chunks] - logger.info(f"Returning {len(chunk_payloads)} chunk payloads") + async def get_context(self, query: str) -> Any: + """Retrieves document chunks context based on the query.""" + logger.info( + f"Starting chunk retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" + ) + + vector_engine = get_vector_engine() + + try: + found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) + logger.info(f"Found {len(found_chunks)} chunks from vector search") + + # NEW: Update access timestamps + await update_node_access_timestamps(found_chunks, "DocumentChunk") + except CollectionNotFoundError as error: + logger.error("DocumentChunk_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + chunk_payloads = [result.payload for result in found_chunks] + logger.info(f"Returning {len(chunk_payloads)} chunk payloads") return chunk_payloads async def get_completion( diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 87b224946..7f996274e 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -4,6 +4,7 @@ from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError logger = get_logger("SummariesRetriever") @@ -47,20 +48,19 @@ class SummariesRetriever(BaseRetriever): f"Starting summary retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" ) - vector_engine = get_vector_engine() - - try: - summaries_results = await vector_engine.search( - "TextSummary_text", query, limit=self.top_k - ) - logger.info(f"Found {len(summaries_results)} summaries from vector search") - except CollectionNotFoundError as error: - logger.error("TextSummary_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - summary_payloads = [summary.payload for summary in summaries_results] - logger.info(f"Returning {len(summary_payloads)} summary payloads") - return summary_payloads + vector_engine = get_vector_engine() + + try: + summaries_results = await vector_engine.search( + "TextSummary_text", query, limit=self.top_k + ) + + await update_node_access_timestamps(summaries_results, "TextSummary") + + except CollectionNotFoundError as error: + raise NoDataError("No data found in the system, please add data first.") from error + + return [summary.payload for summary in summaries_results] async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None, **kwargs diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py new file mode 100644 index 000000000..ca5ed88cd --- /dev/null +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -0,0 +1,61 @@ + +"""Utilities for tracking data access in retrievers.""" + +import json +from datetime import datetime, timezone +from typing import List, Any + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.shared.logging_utils import get_logger + +logger = get_logger(__name__) + + +async def update_node_access_timestamps(items: List[Any], node_type: str): + """ + Update last_accessed_at for nodes in Kuzu graph database. 
+ + Parameters + ---------- + items : List[Any] + List of items with payload containing 'id' field (from vector search results) + node_type : str + Type of node to update (e.g., 'DocumentChunk', 'Entity', 'TextSummary') + """ + if not items: + return + + graph_engine = await get_graph_engine() + # Convert to milliseconds since epoch (matching the field format) + timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) + + for item in items: + # Extract ID from payload (vector search results have this structure) + item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") + if not item_id: + continue + + try: + # Get current node properties from Kuzu's Node table + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) WHERE n.type = $node_type RETURN n.properties as props", + {"id": str(item_id), "node_type": node_type} + ) + + if result and len(result) > 0 and result[0][0]: + # Parse existing properties JSON + props = json.loads(result[0][0]) if result[0][0] else {} + # Update last_accessed_at with millisecond timestamp + props["last_accessed_at"] = timestamp_ms + + # Write back to graph database + await graph_engine.query( + "MATCH (n:Node {id: $id}) WHERE n.type = $node_type SET n.properties = $props", + {"id": str(item_id), "node_type": node_type, "props": json.dumps(props)} + ) + except Exception as e: + logger.warning(f"Failed to update timestamp for {node_type} {item_id}: {e}") + continue + + logger.debug(f"Updated access timestamps for {len(items)} {node_type} nodes") + diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 75ed82d50..46f9a8d8b 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,5 +1,7 @@ -from typing import Union +from pydantic import BaseModel, Field +from typing import Union +from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import 
DocumentChunk from cognee.shared.CodeGraphEntities import CodeFile, CodePart @@ -17,7 +19,9 @@ class TextSummary(DataPoint): text: str made_from: DocumentChunk - + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["text"]} From 3f27c5592b58af29369125362510e96b72c56cbc Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 29 Oct 2025 20:17:27 +0530 Subject: [PATCH 02/37] feat: adding last_accessed_at field to the models and updating the retrievers to update the timestamp --- cognee/modules/retrieval/chunks_retriever.py | 48 +++++++++++-------- .../modules/retrieval/summaries_retriever.py | 28 ++++++----- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index 74634b71e..f821fc902 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -29,26 +29,34 @@ class ChunksRetriever(BaseRetriever): self.top_k = top_k async def get_context(self, query: str) -> Any: - """Retrieves document chunks context based on the query.""" - logger.info( - f"Starting chunk retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" - ) - - vector_engine = get_vector_engine() - - try: - found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) - logger.info(f"Found {len(found_chunks)} chunks from vector search") - - # NEW: Update access timestamps - await update_node_access_timestamps(found_chunks, "DocumentChunk") - except CollectionNotFoundError as error: - logger.error("DocumentChunk_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - chunk_payloads = [result.payload for result in found_chunks] - logger.info(f"Returning {len(chunk_payloads)} chunk payloads") - return chunk_payloads + """ + Retrieves document chunks context based on the query. + Searches for document chunks relevant to the specified query using a vector engine. + Raises a NoDataError if no data is found in the system. + Parameters: + ----------- + - query (str): The query string to search for relevant document chunks. + Returns: + -------- + - Any: A list of document chunk payloads retrieved from the search. + """ + logger.info( + f"Starting chunk retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" + ) + + vector_engine = get_vector_engine() + + try: + found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) + logger.info(f"Found {len(found_chunks)} chunks from vector search") + await update_node_access_timestamps(found_chunks, "DocumentChunk") + + except CollectionNotFoundError as error: + logger.error("DocumentChunk_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + chunk_payloads = [result.payload for result in found_chunks] + logger.info(f"Returning {len(chunk_payloads)} chunk payloads") async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 7f996274e..9ac8b096d 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -48,19 +48,23 @@ class SummariesRetriever(BaseRetriever): f"Starting summary retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" ) - vector_engine = get_vector_engine() - - try: - summaries_results = await vector_engine.search( - "TextSummary_text", query, limit=self.top_k - ) - + vector_engine = get_vector_engine() + + try: + summaries_results = await vector_engine.search( + "TextSummary_text", query, limit=self.top_k + ) + logger.info(f"Found {len(summaries_results)} summaries from vector search") + await update_node_access_timestamps(summaries_results, "TextSummary") - - except CollectionNotFoundError as error: - raise NoDataError("No data found in the system, please add data first.") from error - - return [summary.payload for summary in summaries_results] + + except CollectionNotFoundError as error: + logger.error("TextSummary_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + summary_payloads = [summary.payload for summary in summaries_results] + logger.info(f"Returning {len(summary_payloads)} summary payloads") + return summary_payloads async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None, **kwargs From 5f6f0502c832d129749b453121c6f5be565044bc Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 00:00:18 +0530 Subject: [PATCH 03/37] fix: removing last_acessed_at from individual model and adding it to DataPoint --- cognee/infrastructure/engine/models/DataPoint.py | 3 +++ cognee/modules/chunking/models/DocumentChunk.py | 5 ----- cognee/modules/engine/models/Entity.py | 3 --- cognee/tasks/summarization/models.py | 3 --- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 812380eaa..3178713c8 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -43,6 +43,9 @@ class DataPoint(BaseModel): updated_at: int = Field( 
default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) ) + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) ontology_valid: bool = False version: int = 1 # Default version topological_rank: Optional[int] = 0 diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index c4c6a2ed3..601454802 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -35,9 +35,4 @@ class DocumentChunk(DataPoint): cut_type: str is_part_of: Document contains: List[Union[Entity, Event]] = None - - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) - metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 3e48ea02a..4083cd2e6 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -8,7 +8,4 @@ class Entity(DataPoint): name: str is_a: Optional[EntityType] = None description: str - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 46f9a8d8b..8cee2ade3 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -19,9 +19,6 @@ class TextSummary(DataPoint): text: str made_from: DocumentChunk - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) metadata: dict = {"index_fields": ["text"]} From 6f06e4a5eb1143ddcb2ad08132486630b8a2deae Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 00:17:13 +0530 Subject: [PATCH 04/37] fix: removing node_type and try except --- cognee/modules/retrieval/chunks_retriever.py | 
2 +- .../modules/retrieval/summaries_retriever.py | 2 +- .../retrieval/utils/access_tracking.py | 55 ++++++++++--------- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index f821fc902..be1f95811 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -49,7 +49,7 @@ class ChunksRetriever(BaseRetriever): try: found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) logger.info(f"Found {len(found_chunks)} chunks from vector search") - await update_node_access_timestamps(found_chunks, "DocumentChunk") + await update_node_access_timestamps(found_chunks) except CollectionNotFoundError as error: logger.error("DocumentChunk_text collection not found in vector database") diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 9ac8b096d..0df750d22 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -56,7 +56,7 @@ class SummariesRetriever(BaseRetriever): ) logger.info(f"Found {len(summaries_results)} summaries from vector search") - await update_node_access_timestamps(summaries_results, "TextSummary") + await update_node_access_timestamps(summaries_results) except CollectionNotFoundError as error: logger.error("TextSummary_text collection not found in vector database") diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index ca5ed88cd..79afd25db 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -1,4 +1,4 @@ - + """Utilities for tracking data access in retrievers.""" import json @@ -11,51 +11,54 @@ from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) -async def update_node_access_timestamps(items: 
List[Any], node_type: str): +async def update_node_access_timestamps(items: List[Any]): """ Update last_accessed_at for nodes in Kuzu graph database. + Automatically determines node type from the graph database. Parameters ---------- items : List[Any] List of items with payload containing 'id' field (from vector search results) - node_type : str - Type of node to update (e.g., 'DocumentChunk', 'Entity', 'TextSummary') """ if not items: return graph_engine = await get_graph_engine() - # Convert to milliseconds since epoch (matching the field format) timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) for item in items: - # Extract ID from payload (vector search results have this structure) + # Extract ID from payload item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if not item_id: continue - try: - # Get current node properties from Kuzu's Node table - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) WHERE n.type = $node_type RETURN n.properties as props", - {"id": str(item_id), "node_type": node_type} + # try: + # Query to get both node type and properties in one call + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.type as node_type, n.properties as props", + {"id": str(item_id)} + ) + + if result and len(result) > 0 and result[0]: + node_type = result[0][0] # First column: node_type + props_json = result[0][1] # Second column: properties + + # Parse existing properties JSON + props = json.loads(props_json) if props_json else {} + # Update last_accessed_at with millisecond timestamp + props["last_accessed_at"] = timestamp_ms + + # Write back to graph database + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": str(item_id), "props": json.dumps(props)} ) - if result and len(result) > 0 and result[0][0]: - # Parse existing properties JSON - props = json.loads(result[0][0]) if result[0][0] else {} - # Update last_accessed_at with millisecond 
timestamp - props["last_accessed_at"] = timestamp_ms + logger.debug(f"Updated access timestamp for {node_type} node {item_id}") - # Write back to graph database - await graph_engine.query( - "MATCH (n:Node {id: $id}) WHERE n.type = $node_type SET n.properties = $props", - {"id": str(item_id), "node_type": node_type, "props": json.dumps(props)} - ) - except Exception as e: - logger.warning(f"Failed to update timestamp for {node_type} {item_id}: {e}") - continue + # except Exception as e: + # logger.error(f"Failed to update timestamp for node {item_id}: {e}") + # continue - logger.debug(f"Updated access timestamps for {len(items)} {node_type} nodes") - + logger.debug(f"Updated access timestamps for {len(items)} nodes") From f1afd1f0a2a5433dc341c485b08ce33d1bc16252 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 15:49:34 +0530 Subject: [PATCH 05/37] feat: adding cleanup function and adding update_node_acess_timestamps in completion retriever and graph_completion retriever --- .../modules/retrieval/completion_retriever.py | 3 +- .../retrieval/graph_completion_retriever.py | 13 +- cognee/tasks/cleanup/cleanup_unused_data.py | 232 ++++++++++++++++++ 3 files changed, 246 insertions(+), 2 deletions(-) create mode 100644 cognee/tasks/cleanup/cleanup_unused_data.py diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index bb568924d..fc8ef747f 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -8,6 +8,7 @@ from cognee.modules.retrieval.utils.session_cache import ( save_conversation_history, get_conversation_history, ) +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError @@ 
-65,7 +66,7 @@ class CompletionRetriever(BaseRetriever): if len(found_chunks) == 0: return "" - + await update_node_access_timestamps(found_chunks) # Combine all chunks text returned from vector search (number of chunks is determined by top_k chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks] combined_context = "\n".join(chunks_payload) diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index b7ab4edae..ac7e45e3c 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -16,6 +16,7 @@ from cognee.modules.retrieval.utils.session_cache import ( ) from cognee.shared.logging_utils import get_logger from cognee.modules.retrieval.utils.extract_uuid_from_node import extract_uuid_from_node +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.modules.retrieval.utils.models import CogneeUserInteraction from cognee.modules.engine.models.node_set import NodeSet from cognee.infrastructure.databases.graph import get_graph_engine @@ -138,7 +139,17 @@ class GraphCompletionRetriever(BaseGraphRetriever): return [] # context = await self.resolve_edges_to_text(triplets) - + entity_nodes = [] + seen_ids = set() + for triplet in triplets: + if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node1.id)}) + seen_ids.add(triplet.node1.id) + if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node2.id)}) + seen_ids.add(triplet.node2.id) + + await update_node_access_timestamps(entity_nodes) return triplets async def get_completion( diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py new file mode 100644 index 000000000..e97692bb4 --- /dev/null +++ 
b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -0,0 +1,232 @@ +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.shared.logging_utils import get_logger + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: int = 30, + dry_run: bool = True, + user_id: Optional[UUID] = None +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused (default: 30) + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None + ) + + # Calculate cutoff timestamp in milliseconds + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in 
unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None +) -> Dict[str, list]: + """ + Query Kuzu for nodes with old last_accessed_at timestamps. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Query all nodes with their properties + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = { + "DocumentChunk": [], + "Entity": [], + "TextSummary": [] + } + + for node_id, node_type, props_json in results: + # Only process tracked node types + if node_type not in unused_nodes: + continue + + # Parse properties JSON + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + # Check if node is unused (never accessed or accessed before cutoff) + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + # TODO: Add user_id filtering when user ownership is implemented + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + if result and len(result) > 0: + deleted_counts["associations"] += result[0][0] + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + # Delete from vector collection + if await vector_engine.has_collection(collection_name): + for node_id in node_ids: + try: + await vector_engine.delete(collection_name, {"id": str(node_id)}) + except Exception as e: + logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + + return deleted_counts From 
5080e8f8a5c20d092b917b66eb52a577fe899231 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 3 Nov 2025 00:59:04 +0530 Subject: [PATCH 06/37] feat: genarlizing getting entities from triplets --- cognee/modules/graph/utils/__init__.py | 1 + .../graph/utils/get_entity_nodes_from_triplets.py | 13 +++++++++++++ .../modules/retrieval/graph_completion_retriever.py | 12 +++--------- 3 files changed, 17 insertions(+), 9 deletions(-) create mode 100644 cognee/modules/graph/utils/get_entity_nodes_from_triplets.py diff --git a/cognee/modules/graph/utils/__init__.py b/cognee/modules/graph/utils/__init__.py index ebc648495..4c0b29d47 100644 --- a/cognee/modules/graph/utils/__init__.py +++ b/cognee/modules/graph/utils/__init__.py @@ -5,3 +5,4 @@ from .retrieve_existing_edges import retrieve_existing_edges from .convert_node_to_data_point import convert_node_to_data_point from .deduplicate_nodes_and_edges import deduplicate_nodes_and_edges from .resolve_edges_to_text import resolve_edges_to_text +from .get_entity_nodes_from_triplets import get_entity_nodes_from_triplets diff --git a/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py b/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py new file mode 100644 index 000000000..598a36854 --- /dev/null +++ b/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py @@ -0,0 +1,13 @@ + +def get_entity_nodes_from_triplets(triplets): + entity_nodes = [] + seen_ids = set() + for triplet in triplets: + if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node1.id)}) + seen_ids.add(triplet.node1.id) + if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node2.id)}) + seen_ids.add(triplet.node2.id) + + return entity_nodes diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index ac7e45e3c..122cc943f 100644 
--- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -22,6 +22,7 @@ from cognee.modules.engine.models.node_set import NodeSet from cognee.infrastructure.databases.graph import get_graph_engine from cognee.context_global_variables import session_user from cognee.infrastructure.databases.cache.config import CacheConfig +from cognee.modules.graph.utils import get_entity_nodes_from_triplets logger = get_logger("GraphCompletionRetriever") @@ -139,15 +140,8 @@ class GraphCompletionRetriever(BaseGraphRetriever): return [] # context = await self.resolve_edges_to_text(triplets) - entity_nodes = [] - seen_ids = set() - for triplet in triplets: - if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids: - entity_nodes.append({"id": str(triplet.node1.id)}) - seen_ids.add(triplet.node1.id) - if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids: - entity_nodes.append({"id": str(triplet.node2.id)}) - seen_ids.add(triplet.node2.id) + + entity_nodes = get_entity_nodes_from_triplets(triplets) await update_node_access_timestamps(entity_nodes) return triplets From 90d10e6f9af50c85fbbf282dd961719d5da7f922 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 3 Nov 2025 15:31:09 +0100 Subject: [PATCH 07/37] test: Add docs tests. Initial commit, still WIP. 
--- .github/workflows/docs_tests.yml | 18 ++++++ .../tests/docs/guides/custom_data_models.py | 38 +++++++++++++ cognee/tests/docs/guides/custom_prompts.py | 30 ++++++++++ .../docs/guides/custom_tasks_and_pipelines.py | 53 +++++++++++++++++ .../tests/docs/guides/graph_visualization.py | 13 +++++ cognee/tests/docs/guides/low_level_llm.py | 31 ++++++++++ cognee/tests/docs/guides/memify_quickstart.py | 29 ++++++++++ .../tests/docs/guides/ontology_quickstart.py | 30 ++++++++++ cognee/tests/docs/guides/s3_storage.py | 25 ++++++++ cognee/tests/docs/guides/search_basics.py | 17 ++++++ cognee/tests/docs/guides/temporal_cognify.py | 57 +++++++++++++++++++ 11 files changed, 341 insertions(+) create mode 100644 .github/workflows/docs_tests.yml create mode 100644 cognee/tests/docs/guides/custom_data_models.py create mode 100644 cognee/tests/docs/guides/custom_prompts.py create mode 100644 cognee/tests/docs/guides/custom_tasks_and_pipelines.py create mode 100644 cognee/tests/docs/guides/graph_visualization.py create mode 100644 cognee/tests/docs/guides/low_level_llm.py create mode 100644 cognee/tests/docs/guides/memify_quickstart.py create mode 100644 cognee/tests/docs/guides/ontology_quickstart.py create mode 100644 cognee/tests/docs/guides/s3_storage.py create mode 100644 cognee/tests/docs/guides/search_basics.py create mode 100644 cognee/tests/docs/guides/temporal_cognify.py diff --git a/.github/workflows/docs_tests.yml b/.github/workflows/docs_tests.yml new file mode 100644 index 000000000..b3c538668 --- /dev/null +++ b/.github/workflows/docs_tests.yml @@ -0,0 +1,18 @@ +name: Docs Test Suite +permissions: + contents: read + +on: + release: + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + RUNTIME__LOG_LEVEL: ERROR + ENV: 'dev' + +jobs: + diff --git a/cognee/tests/docs/guides/custom_data_models.py b/cognee/tests/docs/guides/custom_data_models.py new file mode 100644 index 000000000..0eb314227 --- 
/dev/null +++ b/cognee/tests/docs/guides/custom_data_models.py @@ -0,0 +1,38 @@ +import asyncio +from typing import Any +from pydantic import SkipValidation + +import cognee +from cognee.infrastructure.engine import DataPoint +from cognee.infrastructure.engine.models.Edge import Edge +from cognee.tasks.storage import add_data_points + + +class Person(DataPoint): + name: str + # Keep it simple for forward refs / mixed values + knows: SkipValidation[Any] = None # single Person or list[Person] + # Recommended: specify which fields to index for search + metadata: dict = {"index_fields": ["name"]} + + +async def main(): + # Start clean (optional in your app) + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + alice = Person(name="Alice") + bob = Person(name="Bob") + charlie = Person(name="Charlie") + + # Create relationships - field name becomes edge label + alice.knows = bob + # You can also do lists: alice.knows = [bob, charlie] + + # Optional: add weights and custom relationship types + bob.knows = (Edge(weight=0.9, relationship_type="friend_of"), charlie) + + await add_data_points([alice, bob, charlie]) + + +asyncio.run(main()) diff --git a/cognee/tests/docs/guides/custom_prompts.py b/cognee/tests/docs/guides/custom_prompts.py new file mode 100644 index 000000000..0d0a55a80 --- /dev/null +++ b/cognee/tests/docs/guides/custom_prompts.py @@ -0,0 +1,30 @@ +import asyncio +import cognee +from cognee.api.v1.search import SearchType + +custom_prompt = """ +Extract only people and cities as entities. +Connect people to cities with the relationship "lives_in". +Ignore all other entities. +""" + + +async def main(): + await cognee.add( + [ + "Alice moved to Paris in 2010, while Bob has always lived in New York.", + "Andreas was born in Venice, but later settled in Lisbon.", + "Diana and Tom were born and raised in Helsingy. 
Diana currently resides in Berlin, while Tom never moved.", + ] + ) + await cognee.cognify(custom_prompt=custom_prompt) + + res = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="Where does Alice live?", + ) + print(res) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/custom_tasks_and_pipelines.py b/cognee/tests/docs/guides/custom_tasks_and_pipelines.py new file mode 100644 index 000000000..202bb128a --- /dev/null +++ b/cognee/tests/docs/guides/custom_tasks_and_pipelines.py @@ -0,0 +1,53 @@ +import asyncio +from typing import Any, Dict, List +from pydantic import BaseModel, SkipValidation + +import cognee +from cognee.modules.engine.operations.setup import setup +from cognee.infrastructure.llm.LLMGateway import LLMGateway +from cognee.infrastructure.engine import DataPoint +from cognee.tasks.storage import add_data_points +from cognee.modules.pipelines import Task, run_pipeline + + +class Person(DataPoint): + name: str + # Optional relationships (we'll let the LLM populate this) + knows: List["Person"] = [] + # Make names searchable in the vector store + metadata: Dict[str, Any] = {"index_fields": ["name"]} + + +class People(BaseModel): + persons: List[Person] + + +async def extract_people(text: str) -> List[Person]: + system_prompt = ( + "Extract people mentioned in the text. " + "Return as `persons: Person[]` with each Person having `name` and optional `knows` relations. " + "If the text says someone knows someone set `knows` accordingly. " + "Only include facts explicitly stated." + ) + people = await LLMGateway.acreate_structured_output(text, system_prompt, People) + return people.persons + + +async def main(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await setup() + + text = "Alice knows Bob." 
+ + tasks = [ + Task(extract_people), # input: text -> output: list[Person] + Task(add_data_points), # input: list[Person] -> output: list[Person] + ] + + async for _ in run_pipeline(tasks=tasks, data=text, datasets=["people_demo"]): + pass + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/graph_visualization.py b/cognee/tests/docs/guides/graph_visualization.py new file mode 100644 index 000000000..d463cbb56 --- /dev/null +++ b/cognee/tests/docs/guides/graph_visualization.py @@ -0,0 +1,13 @@ +import asyncio +import cognee +from cognee.api.v1.visualize.visualize import visualize_graph + + +async def main(): + await cognee.add(["Alice knows Bob.", "NLP is a subfield of CS."]) + await cognee.cognify() + + await visualize_graph("./graph_after_cognify.html") + + +asyncio.run(main()) diff --git a/cognee/tests/docs/guides/low_level_llm.py b/cognee/tests/docs/guides/low_level_llm.py new file mode 100644 index 000000000..454f53f44 --- /dev/null +++ b/cognee/tests/docs/guides/low_level_llm.py @@ -0,0 +1,31 @@ +import asyncio + +from pydantic import BaseModel +from typing import List +from cognee.infrastructure.llm.LLMGateway import LLMGateway + + +class MiniEntity(BaseModel): + name: str + type: str + + +class MiniGraph(BaseModel): + nodes: List[MiniEntity] + + +async def main(): + system_prompt = ( + "Extract entities as nodes with name and type. " + "Use concise, literal values present in the text." + ) + + text = "Apple develops iPhone; Audi produces the R8." 
+ + result = await LLMGateway.acreate_structured_output(text, system_prompt, MiniGraph) + print(result) + # MiniGraph(nodes=[MiniEntity(name='Apple', type='Organization'), ...]) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/memify_quickstart.py b/cognee/tests/docs/guides/memify_quickstart.py new file mode 100644 index 000000000..040654350 --- /dev/null +++ b/cognee/tests/docs/guides/memify_quickstart.py @@ -0,0 +1,29 @@ +import asyncio +import cognee +from cognee import SearchType + + +async def main(): + # 1) Add two short chats and build a graph + await cognee.add( + [ + "We follow PEP8. Add type hints and docstrings.", + "Releases should not be on Friday. Susan must review PRs.", + ], + dataset_name="rules_demo", + ) + await cognee.cognify(datasets=["rules_demo"]) # builds graph + + # 2) Enrich the graph (uses default memify tasks) + await cognee.memify(dataset="rules_demo") + + # 3) Query the new coding rules + rules = await cognee.search( + query_type=SearchType.CODING_RULES, + query_text="List coding rules", + node_name=["coding_agent_rules"], + ) + print("Rules:", rules) + + +asyncio.run(main()) diff --git a/cognee/tests/docs/guides/ontology_quickstart.py b/cognee/tests/docs/guides/ontology_quickstart.py new file mode 100644 index 000000000..2784dab19 --- /dev/null +++ b/cognee/tests/docs/guides/ontology_quickstart.py @@ -0,0 +1,30 @@ +import asyncio +import cognee + + +async def main(): + texts = ["Audi produces the R8 and e-tron.", "Apple develops iPhone and MacBook."] + + await cognee.add(texts) + # or: await cognee.add("/path/to/folder/of/files") + + import os + from cognee.modules.ontology.ontology_config import Config + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + + ontology_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "ontology_input_example/basic_ontology.owl" + ) + + # Create full config structure manually + config: Config = { + 
"ontology_config": { + "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_path) + } + } + + await cognee.cognify(config=config) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/s3_storage.py b/cognee/tests/docs/guides/s3_storage.py new file mode 100644 index 000000000..1044e05b4 --- /dev/null +++ b/cognee/tests/docs/guides/s3_storage.py @@ -0,0 +1,25 @@ +import asyncio +import cognee + + +async def main(): + # Single file + await cognee.add("s3://cognee-temp/2024-11-04.md") + + # Folder/prefix (recursively expands) + await cognee.add("s3://cognee-temp") + + # Mixed list + await cognee.add( + [ + "s3://cognee-temp/2024-11-04.md", + "Some inline text to ingest", + ] + ) + + # Process the data + await cognee.cognify() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/search_basics.py b/cognee/tests/docs/guides/search_basics.py new file mode 100644 index 000000000..67d0c938d --- /dev/null +++ b/cognee/tests/docs/guides/search_basics.py @@ -0,0 +1,17 @@ +import asyncio +import cognee + + +async def main(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + text = "First rule of coding: Do not talk about coding." + + # Make sure you've already run cognee.cognify(...) so the graph has content + answers = await cognee.search(query_text="What are the main themes in my data?") + for answer in answers: + print(answer) + + +asyncio.run(main()) diff --git a/cognee/tests/docs/guides/temporal_cognify.py b/cognee/tests/docs/guides/temporal_cognify.py new file mode 100644 index 000000000..34c1ee33c --- /dev/null +++ b/cognee/tests/docs/guides/temporal_cognify.py @@ -0,0 +1,57 @@ +import asyncio +import cognee + + +async def main(): + text = """ + In 1998 the project launched. In 2001 version 1.0 shipped. In 2004 the team merged + with another group. In 2010 support for v1 ended. 
+ """ + + await cognee.add(text, dataset_name="timeline_demo") + + await cognee.cognify(datasets=["timeline_demo"], temporal_cognify=True) + + from cognee.api.v1.search import SearchType + + # Before / after queries + result = await cognee.search( + query_type=SearchType.TEMPORAL, query_text="What happened before 2000?", top_k=10 + ) + + assert result != [] + + result = await cognee.search( + query_type=SearchType.TEMPORAL, query_text="What happened after 2010?", top_k=10 + ) + + assert result != [] + + # Between queries + result = await cognee.search( + query_type=SearchType.TEMPORAL, query_text="Events between 2001 and 2004", top_k=10 + ) + + assert result != [] + + # Scoped descriptions + result = await cognee.search( + query_type=SearchType.TEMPORAL, + query_text="Key project milestones between 1998 and 2010", + top_k=10, + ) + + assert result != [] + + result = await cognee.search( + query_type=SearchType.TEMPORAL, + query_text="What happened after 2004?", + datasets=["timeline_demo"], + top_k=10, + ) + + assert result != [] + + +if __name__ == "__main__": + asyncio.run(main()) From d34fd9237bf41c6b421bd556541b50ea68246e45 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 4 Nov 2025 22:04:32 +0530 Subject: [PATCH 08/37] feat: adding last_acessed in the Data model --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 30 ++++++ cognee/modules/data/models/Data.py | 1 + .../retrieval/utils/access_tracking.py | 102 ++++++++++++------ 3 files changed, 100 insertions(+), 33 deletions(-) create mode 100644 alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py new file mode 100644 index 000000000..0ccefa63b --- /dev/null +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -0,0 +1,30 @@ +"""add_last_accessed_to_data + +Revision ID: e1ec1dcb50b6 +Revises: 211ab850ef3d +Create Date: 2025-11-04 21:45:52.642322 + +""" 
+from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'e1ec1dcb50b6' +down_revision: Union[str, None] = '211ab850ef3d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + # Optionally initialize with created_at values for existing records + op.execute("UPDATE data SET last_accessed = created_at") + + +def downgrade() -> None: + op.drop_column('data', 'last_accessed') diff --git a/cognee/modules/data/models/Data.py b/cognee/modules/data/models/Data.py index ef228f2e1..27ab7481e 100644 --- a/cognee/modules/data/models/Data.py +++ b/cognee/modules/data/models/Data.py @@ -36,6 +36,7 @@ class Data(Base): data_size = Column(Integer, nullable=True) # File size in bytes created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) + last_accessed = Column(DateTime(timezone=True), nullable=True) datasets = relationship( "Dataset", diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 79afd25db..621e09e27 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -1,20 +1,27 @@ - """Utilities for tracking data access in retrievers.""" import json from datetime import datetime, timezone from typing import List, Any +from uuid import UUID from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger +from sqlalchemy import update logger = get_logger(__name__) async def 
update_node_access_timestamps(items: List[Any]): """ - Update last_accessed_at for nodes in Kuzu graph database. - Automatically determines node type from the graph database. + Update last_accessed_at for nodes in graph database and corresponding Data records in SQL. + + This function: + 1. Updates last_accessed_at in the graph database nodes (in properties JSON) + 2. Traverses to find origin TextDocument nodes + 3. Updates last_accessed in the SQL Data table for those documents Parameters ---------- @@ -26,39 +33,68 @@ async def update_node_access_timestamps(items: List[Any]): graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) + timestamp_dt = datetime.now(timezone.utc) + # Extract node IDs + node_ids = [] for item in items: - # Extract ID from payload item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") - if not item_id: - continue - - # try: - # Query to get both node type and properties in one call - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.type as node_type, n.properties as props", - {"id": str(item_id)} - ) - - if result and len(result) > 0 and result[0]: - node_type = result[0][0] # First column: node_type - props_json = result[0][1] # Second column: properties - - # Parse existing properties JSON - props = json.loads(props_json) if props_json else {} - # Update last_accessed_at with millisecond timestamp - props["last_accessed_at"] = timestamp_ms - - # Write back to graph database - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": str(item_id), "props": json.dumps(props)} + if item_id: + node_ids.append(str(item_id)) + + if not node_ids: + return + + try: + # Step 1: Batch update graph nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} ) - logger.debug(f"Updated access timestamp for {node_type} node {item_id}") + if 
result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms - # except Exception as e: - # logger.error(f"Failed to update timestamp for node {item_id}: {e}") - # continue - - logger.debug(f"Updated access timestamps for {len(items)} nodes") + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + + logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") + + # Step 2: Find origin TextDocument nodes + origin_query = """ + UNWIND $node_ids AS node_id + MATCH (n:Node {id: node_id}) + OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) + WHERE (e.relationship_name = 'contains' OR e.relationship_name = 'made_from') + AND chunk.type = 'DocumentChunk' + OPTIONAL MATCH (chunk)-[e2:EDGE]->(doc:Node) + WHERE e2.relationship_name = 'is_part_of' + AND doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] + RETURN DISTINCT doc.id as doc_id + """ + + result = await graph_engine.query(origin_query, {"node_ids": node_ids}) + + # Extract document IDs + doc_ids = [row[0] for row in result if row and row[0]] if result else [] + + # Step 3: Update SQL Data table + if doc_ids: + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + stmt = update(Data).where( + Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) + ).values(last_accessed=timestamp_dt) + + await session.execute(stmt) + await session.commit() + + logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL") + + except Exception as e: + logger.error(f"Failed to update timestamps: {e}") + raise From 3c0e915812a4ffb8662419647572c6229ed963a9 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 12:25:51 +0530 Subject: [PATCH 09/37] fix: removing hard relations --- .../modules/retrieval/utils/access_tracking.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 
deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 621e09e27..36c0b7f50 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -20,7 +20,7 @@ async def update_node_access_timestamps(items: List[Any]): This function: 1. Updates last_accessed_at in the graph database nodes (in properties JSON) - 2. Traverses to find origin TextDocument nodes + 2. Traverses to find origin TextDocument nodes (without hardcoded relationship names) 3. Updates last_accessed in the SQL Data table for those documents Parameters @@ -64,23 +64,21 @@ async def update_node_access_timestamps(items: List[Any]): logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") - # Step 2: Find origin TextDocument nodes + # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) origin_query = """ UNWIND $node_ids AS node_id MATCH (n:Node {id: node_id}) OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) - WHERE (e.relationship_name = 'contains' OR e.relationship_name = 'made_from') - AND chunk.type = 'DocumentChunk' - OPTIONAL MATCH (chunk)-[e2:EDGE]->(doc:Node) - WHERE e2.relationship_name = 'is_part_of' - AND doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] + WHERE chunk.type = 'DocumentChunk' + OPTIONAL MATCH (chunk)-[e2:EDGE]-(doc:Node) + WHERE doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] RETURN DISTINCT doc.id as doc_id """ result = await graph_engine.query(origin_query, {"node_ids": node_ids}) - # Extract document IDs - doc_ids = [row[0] for row in result if row and row[0]] if result else [] + # Extract and deduplicate document IDs + doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else [] # Step 3: Update SQL Data table if doc_ids: From 9041a804ecc2d0be1903c2de0ac875f32fcc553c Mon Sep 17 
00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 18:32:49 +0530 Subject: [PATCH 10/37] fix: add text_doc flag --- cognee/tasks/cleanup/cleanup_unused_data.py | 520 ++++++++++++-------- 1 file changed, 312 insertions(+), 208 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index e97692bb4..c9c711fe2 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,232 +1,336 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID - -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.shared.logging_utils import get_logger - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. 
+""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: int = 30, + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused (default: 30) + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes directly from graph + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} 
({cutoff_timestamp_ms}ms)") + + # Find unused nodes + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: int = 30, - dry_run: bool = True, +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused (default: 30) + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) - user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - - Returns - ------- - Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None - ) - - # Calculate cutoff timestamp in milliseconds - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") - - # Find unused nodes - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) - - return { - "status": "completed", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - 
"associations": deleted_counts["associations"] - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None -) -> Dict[str, list]: - """ - Query Kuzu for nodes with old last_accessed_at timestamps. - - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + If True, only report what would be deleted user_id : UUID, optional Filter by user ID if provided Returns ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs + Dict[str, Any] + Cleanup results """ - graph_engine = await get_graph_engine() + db_engine = get_relational_engine() - # Query all nodes with their properties - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = { - "DocumentChunk": [], - "Entity": [], - "TextSummary": [] - } - - for node_id, node_type, props_json in results: - # Only process tracked node types - if node_type not in unused_nodes: - continue - - # Parse properties JSON - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - # Check if node is unused (never accessed or accessed before cutoff) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - # TODO: Add user_id filtering when user ownership is implemented - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) ) - if result and len(result) > 0: - deleted_counts["associations"] += result[0][0] + ) + + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + + result = await session.execute(query) + unused_data = result.all() - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) + logger.info(f"Found {len(unused_data)} unused documents in SQL") - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } + if dry_run: + return { + "status": "dry_run", + "unused_count": len(unused_data), 
+ "deleted_count": { + "data_items": 0, + "documents": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: try: - # Delete from vector collection - if await vector_engine.has_collection(collection_name): - for node_id in node_ids: - try: - await vector_engine.delete(collection_name, {"id": str(node_id)}) - except Exception as e: - logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") + logger.error(f"Failed to delete document {data.id}: {e}") + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None +) -> Dict[str, list]: + """ + Query Kuzu for nodes with old last_accessed_at timestamps. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Query all nodes with their properties + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = { + "DocumentChunk": [], + "Entity": [], + "TextSummary": [] + } + + for node_id, node_type, props_json in results: + # Only process tracked node types + if node_type not in unused_nodes: + continue + + # Parse properties JSON + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + # Check if node is unused (never accessed or accessed before cutoff) + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + if result and len(result) > 0: + deleted_counts["associations"] += result[0][0] + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + # Delete from vector collection + if await vector_engine.has_collection(collection_name): + for node_id in node_ids: + try: + await vector_engine.delete(collection_name, {"id": str(node_id)}) + except Exception as e: + logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 
ff263c0132b170b3c03961606db56c2a174d2b90 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 18:40:58 +0530 Subject: [PATCH 11/37] fix: add column check in migration --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index 0ccefa63b..267e11fb2 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -17,14 +17,30 @@ down_revision: Union[str, None] = '211ab850ef3d' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + def upgrade() -> None: - op.add_column('data', - sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) - ) - # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = created_at") + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if not last_accessed_column: + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + # Optionally initialize with created_at values for existing records + op.execute("UPDATE data SET last_accessed = created_at") def downgrade() -> None: - op.drop_column('data', 'last_accessed') + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if last_accessed_column: + op.drop_column('data', 'last_accessed') From c5f0c4af87ff13bf8e3cbe0f4e9163ece44c3094 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 20:22:17 +0530 Subject: [PATCH 12/37] fix: add text_doc flag --- 
cognee/modules/retrieval/utils/access_tracking.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 36c0b7f50..65d597a93 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -67,12 +67,9 @@ async def update_node_access_timestamps(items: List[Any]): # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) origin_query = """ UNWIND $node_ids AS node_id - MATCH (n:Node {id: node_id}) - OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) - WHERE chunk.type = 'DocumentChunk' - OPTIONAL MATCH (chunk)-[e2:EDGE]-(doc:Node) - WHERE doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] - RETURN DISTINCT doc.id as doc_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id """ result = await graph_engine.query(origin_query, {"node_ids": node_ids}) From fdf037b3d0117bd29f0c541ed027895c070678df Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Thu, 6 Nov 2025 23:00:56 +0530 Subject: [PATCH 13/37] fix: min to days --- cognee/tasks/cleanup/cleanup_unused_data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index c9c711fe2..4df622a2c 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -23,7 +23,7 @@ logger = get_logger(__name__) async def cleanup_unused_data( - minutes_threshold: int = 30, + days_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None, text_doc: bool = False @@ -33,8 +33,8 @@ async def cleanup_unused_data( Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused 
(default: 30) + days_threshold : int + days since last access to consider data unused dry_run : bool If True, only report what would be deleted without actually deleting (default: True) user_id : UUID, optional @@ -50,14 +50,14 @@ async def cleanup_unused_data( """ logger.info( "Starting cleanup task", - minutes_threshold=minutes_threshold, + days_threshold=days_threshold, dry_run=dry_run, user_id=str(user_id) if user_id else None, text_doc=text_doc ) # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold) if text_doc: # SQL-based approach: Find unused TextDocuments and use cognee.delete() From 84c8e07ddd980af7c11b89c7e510b38e5c44f119 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 7 Nov 2025 12:03:17 +0530 Subject: [PATCH 14/37] fix: remove uneccessary imports --- cognee/modules/chunking/models/DocumentChunk.py | 2 -- cognee/modules/engine/models/Entity.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index a9fb08a9e..e2b216a9b 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,7 +1,5 @@ from typing import List, Union -from pydantic import BaseModel, Field -from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine.models.Edge import Edge from cognee.modules.data.processing.document_types import Document diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 4083cd2e6..a34a6503c 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,8 +1,6 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.models.EntityType import EntityType from typing import Optional -from datetime import 
datetime, timezone -from pydantic import BaseModel, Field class Entity(DataPoint): name: str From 84bd2f38f7513c244ed1040937a1e5a5297cec2e Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 7 Nov 2025 12:12:46 +0530 Subject: [PATCH 15/37] fix: remove uneccessary imports --- cognee/tasks/summarization/models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 8cee2ade3..8420cfaa5 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,7 +1,5 @@ -from pydantic import BaseModel, Field from typing import Union -from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import DocumentChunk from cognee.shared.CodeGraphEntities import CodeFile, CodePart From d351c9a009d12a8a8a4869afa7aee38c61482e21 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 10 Nov 2025 21:58:01 +0530 Subject: [PATCH 16/37] fix: return chunk payload --- cognee/modules/retrieval/chunks_retriever.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index be1f95811..b7a90238a 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -57,6 +57,7 @@ class ChunksRetriever(BaseRetriever): chunk_payloads = [result.payload for result in found_chunks] logger.info(f"Returning {len(chunk_payloads)} chunk payloads") + return chunk_payloads async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None From ac3300760b7a521aebe452d041bb7ceaa35f8052 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 12 Nov 2025 12:26:28 +0100 Subject: [PATCH 17/37] test: add search tests docs --- cognee/tests/docs/guides/search_basics.py | 46 +++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git 
a/cognee/tests/docs/guides/search_basics.py b/cognee/tests/docs/guides/search_basics.py index 67d0c938d..09dee3f92 100644 --- a/cognee/tests/docs/guides/search_basics.py +++ b/cognee/tests/docs/guides/search_basics.py @@ -1,17 +1,57 @@ import asyncio import cognee +from cognee.modules.search.types import SearchType, CombinedSearchResult + async def main(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - text = "First rule of coding: Do not talk about coding." + text = """ + Natural language processing (NLP) is an interdisciplinary + subfield of computer science and information retrieval. + First rule of coding: Do not talk about coding. + """ + + text2 = """ + Sandwiches are best served toasted with cheese, ham, mayo, + lettuce, mustard, and salt & pepper. + """ + + await cognee.add(text, dataset_name="NLP_coding") + await cognee.add(text2, dataset_name="Sandwiches") + await cognee.add(text2) + + await cognee.cognify() # Make sure you've already run cognee.cognify(...) 
so the graph has content answers = await cognee.search(query_text="What are the main themes in my data?") - for answer in answers: - print(answer) + assert len(answers) > 0 + answers = await cognee.search( + query_text="List coding guidelines", + query_type=SearchType.CODING_RULES, + ) + assert len(answers) > 0 + + answers = await cognee.search( + query_text="Give me a confident answer: What is NLP?", + system_prompt="Answer succinctly and state confidence at the end.", + ) + assert len(answers) > 0 + + answers = await cognee.search( + query_text="Tell me about NLP", + only_context=True, + ) + assert len(answers) > 0 + + answers = await cognee.search( + query_text="Quarterly financial highlights", + datasets=["NLP_coding", "Sandwiches"], + use_combined_context=True, + ) + assert isinstance(answers, CombinedSearchResult) asyncio.run(main()) From 503bdc34f38e18e2ec3dccb6e47aaff669702f55 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 12 Nov 2025 13:20:23 +0100 Subject: [PATCH 18/37] test: add tests to workflows --- .github/workflows/docs_tests.yml | 276 ++++++++++++++++++++++++++++- .github/workflows/release_test.yml | 5 + 2 files changed, 274 insertions(+), 7 deletions(-) diff --git a/.github/workflows/docs_tests.yml b/.github/workflows/docs_tests.yml index b3c538668..7f7282bb2 100644 --- a/.github/workflows/docs_tests.yml +++ b/.github/workflows/docs_tests.yml @@ -1,18 +1,280 @@ -name: Docs Test Suite +name: Docs Tests + permissions: contents: read on: - release: workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + workflow_call: + secrets: + LLM_PROVIDER: + required: true + LLM_MODEL: + required: true + LLM_ENDPOINT: + required: true + LLM_API_KEY: + required: true + LLM_API_VERSION: + required: true + EMBEDDING_PROVIDER: + required: true + EMBEDDING_MODEL: + required: true + EMBEDDING_ENDPOINT: + required: true + EMBEDDING_API_KEY: + required: true + EMBEDDING_API_VERSION: + required: true 
env: - RUNTIME__LOG_LEVEL: ERROR ENV: 'dev' jobs: + test-search-basics: + name: Test Search Basics + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Search Basics Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/search_basics.py + + test-temporal-cognify: + name: Test Temporal Cognify + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Temporal Cognify Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/temporal_cognify.py + + test-ontology-quickstart: + name: Test Temporal Cognify + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Temporal Cognify Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ 
secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/temporal_cognify.py + + test-s3-storage: + name: Test S3 Docs Guide + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + extra-dependencies: "aws" + + - name: Run S3 Docs Guide Test + env: + ENABLE_BACKEND_ACCESS_CONTROL: True + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + STORAGE_BACKEND: s3 + AWS_REGION: eu-west-1 + AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} + run: uv run python ./cognee/tests/docs/guides/s3_storage.py + + test-graph-visualization: + name: Test Graph Visualization + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Graph Visualization Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + 
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/graph_visualization.py + + test-low-level-llm: + name: Test Low Level LLM + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Low Level LLM Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/low_level_llm.py + + test-memify-quickstart: + name: Test Memify Quickstart + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Memify Quickstart Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/memify_quickstart.py + + test-custom-data-models: + name: Test Custom Data Models + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Custom Data Models Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ 
secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/custom_data_models.py + + test-custom-tasks-and-pipelines: + name: Test Custom Tasks and Pipelines + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Custom Tasks and Pipelines Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/custom_tasks_and_pipelines.py + + test-custom-prompts: + name: Test Custom Prompts + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Custom Prompts Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/custom_prompts.py \ No newline at end of file diff --git 
a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 6ac3ca515..89540fcfb 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -14,4 +14,9 @@ jobs: load-tests: name: Load Tests uses: ./.github/workflows/load_tests.yml + secrets: inherit + + docs-tests: + name: Docs Tests + uses: ./.github/workflows/docs_tests.yml secrets: inherit \ No newline at end of file From 1e56d6dc389e1f33c08a7ee897689a941a7b8a9f Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 12 Nov 2025 13:42:53 +0100 Subject: [PATCH 19/37] chore: ruff format --- cognee/tests/docs/guides/search_basics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tests/docs/guides/search_basics.py b/cognee/tests/docs/guides/search_basics.py index 09dee3f92..f1847ad4b 100644 --- a/cognee/tests/docs/guides/search_basics.py +++ b/cognee/tests/docs/guides/search_basics.py @@ -54,4 +54,5 @@ async def main(): ) assert isinstance(answers, CombinedSearchResult) + asyncio.run(main()) From 7bd7079aac9fcb003bcc20e118bc65d066e9029c Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 18 Nov 2025 22:17:23 +0530 Subject: [PATCH 20/37] fix: vecto_engine.delte_data_points --- cognee/tasks/cleanup/cleanup_unused_data.py | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 4df622a2c..fd4b68204 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -315,22 +315,21 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: "TextSummary": "TextSummary_text" } - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - # Delete from vector collection - if await 
vector_engine.has_collection(collection_name): - for node_id in node_ids: - try: - await vector_engine.delete(collection_name, {"id": str(node_id)}) - except Exception as e: - logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") return deleted_counts From 5fac3b40b94e4c81a7d9828ca9d2d84ab5e82bc1 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 18 Nov 2025 22:26:59 +0530 Subject: [PATCH 21/37] fix: test file for cleanup unused data --- cognee/tests/test_cleanup_unused_data.py | 244 +++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 cognee/tests/test_cleanup_unused_data.py diff --git a/cognee/tests/test_cleanup_unused_data.py b/cognee/tests/test_cleanup_unused_data.py new file mode 100644 index 000000000..c21b9f5ea --- /dev/null +++ b/cognee/tests/test_cleanup_unused_data.py @@ -0,0 +1,244 @@ +import os +import pathlib +import cognee +from datetime import datetime, timezone, timedelta +from uuid import UUID +from sqlalchemy import select, update +from cognee.modules.data.models import Data, DatasetData +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType + +logger = get_logger() + + +async def test_textdocument_cleanup_with_sql(): + """ + 
End-to-end test for TextDocument cleanup based on last_accessed timestamps. + + Tests: + 1. Add and cognify a document + 2. Perform search to populate last_accessed timestamp + 3. Verify last_accessed is set in SQL Data table + 4. Manually age the timestamp beyond cleanup threshold + 5. Run cleanup with text_doc=True + 6. Verify document was deleted from all databases (relational, graph, and vector) + """ + # Setup test directories + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") + ).resolve() + ) + + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) + + # Initialize database + from cognee.modules.engine.operations.setup import setup + + # Clean slate + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") + + # Step 1: Add and cognify a test document + dataset_name = "test_cleanup_dataset" + test_text = """ + Machine learning is a subset of artificial intelligence that enables systems to learn + and improve from experience without being explicitly programmed. Deep learning uses + neural networks with multiple layers to process data. + """ + + await setup() + user = await get_default_user() + await cognee.add([test_text], dataset_name=dataset_name, user=user) + + cognify_result = await cognee.cognify([dataset_name], user=user) + + # Extract dataset_id from cognify result (ds_id is already a UUID) + dataset_id = None + for ds_id, pipeline_result in cognify_result.items(): + dataset_id = ds_id # Don't wrap in UUID() - it's already a UUID object + break + + assert dataset_id is not None, "Failed to get dataset_id from cognify result" + logger.info(f"โœ… Document added and cognified. 
Dataset ID: {dataset_id}") + + # Step 2: Perform search to trigger last_accessed update + logger.info("Triggering search to update last_accessed...") + search_results = await cognee.search( + query_type=SearchType.CHUNKS, + query_text="machine learning", + datasets=[dataset_name], + user=user + ) + logger.info(f"โœ… Search completed, found {len(search_results)} results") + + # Step 3: Verify last_accessed was set in SQL Data table + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Get the Data record for this dataset + result = await session.execute( + select(Data, DatasetData) + .join(DatasetData, Data.id == DatasetData.data_id) + .where(DatasetData.dataset_id == dataset_id) + ) + data_records = result.all() + assert len(data_records) > 0, "No Data records found for the dataset" + data_record = data_records[0][0] + data_id = data_record.id + + # Verify last_accessed is set (should be set by search operation) + assert data_record.last_accessed is not None, ( + "last_accessed should be set after search operation" + ) + + original_last_accessed = data_record.last_accessed + logger.info(f"โœ… last_accessed verified: {original_last_accessed}") + + # Step 4: Manually age the timestamp to be older than cleanup threshold + days_threshold = 30 + aged_timestamp = datetime.now(timezone.utc) - timedelta(days=days_threshold + 10) + + async with db_engine.get_async_session() as session: + stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) + await session.execute(stmt) + await session.commit() + + # Query in a NEW session to avoid cached values + async with db_engine.get_async_session() as session: + result = await session.execute(select(Data).where(Data.id == data_id)) + updated_data = result.scalar_one_or_none() + + # Make both timezone-aware for comparison + retrieved_timestamp = updated_data.last_accessed + if retrieved_timestamp.tzinfo is None: + # If database returned naive datetime, make it 
UTC-aware + retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + + assert retrieved_timestamp == aged_timestamp, ( + f"Timestamp should be updated to aged value. " + f"Expected: {aged_timestamp}, Got: {retrieved_timestamp}" + ) + + # Step 5: Test cleanup with text_doc=True + from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data + + # First do a dry run + logger.info("Testing dry run with text_doc=True...") + dry_run_result = await cleanup_unused_data( + days_threshold=30, + dry_run=True, + user_id=user.id, + text_doc=True + ) + + assert dry_run_result['status'] == 'dry_run', "Status should be 'dry_run'" + assert dry_run_result['unused_count'] > 0, ( + "Should find at least one unused document" + ) + logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") + + # Now run actual cleanup + logger.info("Executing cleanup with text_doc=True...") + cleanup_result = await cleanup_unused_data( + days_threshold=30, + dry_run=False, + user_id=user.id, + text_doc=True + ) + + assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" + assert cleanup_result["deleted_count"]["documents"] > 0, ( + "At least one document should be deleted" + ) + logger.info(f"โœ… Cleanup completed. 
Deleted {cleanup_result['deleted_count']['documents']} documents") + + # Step 6: Verify the document was actually deleted from SQL + async with db_engine.get_async_session() as session: + deleted_data = ( + await session.execute(select(Data).where(Data.id == data_id)) + ).scalar_one_or_none() + + assert deleted_data is None, ( + "Data record should be deleted after cleanup" + ) + logger.info("โœ… Confirmed: Data record was deleted from SQL database") + + # Verify the dataset-data link was also removed + async with db_engine.get_async_session() as session: + dataset_data_link = ( + await session.execute( + select(DatasetData).where( + DatasetData.data_id == data_id, + DatasetData.dataset_id == dataset_id + ) + ) + ).scalar_one_or_none() + + assert dataset_data_link is None, ( + "DatasetData link should be deleted after cleanup" + ) + logger.info("โœ… Confirmed: DatasetData link was deleted") + + # Verify graph nodes were cleaned up + from cognee.infrastructure.databases.graph import get_graph_engine + + graph_engine = await get_graph_engine() + + # Try to find the TextDocument node - it should not exist + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n", + {"id": str(data_id)} + ) + + assert len(result) == 0, ( + "TextDocument node should be deleted from graph database" + ) + logger.info("โœ… Confirmed: TextDocument node was deleted from graph database") + + # Verify vector database was cleaned up + from cognee.infrastructure.databases.vector import get_vector_engine + + vector_engine = get_vector_engine() + + # Check each collection that should have been cleaned up + vector_collections = [ + "DocumentChunk_text", + "Entity_name", + "TextSummary_text" + ] + + for collection_name in vector_collections: + if await vector_engine.has_collection(collection_name): + # Try to retrieve the deleted data points + try: + results = await vector_engine.retrieve(collection_name, [str(data_id)]) + assert len(results) == 0, ( + f"Data points should be 
deleted from {collection_name} collection" + ) + logger.info(f"โœ… Confirmed: {collection_name} collection is clean") + except Exception as e: + # Collection might be empty or not exist, which is fine + logger.info(f"โœ… Confirmed: {collection_name} collection is empty or doesn't exist") + pass + + logger.info("โœ… Confirmed: Vector database entries were deleted") + + logger.info("๐ŸŽ‰ All cleanup tests passed!") + + return True + + +if __name__ == "__main__": + import asyncio + success = asyncio.run(test_textdocument_cleanup_with_sql()) + exit(0 if success else 1) From 43290af1b23d24d6ab8b5d57c243abe1cee8787e Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 19 Nov 2025 21:00:16 +0530 Subject: [PATCH 22/37] fix: set last_acessed to current timestamp --- alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index 267e11fb2..a16c99e9f 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -34,7 +34,7 @@ def upgrade() -> None: sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) ) # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = created_at") + op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") def downgrade() -> None: From b52c1a1e25e6edffe112462836ab315b36bec567 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 24 Nov 2025 12:50:39 +0530 Subject: [PATCH 23/37] fix: flag to enable and disable last_accessed --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 88 ++++++++++--------- .../retrieval/utils/access_tracking.py | 7 +- cognee/tasks/cleanup/cleanup_unused_data.py | 40 ++++++++- 3 files changed, 90 insertions(+), 45 deletions(-) diff --git 
a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index a16c99e9f..f1a36ae59 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -1,46 +1,52 @@ -"""add_last_accessed_to_data - -Revision ID: e1ec1dcb50b6 -Revises: 211ab850ef3d -Create Date: 2025-11-04 21:45:52.642322 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = 'e1ec1dcb50b6' -down_revision: Union[str, None] = '211ab850ef3d' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - -def _get_column(inspector, table, name, schema=None): - for col in inspector.get_columns(table, schema=schema): - if col["name"] == name: - return col - return None +"""add_last_accessed_to_data + +Revision ID: e1ec1dcb50b6 +Revises: 211ab850ef3d +Create Date: 2025-11-04 21:45:52.642322 + +""" +import os +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa -def upgrade() -> None: - conn = op.get_bind() - insp = sa.inspect(conn) - - last_accessed_column = _get_column(insp, "data", "last_accessed") - if not last_accessed_column: - op.add_column('data', - sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) - ) - # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") +# revision identifiers, used by Alembic. 
+revision: str = 'e1ec1dcb50b6' +down_revision: Union[str, None] = '211ab850ef3d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None -def downgrade() -> None: - conn = op.get_bind() - insp = sa.inspect(conn) - - last_accessed_column = _get_column(insp, "data", "last_accessed") - if last_accessed_column: +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if not last_accessed_column: + # Always create the column for schema consistency + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + + # Only initialize existing records if feature is enabled + enable_last_accessed = os.getenv("ENABLE_LAST_ACCESSED", "false").lower() == "true" + if enable_last_accessed: + op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") + + +def downgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if last_accessed_column: op.drop_column('data', 'last_accessed') diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 65d597a93..6df0284ec 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID - +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -27,7 +27,10 @@ async def update_node_access_timestamps(items: List[Any]): ---------- items : 
List[Any] List of items with payload containing 'id' field (from vector search results) - """ + """ + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + return + if not items: return diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index fd4b68204..175452a0a 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -10,7 +10,7 @@ import json from datetime import datetime, timezone, timedelta from typing import Optional, Dict, Any from uuid import UUID - +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.relational import get_relational_engine @@ -47,7 +47,43 @@ async def cleanup_unused_data( ------- Dict[str, Any] Cleanup results with status, counts, and timestamp - """ + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." + ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." 
+ ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + logger.info( "Starting cleanup task", days_threshold=days_threshold, From 5cb6510205742e7a5abf2afe23d2527b229931d0 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 24 Nov 2025 13:12:46 +0530 Subject: [PATCH 24/37] fix: import --- cognee/tasks/cleanup/cleanup_unused_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 175452a0a..a90d96b5c 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -18,6 +18,7 @@ from cognee.modules.data.models import Data, DatasetData from cognee.shared.logging_utils import get_logger from sqlalchemy import select, or_ import cognee +import sqlalchemy as sa logger = get_logger(__name__) From 12ce80005ceccafac38a63da458e6df376776b61 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 26 Nov 2025 17:32:50 +0530 Subject: [PATCH 25/37] fix: generalized queries --- .../retrieval/utils/access_tracking.py | 147 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 778 ++++++++++-------- 2 files changed, 516 insertions(+), 409 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 6df0284ec..12a66f8bc 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -13,24 +13,10 @@ from sqlalchemy import update logger = get_logger(__name__) - async def update_node_access_timestamps(items: List[Any]): - """ - Update last_accessed_at for nodes in graph database and corresponding Data records in SQL. - - This function: - 1. Updates last_accessed_at in the graph database nodes (in properties JSON) - 2. 
Traverses to find origin TextDocument nodes (without hardcoded relationship names) - 3. Updates last_accessed in the SQL Data table for those documents - - Parameters - ---------- - items : List[Any] - List of items with payload containing 'id' field (from vector search results) - """ if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return @@ -49,50 +35,95 @@ async def update_node_access_timestamps(items: List[Any]): return try: - # Step 1: Batch update graph nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) + # Detect database provider and use appropriate queries + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) + elif provider == "neo4j": + await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) + elif provider == "neptune": + await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) + else: + logger.warning(f"Unsupported graph provider: {provider}") + return - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - - logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") - - # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) - origin_query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - - result = await graph_engine.query(origin_query, {"node_ids": node_ids}) - - # Extract and deduplicate document IDs - doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else [] - - # Step 3: 
Update SQL Data table + # Find origin documents and update SQL + doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) if doc_ids: - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - stmt = update(Data).where( - Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) - ).values(last_accessed=timestamp_dt) - - await session.execute(stmt) - await session.commit() - - logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL") - + await _update_sql_records(doc_ids, timestamp_dt) + except Exception as e: logger.error(f"Failed to update timestamps: {e}") - raise + raise + +async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): + """Kuzu-specific node updates""" + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + +async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): + """Neo4j-specific node updates""" + for node_id in node_ids: + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): + """Neptune-specific node updates""" + for node_id in node_ids: + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents(graph_engine, node_ids, provider): + """Find origin documents with provider-specific queries""" + if provider == "kuzu": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 
'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + elif provider == "neo4j": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + elif provider == "neptune": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + + result = await graph_engine.query(query, {"node_ids": node_ids}) + return list(set([row[0] for row in result if row and row[0]])) if result else [] + +async def _update_sql_records(doc_ids, timestamp_dt): + """Update SQL Data table (same for all providers)""" + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + stmt = update(Data).where( + Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) + ).values(last_accessed=timestamp_dt) + + await session.execute(stmt) + await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index a90d96b5c..b89c939a8 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,372 +1,448 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. 
-""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries)) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. 
+
+    Parameters
+    ----------
+    minutes_threshold : int
+        Minutes since last access to consider data unused
+    dry_run : bool
+        If True, only report what would be deleted without actually deleting (default: True)
+    user_id : UUID, optional
+        Limit cleanup to specific user's data (default: None)
+    text_doc : bool
+        If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete()
+        for proper whole-document deletion (default: False)
+
+    Returns
+    -------
+    Dict[str, Any]
+        Cleanup results with status, counts, and timestamp
+    """
+    # Check 1: Environment variable must be enabled
+    if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true":
+        logger.warning(
+            "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled."
+        )
+        return {
+            "status": "skipped",
+            "reason": "ENABLE_LAST_ACCESSED not enabled",
+            "unused_count": 0,
+            "deleted_count": {},
+            "cleanup_date": datetime.now(timezone.utc).isoformat()
+        }
+
+    # Check 2: Verify tracking has actually been running
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        # Count records with non-NULL last_accessed
+        tracked_count = await session.execute(
+            select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None))
+        )
+        tracked_records = tracked_count.scalar()
+
+    if tracked_records == 0:
+        logger.warning(
+            "Cleanup skipped: No records have been tracked yet. "
+            "ENABLE_LAST_ACCESSED may have been recently enabled. "
+            "Wait for retrievers to update timestamps before running cleanup." 
+ ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes directly from graph + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Detect database provider and find unused nodes + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes with provider-specific logic + deleted_counts = await _delete_unused_nodes(unused_nodes, provider) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 
deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - days_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). Parameters ---------- - days_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) - user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) - - Returns - ------- - Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
- ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." - ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - days_threshold=days_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) - - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") - - # Find unused nodes - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, 
- "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) - - return { - "status": "completed", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
- - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) - ) - ) - - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) - - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: - try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") - except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": 
deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None -) -> Dict[str, list]: - """ - Query Kuzu for nodes with old last_accessed_at timestamps. - - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + If True, only report what would be deleted user_id : UUID, optional Filter by user ID if provided Returns ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs + Dict[str, Any] + Cleanup results """ - graph_engine = await get_graph_engine() + db_engine = get_relational_engine() - # Query all nodes with their properties - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = { - "DocumentChunk": [], - "Entity": [], - "TextSummary": [] - } - - for node_id, node_type, props_json in results: - # Only process tracked node types - if node_type not in unused_nodes: - continue - - # Parse properties JSON - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - # Check if node is unused (never accessed or accessed before cutoff) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) ) - if result and len(result) > 0: - deleted_counts["associations"] += result[0][0] - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue + ) - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) + result = await session.execute(query) + unused_data = result.all() - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: + return { + "status": "dry_run", + "unused_count": len(unused_data), + 
"deleted_count": { + "data_items": 0, + "documents": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None, + provider: str = "kuzu" +) -> Dict[str, list]: + """ + Find unused nodes with provider-specific queries. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + provider : str + Graph database provider (kuzu, neo4j, neptune) + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + if provider == "kuzu": + return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) + elif provider == "neo4j": + return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) + elif provider == "neptune": + return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) + else: + logger.warning(f"Unsupported graph provider: {provider}") + return {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + +async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): + """Kuzu-specific unused node detection""" + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for node_id, node_type, props_json in results: + if node_type not in unused_nodes: + continue + + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): + """Neo4j-specific unused node detection""" + query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for row in results: 
+ node_id = row["n"]["id"] + node_type = row["n"]["type"] + last_accessed = row["n"].get("last_accessed_at") + + if node_type not in unused_nodes: + continue + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + + return unused_nodes + + +async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): + """Neptune-specific unused node detection""" + query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for row in results: + node_id = row["n"]["id"] + node_type = row["n"]["type"] + last_accessed = row["n"].get("last_accessed_at") + + if node_type not in unused_nodes: + continue + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + provider : str + Graph database provider (kuzu, neo4j, neptune) + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + if provider == "kuzu": + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + elif provider == "neo4j": + result = await graph_engine.query( + "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + elif provider == "neptune": + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + + if result and len(result) > 0: + count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] + deleted_counts["associations"] += count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") 
+ + try: + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 6a4d31356bb613e5cf74e7972445f804796ee6d4 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 18:55:47 +0530 Subject: [PATCH 26/37] fix: using graph projection instead of conditions --- .../retrieval/utils/access_tracking.py | 156 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 759 ++++++++---------- 2 files changed, 418 insertions(+), 497 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 12a66f8bc..935c47157 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,118 +4,116 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger from sqlalchemy import update +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Detect database 
provider and use appropriate queries - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - if provider == "kuzu": - await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neo4j": - await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neptune": - await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return + try: + # Update nodes using graph projection ( database-agnostic approach + await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) # Find origin documents and update SQL - doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) + doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - + except Exception as e: logger.error(f"Failed to update timestamps: {e}") raise -async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): - """Kuzu-specific node updates""" - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - -async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): - """Neo4j-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - -async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): - """Neptune-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, 
"timestamp": timestamp_ms} - ) - -async def _find_origin_documents(graph_engine, node_ids, provider): - """Find origin documents with provider-specific queries""" - if provider == "kuzu": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neo4j": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neptune": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ +async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): + """Update nodes using graph projection - works with any graph database""" + # Project the graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] + ) - result = await graph_engine.query(query, {"node_ids": node_ids}) - return list(set([row[0] for row in result if row and row[0]])) if result else [] + # Update each node's last_accessed_at property + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Update the node in the database + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: 
$id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents_via_projection(graph_engine, node_ids): + """Find origin documents using graph projection instead of DB queries""" + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type"], + edge_properties_to_project=["relationship_name"] + ) + + # Find origin documents by traversing the in-memory graph + doc_ids = set() + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node and node.get_attribute("type") == "DocumentChunk": + # Traverse edges to find connected documents + for edge in node.get_skeleton_edges(): + # Get the neighbor node + neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else edge.get_source_node() + if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: + doc_ids.add(neighbor.id) + + return list(doc_ids) async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" @@ -124,6 +122,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index b89c939a8..c70b97a00 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,448 
+1,371 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries)) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries)) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. 
+""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + days since last access to consider data unused + dry_run : bool + If True, only report what would be delete without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
+ ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes using projection (database-agnostic) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes using graph projection + unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return 
{ + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes (provider-agnostic deletion) + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
Parameters ---------- - minutes_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." - ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." 
- ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) + ) + ) - # Detect database provider and find unused nodes - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": 
len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes with provider-specific logic - deleted_counts = await _delete_unused_nodes(unused_nodes, provider) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
+ "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() +async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: + """ + Find unused nodes using graph projection - database-agnostic approach. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type", "last_accessed_at"], + edge_properties_to_project=[] + ) - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + # Get all nodes from the projected graph + all_nodes = memory_fragment.get_nodes() + + for node in all_nodes: + node_type = node.get_attribute("type") + if node_type not in unused_nodes: + continue + + # Check last_accessed_at property + last_accessed = node.get_attribute("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node.id) + logger.debug( + f"Found unused {node_type}", + node_id=node.id, + last_accessed=last_accessed ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion (using graph projection for consistency) + if any(unused_nodes.values()): + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] ) - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Count edges from the in-memory graph + edge_count = len(node.get_skeleton_edges()) + deleted_counts["associations"] += edge_count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches (database-agnostic) + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document 
using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None, - provider: str = "kuzu" -) -> Dict[str, list]: - """ - Find unused nodes with provider-specific queries. 
- - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch - user_id : UUID, optional - Filter by user ID if provided - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() - - if provider == "kuzu": - return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) - elif provider == "neo4j": - return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) - elif provider == "neptune": - return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - -async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): - """Kuzu-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for node_id, node_type, props_json in results: - if node_type not in unused_nodes: - continue - - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): - """Neo4j-specific unused node detection""" - query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for row in results: 
- node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): - """Neptune-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for row in results: - node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - if provider == "kuzu": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neo4j": - result = await graph_engine.query( - "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neptune": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - - if result and len(result) > 0: - count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] - deleted_counts["associations"] += count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") 
- - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 5f00abf3e4f3b913ae67391d487104ea3b9ae872 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 22:25:03 +0530 Subject: [PATCH 27/37] fix: fallback and document deletion --- .../retrieval/utils/access_tracking.py | 73 +++++++++++-------- cognee/tasks/cleanup/cleanup_unused_data.py | 41 +++++++---- 2 files changed, 68 insertions(+), 46 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 935c47157..c7b06ee17 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -36,16 +36,22 @@ async def update_node_access_timestamps(items: List[Any]): return try: - # Update nodes using graph projection ( database-agnostic approach + # Try to update nodes in graph database (may fail for unsupported DBs) await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) + except Exception as e: + logger.warning( + f"Failed to update node timestamps in graph database: {e}. " + "Will update document-level timestamps in SQL instead." 
+ ) - # Find origin documents and update SQL + # Always try to find origin documents and update SQL + # This ensures document-level tracking works even if graph updates fail + try: doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - except Exception as e: - logger.error(f"Failed to update timestamps: {e}") + logger.error(f"Failed to update SQL timestamps: {e}") raise async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): @@ -59,37 +65,42 @@ async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): ) # Update each node's last_accessed_at property + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + for node_id in node_ids: node = memory_fragment.get_node(node_id) if node: - # Update the node in the database - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - - if provider == "kuzu": - # Kuzu stores properties as JSON - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} + try: + # Update the node in the database + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} ) - elif provider == "neo4j": - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - elif provider == "neptune": - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) + + if result and result[0]: + props = json.loads(result[0][0]) if 
result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + except Exception as e: + # Log but continue with other nodes + logger.debug(f"Failed to update node {node_id}: {e}") + continue async def _find_origin_documents_via_projection(graph_engine, node_ids): """Find origin documents using graph projection instead of DB queries""" diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index c70b97a00..3894635dd 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,9 +1,9 @@ """ Task for automatically deleting unused data from the memify pipeline. -This task identifies and removes data (chunks, entities, summaries)) that hasn't +This task identifies and removes entire documents that haven't been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. +efficiency and storage optimization through whole-document removal. """ import json @@ -28,22 +28,26 @@ async def cleanup_unused_data( minutes_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None, - text_doc: bool = False + text_doc: bool = True, # Changed default to True for document-level cleanup + node_level: bool = False # New parameter for explicit node-level cleanup ) -> Dict[str, Any]: """ Identify and remove unused data from the memify pipeline. 
- + Parameters ---------- minutes_threshold : int - days since last access to consider data unused + Minutes since last access to consider data unused dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted without actually deleting (default: True) user_id : UUID, optional Limit cleanup to specific user's data (default: None) text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion + node_level : bool + If True, perform chaotic node-level deletion of unused chunks, entities, and summaries + (default: False - deprecated in favor of document-level cleanup) Returns ------- @@ -91,17 +95,19 @@ async def cleanup_unused_data( minutes_threshold=minutes_threshold, dry_run=dry_run, user_id=str(user_id) if user_id else None, - text_doc=text_doc + text_doc=text_doc, + node_level=node_level ) # Calculate cutoff timestamp cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes using projection (database-agnostic) + if node_level: + # Deprecated: Node-level approach (chaotic) + logger.warning( + "Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. " + "Consider using document-level cleanup (default) instead." 
+ ) cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") @@ -147,6 +153,9 @@ async def cleanup_unused_data( }, "cleanup_date": datetime.now(timezone.utc).isoformat() } + else: + # Default: Document-level approach (recommended) + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) async def _cleanup_via_sql( @@ -243,6 +252,7 @@ async def _cleanup_via_sql( async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: """ Find unused nodes using graph projection - database-agnostic approach. + NOTE: This function is deprecated as it leads to fragmented knowledge graphs. Parameters ---------- @@ -291,6 +301,7 @@ async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[st async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: """ Delete unused nodes from graph and vector databases. + NOTE: This function is deprecated as it leads to fragmented knowledge graphs. 
Parameters ---------- @@ -325,7 +336,7 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: if not node_ids: continue - # Count edges connected to these nodes + # Count edges from the in-memory graph for node_id in node_ids: node = memory_fragment.get_node(node_id) if node: From 829a6f0d04bcfec6e9c9f94219a29d6ab5cd909d Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 10 Dec 2025 22:41:01 +0530 Subject: [PATCH 28/37] fix: only document level deletion --- .../retrieval/utils/access_tracking.py | 80 +-- cognee/tasks/cleanup/cleanup_unused_data.py | 521 ++++++------------ cognee/tests/test_cleanup_unused_data.py | 388 ++++++------- 3 files changed, 333 insertions(+), 656 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index c7b06ee17..54fd043b9 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -14,38 +14,28 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) + async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() - timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Try to update nodes in graph 
database (may fail for unsupported DBs) - await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) - except Exception as e: - logger.warning( - f"Failed to update node timestamps in graph database: {e}. " - "Will update document-level timestamps in SQL instead." - ) - - # Always try to find origin documents and update SQL - # This ensures document-level tracking works even if graph updates fail + + # Focus on document-level tracking via projection try: doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: @@ -54,53 +44,6 @@ async def update_node_access_timestamps(items: List[Any]): logger.error(f"Failed to update SQL timestamps: {e}") raise -async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): - """Update nodes using graph projection - works with any graph database""" - # Project the graph with necessary properties - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id"], - edge_properties_to_project=[] - ) - - # Update each node's last_accessed_at property - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - - for node_id in node_ids: - node = memory_fragment.get_node(node_id) - if node: - try: - # Update the node in the database - if provider == "kuzu": - # Kuzu stores properties as JSON - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - elif provider == "neo4j": - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - elif provider == "neptune": - await graph_engine.query( - "MATCH 
(n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - except Exception as e: - # Log but continue with other nodes - logger.debug(f"Failed to update node {node_id}: {e}") - continue async def _find_origin_documents_via_projection(graph_engine, node_ids): """Find origin documents using graph projection instead of DB queries""" @@ -111,7 +54,7 @@ async def _find_origin_documents_via_projection(graph_engine, node_ids): node_properties_to_project=["id", "type"], edge_properties_to_project=["relationship_name"] ) - + # Find origin documents by traversing the in-memory graph doc_ids = set() for node_id in node_ids: @@ -123,9 +66,10 @@ async def _find_origin_documents_via_projection(graph_engine, node_ids): neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else edge.get_source_node() if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: doc_ids.add(neighbor.id) - + return list(doc_ids) + async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" db_engine = get_relational_engine() @@ -133,6 +77,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 3894635dd..34cde1b6f 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,382 +1,187 @@ -""" -Task for automatically deleting unused data from the memify pipeline. +""" +Task for automatically deleting unused data from the memify pipeline. 
+ +This task identifies and removes entire documents that haven't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization through whole-document removal. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
+ ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) -This task identifies and removes entire documents that haven't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization through whole-document removal. 
-""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa -from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph - -logger = get_logger(__name__) + # Document-level approach (recommended) + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = True, # Changed default to True for document-level cleanup - node_level: bool = False # New parameter for explicit node-level cleanup +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. - + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
+ Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion - node_level : bool - If True, perform chaotic node-level deletion of unused chunks, entities, and summaries - (default: False - deprecated in favor of document-level cleanup) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." - ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." 
- ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc, - node_level=node_level - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if node_level: - # Deprecated: Node-level approach (chaotic) - logger.warning( - "Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. " - "Consider using document-level cleanup (default) instead." - ) - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) + ) + ) - # Find unused nodes using graph projection - unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete 
unused nodes (provider-agnostic deletion) - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - else: - # Default: Document-level approach (recommended) - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
- - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) - ) - ) - - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) - - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: - try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") - except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": 
deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: - """ - Find unused nodes using graph projection - database-agnostic approach. - NOTE: This function is deprecated as it leads to fragmented knowledge graphs. + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") - # Project the entire graph with necessary properties - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id", "type", "last_accessed_at"], - edge_properties_to_project=[] - ) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - # Get all nodes from the projected graph - all_nodes = memory_fragment.get_nodes() - - for node in all_nodes: - node_type = node.get_attribute("type") - if node_type not in unused_nodes: - continue - - # Check last_accessed_at property - last_accessed = node.get_attribute("last_accessed_at") + logger.info("Cleanup completed", deleted_count=deleted_count) - if last_accessed is None or 
last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node.id) - logger.debug( - f"Found unused {node_type}", - node_id=node.id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - NOTE: This function is deprecated as it leads to fragmented knowledge graphs. - - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion (using graph projection for consistency) - if any(unused_nodes.values()): - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id"], - edge_properties_to_project=[] - ) - - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges from the in-memory graph - for node_id in node_ids: - node = memory_fragment.get_node(node_id) - if node: - # Count edges from the in-memory graph - edge_count = len(node.get_skeleton_edges()) - deleted_counts["associations"] += edge_count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches (database-agnostic) - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - 
} - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - - return deleted_counts + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } diff --git a/cognee/tests/test_cleanup_unused_data.py b/cognee/tests/test_cleanup_unused_data.py index c21b9f5ea..e738dcba0 100644 --- a/cognee/tests/test_cleanup_unused_data.py +++ b/cognee/tests/test_cleanup_unused_data.py @@ -1,244 +1,172 @@ -import os -import pathlib -import cognee -from datetime import datetime, timezone, timedelta -from uuid import UUID -from sqlalchemy import select, update -from cognee.modules.data.models import Data, DatasetData -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.users.methods import get_default_user -from cognee.shared.logging_utils import get_logger -from cognee.modules.search.types import SearchType - -logger = get_logger() - - -async def test_textdocument_cleanup_with_sql(): - """ - End-to-end test for TextDocument cleanup based on last_accessed timestamps. 
+import os +import pathlib +import cognee +from datetime import datetime, timezone, timedelta +from uuid import UUID +from sqlalchemy import select, update +from cognee.modules.data.models import Data, DatasetData +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType - Tests: - 1. Add and cognify a document - 2. Perform search to populate last_accessed timestamp - 3. Verify last_accessed is set in SQL Data table - 4. Manually age the timestamp beyond cleanup threshold - 5. Run cleanup with text_doc=True - 6. Verify document was deleted from all databases (relational, graph, and vector) - """ - # Setup test directories - data_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") - ).resolve() - ) - cognee_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") - ).resolve() - ) +logger = get_logger() - cognee.config.data_root_directory(data_directory_path) - cognee.config.system_root_directory(cognee_directory_path) - # Initialize database - from cognee.modules.engine.operations.setup import setup +async def test_textdocument_cleanup_with_sql(): + """ + End-to-end test for TextDocument cleanup based on last_accessed timestamps. + """ + # Enable last accessed tracking BEFORE any cognee operations + os.environ["ENABLE_LAST_ACCESSED"] = "true" - # Clean slate - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") - - # Step 1: Add and cognify a test document - dataset_name = "test_cleanup_dataset" - test_text = """ - Machine learning is a subset of artificial intelligence that enables systems to learn - and improve from experience without being explicitly programmed. 
Deep learning uses - neural networks with multiple layers to process data. - """ - - await setup() - user = await get_default_user() - await cognee.add([test_text], dataset_name=dataset_name, user=user) - - cognify_result = await cognee.cognify([dataset_name], user=user) - - # Extract dataset_id from cognify result (ds_id is already a UUID) - dataset_id = None - for ds_id, pipeline_result in cognify_result.items(): - dataset_id = ds_id # Don't wrap in UUID() - it's already a UUID object - break - - assert dataset_id is not None, "Failed to get dataset_id from cognify result" - logger.info(f"โœ… Document added and cognified. Dataset ID: {dataset_id}") - - # Step 2: Perform search to trigger last_accessed update - logger.info("Triggering search to update last_accessed...") - search_results = await cognee.search( - query_type=SearchType.CHUNKS, - query_text="machine learning", - datasets=[dataset_name], - user=user - ) - logger.info(f"โœ… Search completed, found {len(search_results)} results") - - # Step 3: Verify last_accessed was set in SQL Data table - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Get the Data record for this dataset - result = await session.execute( - select(Data, DatasetData) - .join(DatasetData, Data.id == DatasetData.data_id) - .where(DatasetData.dataset_id == dataset_id) - ) - data_records = result.all() - assert len(data_records) > 0, "No Data records found for the dataset" - data_record = data_records[0][0] - data_id = data_record.id + # Setup test directories + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") + ).resolve() + ) - # Verify last_accessed is set (should be set by search operation) - assert data_record.last_accessed is not None, ( - "last_accessed should be set after 
search operation" - ) + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) - original_last_accessed = data_record.last_accessed - logger.info(f"โœ… last_accessed verified: {original_last_accessed}") - - # Step 4: Manually age the timestamp to be older than cleanup threshold - days_threshold = 30 - aged_timestamp = datetime.now(timezone.utc) - timedelta(days=days_threshold + 10) - - async with db_engine.get_async_session() as session: - stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) - await session.execute(stmt) - await session.commit() - - # Query in a NEW session to avoid cached values - async with db_engine.get_async_session() as session: - result = await session.execute(select(Data).where(Data.id == data_id)) - updated_data = result.scalar_one_or_none() + # Initialize database + from cognee.modules.engine.operations.setup import setup - # Make both timezone-aware for comparison - retrieved_timestamp = updated_data.last_accessed - if retrieved_timestamp.tzinfo is None: - # If database returned naive datetime, make it UTC-aware - retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + # Clean slate + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) - assert retrieved_timestamp == aged_timestamp, ( - f"Timestamp should be updated to aged value. 
" - f"Expected: {aged_timestamp}, Got: {retrieved_timestamp}" - ) - - # Step 5: Test cleanup with text_doc=True - from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data - - # First do a dry run - logger.info("Testing dry run with text_doc=True...") - dry_run_result = await cleanup_unused_data( - days_threshold=30, - dry_run=True, - user_id=user.id, - text_doc=True - ) - - assert dry_run_result['status'] == 'dry_run', "Status should be 'dry_run'" - assert dry_run_result['unused_count'] > 0, ( - "Should find at least one unused document" - ) - logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") - - # Now run actual cleanup - logger.info("Executing cleanup with text_doc=True...") - cleanup_result = await cleanup_unused_data( - days_threshold=30, - dry_run=False, - user_id=user.id, - text_doc=True - ) - - assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" - assert cleanup_result["deleted_count"]["documents"] > 0, ( - "At least one document should be deleted" - ) - logger.info(f"โœ… Cleanup completed. 
Deleted {cleanup_result['deleted_count']['documents']} documents") - - # Step 6: Verify the document was actually deleted from SQL - async with db_engine.get_async_session() as session: - deleted_data = ( - await session.execute(select(Data).where(Data.id == data_id)) - ).scalar_one_or_none() + logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") - assert deleted_data is None, ( - "Data record should be deleted after cleanup" - ) - logger.info("โœ… Confirmed: Data record was deleted from SQL database") - - # Verify the dataset-data link was also removed - async with db_engine.get_async_session() as session: - dataset_data_link = ( - await session.execute( - select(DatasetData).where( - DatasetData.data_id == data_id, - DatasetData.dataset_id == dataset_id - ) - ) - ).scalar_one_or_none() + # Step 1: Add and cognify a test document + dataset_name = "test_cleanup_dataset" + test_text = """ + Machine learning is a subset of artificial intelligence that enables systems to learn + and improve from experience without being explicitly programmed. Deep learning uses + neural networks with multiple layers to process data. + """ - assert dataset_data_link is None, ( - "DatasetData link should be deleted after cleanup" - ) - logger.info("โœ… Confirmed: DatasetData link was deleted") + await setup() + user = await get_default_user() + await cognee.add([test_text], dataset_name=dataset_name, user=user) + + cognify_result = await cognee.cognify([dataset_name], user=user) + + # Extract dataset_id from cognify result + dataset_id = None + for ds_id, pipeline_result in cognify_result.items(): + dataset_id = ds_id + break + + assert dataset_id is not None, "Failed to get dataset_id from cognify result" + logger.info(f"โœ… Document added and cognified. 
Dataset ID: {dataset_id}") + + # Step 2: Perform search to trigger last_accessed update + logger.info("Triggering search to update last_accessed...") + search_results = await cognee.search( + query_type=SearchType.CHUNKS, + query_text="machine learning", + datasets=[dataset_name], + user=user + ) + logger.info(f"โœ… Search completed, found {len(search_results)} results") + assert len(search_results) > 0, "Search should return results" + + # Step 3: Verify last_accessed was set and get data_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + result = await session.execute( + select(Data, DatasetData) + .join(DatasetData, Data.id == DatasetData.data_id) + .where(DatasetData.dataset_id == dataset_id) + ) + data_records = result.all() + assert len(data_records) > 0, "No Data records found for the dataset" + data_record = data_records[0][0] + data_id = data_record.id + + # Verify last_accessed is set + assert data_record.last_accessed is not None, ( + "last_accessed should be set after search operation" + ) + + original_last_accessed = data_record.last_accessed + logger.info(f"โœ… last_accessed verified: {original_last_accessed}") + + # Step 4: Manually age the timestamp + minutes_threshold = 30 + aged_timestamp = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold + 10) + + async with db_engine.get_async_session() as session: + stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) + await session.execute(stmt) + await session.commit() + + # Verify timestamp was updated + async with db_engine.get_async_session() as session: + result = await session.execute(select(Data).where(Data.id == data_id)) + updated_data = result.scalar_one_or_none() + assert updated_data is not None, "Data record should exist" + retrieved_timestamp = updated_data.last_accessed + if retrieved_timestamp.tzinfo is None: + retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + assert 
retrieved_timestamp == aged_timestamp, ( + f"Timestamp should be updated to aged value" + ) + + # Step 5: Test cleanup (document-level is now the default) + from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data + + # First do a dry run + logger.info("Testing dry run...") + dry_run_result = await cleanup_unused_data( + minutes_threshold=10, + dry_run=True, + user_id=user.id + ) + + # Debug: Print the actual result + logger.info(f"Dry run result: {dry_run_result}") - # Verify graph nodes were cleaned up - from cognee.infrastructure.databases.graph import get_graph_engine + assert dry_run_result['status'] == 'dry_run', f"Status should be 'dry_run', got: {dry_run_result['status']}" + assert dry_run_result['unused_count'] > 0, ( + "Should find at least one unused document" + ) + logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") + + # Now run actual cleanup + logger.info("Executing cleanup...") + cleanup_result = await cleanup_unused_data( + minutes_threshold=30, + dry_run=False, + user_id=user.id + ) + + assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" + assert cleanup_result["deleted_count"]["documents"] > 0, ( + "At least one document should be deleted" + ) + logger.info(f"โœ… Cleanup completed. 
Deleted {cleanup_result['deleted_count']['documents']} documents") + + # Step 6: Verify deletion + async with db_engine.get_async_session() as session: + deleted_data = ( + await session.execute(select(Data).where(Data.id == data_id)) + ).scalar_one_or_none() + assert deleted_data is None, "Data record should be deleted" + logger.info("โœ… Confirmed: Data record was deleted") + + logger.info("๐ŸŽ‰ All cleanup tests passed!") + return True - graph_engine = await get_graph_engine() - # Try to find the TextDocument node - it should not exist - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n", - {"id": str(data_id)} - ) - - assert len(result) == 0, ( - "TextDocument node should be deleted from graph database" - ) - logger.info("โœ… Confirmed: TextDocument node was deleted from graph database") - - # Verify vector database was cleaned up - from cognee.infrastructure.databases.vector import get_vector_engine - - vector_engine = get_vector_engine() - - # Check each collection that should have been cleaned up - vector_collections = [ - "DocumentChunk_text", - "Entity_name", - "TextSummary_text" - ] - - for collection_name in vector_collections: - if await vector_engine.has_collection(collection_name): - # Try to retrieve the deleted data points - try: - results = await vector_engine.retrieve(collection_name, [str(data_id)]) - assert len(results) == 0, ( - f"Data points should be deleted from {collection_name} collection" - ) - logger.info(f"โœ… Confirmed: {collection_name} collection is clean") - except Exception as e: - # Collection might be empty or not exist, which is fine - logger.info(f"โœ… Confirmed: {collection_name} collection is empty or doesn't exist") - pass - - logger.info("โœ… Confirmed: Vector database entries were deleted") - - logger.info("๐ŸŽ‰ All cleanup tests passed!") - - return True - - -if __name__ == "__main__": - import asyncio - success = asyncio.run(test_textdocument_cleanup_with_sql()) +if __name__ == "__main__": + import 
asyncio + success = asyncio.run(test_textdocument_cleanup_with_sql()) exit(0 if success else 1) From 2485c3f5f0c2b25572213fe7638467859679c8d2 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Thu, 11 Dec 2025 12:48:06 +0530 Subject: [PATCH 29/37] fix: only document level deletion --- cognee/infrastructure/engine/models/DataPoint.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 3178713c8..812380eaa 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -43,9 +43,6 @@ class DataPoint(BaseModel): updated_at: int = Field( default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) ) - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) ontology_valid: bool = False version: int = 1 # Default version topological_rank: Optional[int] = 0 From cd60ae31740acc9444f5aaf61fd7720deb2a5c51 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 11 Dec 2025 15:25:44 +0100 Subject: [PATCH 30/37] test: remove docs tests. 
add trigger to docs repo --- .github/workflows/docs_tests.yml | 280 ------------------ .github/workflows/release_test.yml | 23 +- .../tests/docs/guides/custom_data_models.py | 38 --- cognee/tests/docs/guides/custom_prompts.py | 30 -- .../docs/guides/custom_tasks_and_pipelines.py | 53 ---- .../tests/docs/guides/graph_visualization.py | 13 - cognee/tests/docs/guides/low_level_llm.py | 31 -- cognee/tests/docs/guides/memify_quickstart.py | 29 -- .../tests/docs/guides/ontology_quickstart.py | 30 -- cognee/tests/docs/guides/s3_storage.py | 25 -- cognee/tests/docs/guides/search_basics.py | 58 ---- cognee/tests/docs/guides/temporal_cognify.py | 57 ---- 12 files changed, 16 insertions(+), 651 deletions(-) delete mode 100644 .github/workflows/docs_tests.yml delete mode 100644 cognee/tests/docs/guides/custom_data_models.py delete mode 100644 cognee/tests/docs/guides/custom_prompts.py delete mode 100644 cognee/tests/docs/guides/custom_tasks_and_pipelines.py delete mode 100644 cognee/tests/docs/guides/graph_visualization.py delete mode 100644 cognee/tests/docs/guides/low_level_llm.py delete mode 100644 cognee/tests/docs/guides/memify_quickstart.py delete mode 100644 cognee/tests/docs/guides/ontology_quickstart.py delete mode 100644 cognee/tests/docs/guides/s3_storage.py delete mode 100644 cognee/tests/docs/guides/search_basics.py delete mode 100644 cognee/tests/docs/guides/temporal_cognify.py diff --git a/.github/workflows/docs_tests.yml b/.github/workflows/docs_tests.yml deleted file mode 100644 index 7f7282bb2..000000000 --- a/.github/workflows/docs_tests.yml +++ /dev/null @@ -1,280 +0,0 @@ -name: Docs Tests - -permissions: - contents: read - -on: - workflow_dispatch: - workflow_call: - secrets: - LLM_PROVIDER: - required: true - LLM_MODEL: - required: true - LLM_ENDPOINT: - required: true - LLM_API_KEY: - required: true - LLM_API_VERSION: - required: true - EMBEDDING_PROVIDER: - required: true - EMBEDDING_MODEL: - required: true - EMBEDDING_ENDPOINT: - required: true - 
EMBEDDING_API_KEY: - required: true - EMBEDDING_API_VERSION: - required: true - -env: - ENV: 'dev' - -jobs: - test-search-basics: - name: Test Search Basics - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Search Basics Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/search_basics.py - - test-temporal-cognify: - name: Test Temporal Cognify - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Temporal Cognify Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/temporal_cognify.py - - test-ontology-quickstart: - name: Test Temporal Cognify - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Temporal Cognify Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - 
LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/temporal_cognify.py - - test-s3-storage: - name: Test S3 Docs Guide - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - extra-dependencies: "aws" - - - name: Run S3 Docs Guide Test - env: - ENABLE_BACKEND_ACCESS_CONTROL: True - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - STORAGE_BACKEND: s3 - AWS_REGION: eu-west-1 - AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} - run: uv run python ./cognee/tests/docs/guides/s3_storage.py - - test-graph-visualization: - name: Test Graph Visualization - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Graph Visualization Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - 
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/graph_visualization.py - - test-low-level-llm: - name: Test Low Level LLM - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Low Level LLM Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/low_level_llm.py - - test-memify-quickstart: - name: Test Memify Quickstart - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Memify Quickstart Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/memify_quickstart.py - - test-custom-data-models: - name: Test Custom Data Models - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Custom Data Models Test - env: - 
LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/custom_data_models.py - - test-custom-tasks-and-pipelines: - name: Test Custom Tasks and Pipelines - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Custom Tasks and Pipelines Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/custom_tasks_and_pipelines.py - - test-custom-prompts: - name: Test Custom Prompts - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Custom Prompts Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python 
./cognee/tests/docs/guides/custom_prompts.py \ No newline at end of file diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 89540fcfb..c6dd68484 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -5,18 +5,27 @@ permissions: contents: read on: + push: + branches: + - feature/cog-3213-docs-set-up-guide-script-tests workflow_dispatch: pull_request: branches: - main jobs: - load-tests: - name: Load Tests - uses: ./.github/workflows/load_tests.yml - secrets: inherit +# load-tests: +# name: Load Tests +# uses: ./.github/workflows/load_tests.yml +# secrets: inherit docs-tests: - name: Docs Tests - uses: ./.github/workflows/docs_tests.yml - secrets: inherit \ No newline at end of file + runs-on: ubuntu-22.04 + steps: + - name: Trigger docs tests + run: | + curl -sS -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.DOCS_REPO_PAT_TOKEN }}" \ + https://api.github.com/repos/your-org/repo-b/dispatches \ + -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' diff --git a/cognee/tests/docs/guides/custom_data_models.py b/cognee/tests/docs/guides/custom_data_models.py deleted file mode 100644 index 0eb314227..000000000 --- a/cognee/tests/docs/guides/custom_data_models.py +++ /dev/null @@ -1,38 +0,0 @@ -import asyncio -from typing import Any -from pydantic import SkipValidation - -import cognee -from cognee.infrastructure.engine import DataPoint -from cognee.infrastructure.engine.models.Edge import Edge -from cognee.tasks.storage import add_data_points - - -class Person(DataPoint): - name: str - # Keep it simple for forward refs / mixed values - knows: SkipValidation[Any] = None # single Person or list[Person] - # Recommended: specify which fields to index for search - metadata: dict = {"index_fields": ["name"]} - - -async def main(): - # Start clean (optional in your app) - await cognee.prune.prune_data() - await 
cognee.prune.prune_system(metadata=True) - - alice = Person(name="Alice") - bob = Person(name="Bob") - charlie = Person(name="Charlie") - - # Create relationships - field name becomes edge label - alice.knows = bob - # You can also do lists: alice.knows = [bob, charlie] - - # Optional: add weights and custom relationship types - bob.knows = (Edge(weight=0.9, relationship_type="friend_of"), charlie) - - await add_data_points([alice, bob, charlie]) - - -asyncio.run(main()) diff --git a/cognee/tests/docs/guides/custom_prompts.py b/cognee/tests/docs/guides/custom_prompts.py deleted file mode 100644 index 0d0a55a80..000000000 --- a/cognee/tests/docs/guides/custom_prompts.py +++ /dev/null @@ -1,30 +0,0 @@ -import asyncio -import cognee -from cognee.api.v1.search import SearchType - -custom_prompt = """ -Extract only people and cities as entities. -Connect people to cities with the relationship "lives_in". -Ignore all other entities. -""" - - -async def main(): - await cognee.add( - [ - "Alice moved to Paris in 2010, while Bob has always lived in New York.", - "Andreas was born in Venice, but later settled in Lisbon.", - "Diana and Tom were born and raised in Helsingy. 
Diana currently resides in Berlin, while Tom never moved.", - ] - ) - await cognee.cognify(custom_prompt=custom_prompt) - - res = await cognee.search( - query_type=SearchType.GRAPH_COMPLETION, - query_text="Where does Alice live?", - ) - print(res) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/custom_tasks_and_pipelines.py b/cognee/tests/docs/guides/custom_tasks_and_pipelines.py deleted file mode 100644 index 202bb128a..000000000 --- a/cognee/tests/docs/guides/custom_tasks_and_pipelines.py +++ /dev/null @@ -1,53 +0,0 @@ -import asyncio -from typing import Any, Dict, List -from pydantic import BaseModel, SkipValidation - -import cognee -from cognee.modules.engine.operations.setup import setup -from cognee.infrastructure.llm.LLMGateway import LLMGateway -from cognee.infrastructure.engine import DataPoint -from cognee.tasks.storage import add_data_points -from cognee.modules.pipelines import Task, run_pipeline - - -class Person(DataPoint): - name: str - # Optional relationships (we'll let the LLM populate this) - knows: List["Person"] = [] - # Make names searchable in the vector store - metadata: Dict[str, Any] = {"index_fields": ["name"]} - - -class People(BaseModel): - persons: List[Person] - - -async def extract_people(text: str) -> List[Person]: - system_prompt = ( - "Extract people mentioned in the text. " - "Return as `persons: Person[]` with each Person having `name` and optional `knows` relations. " - "If the text says someone knows someone set `knows` accordingly. " - "Only include facts explicitly stated." - ) - people = await LLMGateway.acreate_structured_output(text, system_prompt, People) - return people.persons - - -async def main(): - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await setup() - - text = "Alice knows Bob." 
- - tasks = [ - Task(extract_people), # input: text -> output: list[Person] - Task(add_data_points), # input: list[Person] -> output: list[Person] - ] - - async for _ in run_pipeline(tasks=tasks, data=text, datasets=["people_demo"]): - pass - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/graph_visualization.py b/cognee/tests/docs/guides/graph_visualization.py deleted file mode 100644 index d463cbb56..000000000 --- a/cognee/tests/docs/guides/graph_visualization.py +++ /dev/null @@ -1,13 +0,0 @@ -import asyncio -import cognee -from cognee.api.v1.visualize.visualize import visualize_graph - - -async def main(): - await cognee.add(["Alice knows Bob.", "NLP is a subfield of CS."]) - await cognee.cognify() - - await visualize_graph("./graph_after_cognify.html") - - -asyncio.run(main()) diff --git a/cognee/tests/docs/guides/low_level_llm.py b/cognee/tests/docs/guides/low_level_llm.py deleted file mode 100644 index 454f53f44..000000000 --- a/cognee/tests/docs/guides/low_level_llm.py +++ /dev/null @@ -1,31 +0,0 @@ -import asyncio - -from pydantic import BaseModel -from typing import List -from cognee.infrastructure.llm.LLMGateway import LLMGateway - - -class MiniEntity(BaseModel): - name: str - type: str - - -class MiniGraph(BaseModel): - nodes: List[MiniEntity] - - -async def main(): - system_prompt = ( - "Extract entities as nodes with name and type. " - "Use concise, literal values present in the text." - ) - - text = "Apple develops iPhone; Audi produces the R8." 
- - result = await LLMGateway.acreate_structured_output(text, system_prompt, MiniGraph) - print(result) - # MiniGraph(nodes=[MiniEntity(name='Apple', type='Organization'), ...]) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/memify_quickstart.py b/cognee/tests/docs/guides/memify_quickstart.py deleted file mode 100644 index 040654350..000000000 --- a/cognee/tests/docs/guides/memify_quickstart.py +++ /dev/null @@ -1,29 +0,0 @@ -import asyncio -import cognee -from cognee import SearchType - - -async def main(): - # 1) Add two short chats and build a graph - await cognee.add( - [ - "We follow PEP8. Add type hints and docstrings.", - "Releases should not be on Friday. Susan must review PRs.", - ], - dataset_name="rules_demo", - ) - await cognee.cognify(datasets=["rules_demo"]) # builds graph - - # 2) Enrich the graph (uses default memify tasks) - await cognee.memify(dataset="rules_demo") - - # 3) Query the new coding rules - rules = await cognee.search( - query_type=SearchType.CODING_RULES, - query_text="List coding rules", - node_name=["coding_agent_rules"], - ) - print("Rules:", rules) - - -asyncio.run(main()) diff --git a/cognee/tests/docs/guides/ontology_quickstart.py b/cognee/tests/docs/guides/ontology_quickstart.py deleted file mode 100644 index 2784dab19..000000000 --- a/cognee/tests/docs/guides/ontology_quickstart.py +++ /dev/null @@ -1,30 +0,0 @@ -import asyncio -import cognee - - -async def main(): - texts = ["Audi produces the R8 and e-tron.", "Apple develops iPhone and MacBook."] - - await cognee.add(texts) - # or: await cognee.add("/path/to/folder/of/files") - - import os - from cognee.modules.ontology.ontology_config import Config - from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver - - ontology_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "ontology_input_example/basic_ontology.owl" - ) - - # Create full config structure manually - config: Config = { - 
"ontology_config": { - "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_path) - } - } - - await cognee.cognify(config=config) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/s3_storage.py b/cognee/tests/docs/guides/s3_storage.py deleted file mode 100644 index 1044e05b4..000000000 --- a/cognee/tests/docs/guides/s3_storage.py +++ /dev/null @@ -1,25 +0,0 @@ -import asyncio -import cognee - - -async def main(): - # Single file - await cognee.add("s3://cognee-temp/2024-11-04.md") - - # Folder/prefix (recursively expands) - await cognee.add("s3://cognee-temp") - - # Mixed list - await cognee.add( - [ - "s3://cognee-temp/2024-11-04.md", - "Some inline text to ingest", - ] - ) - - # Process the data - await cognee.cognify() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/search_basics.py b/cognee/tests/docs/guides/search_basics.py deleted file mode 100644 index f1847ad4b..000000000 --- a/cognee/tests/docs/guides/search_basics.py +++ /dev/null @@ -1,58 +0,0 @@ -import asyncio -import cognee - -from cognee.modules.search.types import SearchType, CombinedSearchResult - - -async def main(): - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - text = """ - Natural language processing (NLP) is an interdisciplinary - subfield of computer science and information retrieval. - First rule of coding: Do not talk about coding. - """ - - text2 = """ - Sandwiches are best served toasted with cheese, ham, mayo, - lettuce, mustard, and salt & pepper. - """ - - await cognee.add(text, dataset_name="NLP_coding") - await cognee.add(text2, dataset_name="Sandwiches") - await cognee.add(text2) - - await cognee.cognify() - - # Make sure you've already run cognee.cognify(...) 
so the graph has content - answers = await cognee.search(query_text="What are the main themes in my data?") - assert len(answers) > 0 - - answers = await cognee.search( - query_text="List coding guidelines", - query_type=SearchType.CODING_RULES, - ) - assert len(answers) > 0 - - answers = await cognee.search( - query_text="Give me a confident answer: What is NLP?", - system_prompt="Answer succinctly and state confidence at the end.", - ) - assert len(answers) > 0 - - answers = await cognee.search( - query_text="Tell me about NLP", - only_context=True, - ) - assert len(answers) > 0 - - answers = await cognee.search( - query_text="Quarterly financial highlights", - datasets=["NLP_coding", "Sandwiches"], - use_combined_context=True, - ) - assert isinstance(answers, CombinedSearchResult) - - -asyncio.run(main()) diff --git a/cognee/tests/docs/guides/temporal_cognify.py b/cognee/tests/docs/guides/temporal_cognify.py deleted file mode 100644 index 34c1ee33c..000000000 --- a/cognee/tests/docs/guides/temporal_cognify.py +++ /dev/null @@ -1,57 +0,0 @@ -import asyncio -import cognee - - -async def main(): - text = """ - In 1998 the project launched. In 2001 version 1.0 shipped. In 2004 the team merged - with another group. In 2010 support for v1 ended. 
- """ - - await cognee.add(text, dataset_name="timeline_demo") - - await cognee.cognify(datasets=["timeline_demo"], temporal_cognify=True) - - from cognee.api.v1.search import SearchType - - # Before / after queries - result = await cognee.search( - query_type=SearchType.TEMPORAL, query_text="What happened before 2000?", top_k=10 - ) - - assert result != [] - - result = await cognee.search( - query_type=SearchType.TEMPORAL, query_text="What happened after 2010?", top_k=10 - ) - - assert result != [] - - # Between queries - result = await cognee.search( - query_type=SearchType.TEMPORAL, query_text="Events between 2001 and 2004", top_k=10 - ) - - assert result != [] - - # Scoped descriptions - result = await cognee.search( - query_type=SearchType.TEMPORAL, - query_text="Key project milestones between 1998 and 2010", - top_k=10, - ) - - assert result != [] - - result = await cognee.search( - query_type=SearchType.TEMPORAL, - query_text="What happened after 2004?", - datasets=["timeline_demo"], - top_k=10, - ) - - assert result != [] - - -if __name__ == "__main__": - asyncio.run(main()) From 41edeb0cf890e0d0b733bcd4befb03b870e70cbc Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 11 Dec 2025 16:01:26 +0100 Subject: [PATCH 31/37] test: change target repo name --- .github/workflows/release_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index c6dd68484..3fef0732a 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -27,5 +27,5 @@ jobs: curl -sS -X POST \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer ${{ secrets.DOCS_REPO_PAT_TOKEN }}" \ - https://api.github.com/repos/your-org/repo-b/dispatches \ + https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From 
0f4cf15d588e5dfa672d680e5258de284d308367 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 11 Dec 2025 16:24:47 +0100 Subject: [PATCH 32/37] test: fix docs test trigger --- .github/workflows/release_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 3fef0732a..76ce3b09d 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -24,8 +24,9 @@ jobs: steps: - name: Trigger docs tests run: | - curl -sS -X POST \ + curl -L -X POST \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer ${{ secrets.DOCS_REPO_PAT_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From e92d8f57b56823e0a1a4bf5ccf6734cdda01d56f Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:14:14 +0100 Subject: [PATCH 33/37] feat: add comunity test trigger --- .github/workflows/release_test.yml | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 76ce3b09d..be57c7fbf 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -19,14 +19,28 @@ jobs: # uses: ./.github/workflows/load_tests.yml # secrets: inherit - docs-tests: +# docs-tests: +# runs-on: ubuntu-22.04 +# steps: +# - name: Trigger docs tests +# run: | +# curl -L -X POST \ +# -H "Accept: application/vnd.github+json" \ +# -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ +# -H "X-GitHub-Api-Version: 2022-11-28" \ +# https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ +# -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' + + trigger-community-test-suite: + needs: 
release-pypi-package + if: ${{ inputs.flavour == 'main' }} runs-on: ubuntu-22.04 steps: - - name: Trigger docs tests + - name: Trigger community tests run: | curl -L -X POST \ -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.DOCS_REPO_PAT_TOKEN }}" \ + -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ + https://api.github.com/repos/topoteretes/cognee-community/dispatches \ -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From 601f74db4fda3c1bc3603d03bfbe22be7c8d6a24 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:15:43 +0100 Subject: [PATCH 34/37] test: remove dependency from community trigger --- .github/workflows/release_test.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index be57c7fbf..dcb709ead 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -14,6 +14,18 @@ on: - main jobs: + trigger-community-test-suite: + if: ${{ inputs.flavour == 'main' }} + runs-on: ubuntu-22.04 + steps: + - name: Trigger community tests + run: | + curl -L -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/topoteretes/cognee-community/dispatches \ + -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' # load-tests: # name: Load Tests # uses: ./.github/workflows/load_tests.yml @@ -30,17 +42,3 @@ jobs: # -H "X-GitHub-Api-Version: 2022-11-28" \ # https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ # -d 
'{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' - - trigger-community-test-suite: - needs: release-pypi-package - if: ${{ inputs.flavour == 'main' }} - runs-on: ubuntu-22.04 - steps: - - name: Trigger community tests - run: | - curl -L -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/topoteretes/cognee-community/dispatches \ - -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From a5a7ae2564abd90c0bf9b51b3abfc2a24a067a8f Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:16:46 +0100 Subject: [PATCH 35/37] test: remove if --- .github/workflows/release_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index dcb709ead..08595a01e 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -15,7 +15,6 @@ on: jobs: trigger-community-test-suite: - if: ${{ inputs.flavour == 'main' }} runs-on: ubuntu-22.04 steps: - name: Trigger community tests From 6958b4edd462615e2e973d7cabd369181c030eba Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:50:03 +0100 Subject: [PATCH 36/37] feat: add the triggers to release, after pypi publishing --- .github/workflows/release.yml | 28 ++++++++++++++++++++++++++++ .github/workflows/release_test.yml | 30 ++++-------------------------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 84601edf7..26ccce1f0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -136,3 +136,31 @@ jobs: flavour=${{ inputs.flavour }} cache-from: type=registry,ref=cognee/cognee:buildcache cache-to: type=registry,ref=cognee/cognee:buildcache,mode=max + + 
trigger-docs-test-suite: + needs: release-pypi-package + if: ${{ inputs.flavour == 'main' }} + runs-on: ubuntu-22.04 + steps: + - name: Trigger docs tests + run: | + curl -L -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ + -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' + + trigger-community-test-suite: + needs: release-pypi-package + if: ${{ inputs.flavour == 'main' }} + runs-on: ubuntu-22.04 + steps: + - name: Trigger community tests + run: | + curl -L -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/topoteretes/cognee-community/dispatches \ + -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' \ No newline at end of file diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 08595a01e..6090a1217 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -14,30 +14,8 @@ on: - main jobs: - trigger-community-test-suite: - runs-on: ubuntu-22.04 - steps: - - name: Trigger community tests - run: | - curl -L -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/topoteretes/cognee-community/dispatches \ - -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' -# load-tests: -# name: Load Tests -# uses: ./.github/workflows/load_tests.yml -# secrets: inherit + load-tests: + name: Load Tests + uses: ./.github/workflows/load_tests.yml + secrets: inherit -# docs-tests: -# runs-on: ubuntu-22.04 -# 
steps: -# - name: Trigger docs tests -# run: | -# curl -L -X POST \ -# -H "Accept: application/vnd.github+json" \ -# -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ -# -H "X-GitHub-Api-Version: 2022-11-28" \ -# https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ -# -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From 431a83247fff487357a253cbddb00e779e8bda9b Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:50:43 +0100 Subject: [PATCH 37/37] chore: remove unnecessary 'on push' setting --- .github/workflows/release_test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 6090a1217..b31b431a4 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -5,9 +5,6 @@ permissions: contents: read on: - push: - branches: - - feature/cog-3213-docs-set-up-guide-script-tests workflow_dispatch: pull_request: branches: