From 3372679f7bb40c01ffd9e337ead27fe9f8981d54 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 29 Oct 2025 20:12:14 +0530 Subject: [PATCH 01/37] feat: adding last_accessed_at field to the models and updating the retrievers to update the timestamp --- .../modules/chunking/models/DocumentChunk.py | 7 +++ cognee/modules/engine/models/Entity.py | 7 ++- cognee/modules/retrieval/chunks_retriever.py | 55 +++++++---------- .../modules/retrieval/summaries_retriever.py | 28 ++++----- .../retrieval/utils/access_tracking.py | 61 +++++++++++++++++++ cognee/tasks/summarization/models.py | 8 ++- 6 files changed, 115 insertions(+), 51 deletions(-) create mode 100644 cognee/modules/retrieval/utils/access_tracking.py diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 9f8c57486..c4c6a2ed3 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,5 +1,7 @@ from typing import List, Union +from pydantic import BaseModel, Field +from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.data.processing.document_types import Document from cognee.modules.engine.models import Entity @@ -22,6 +24,7 @@ class DocumentChunk(DataPoint): - cut_type: The type of cut that defined this chunk. - is_part_of: The document to which this chunk belongs. - contains: A list of entities or events contained within the chunk (default is None). + - last_accessed_at: The timestamp of the last time the chunk was accessed. - metadata: A dictionary to hold meta information related to the chunk, including index fields. 
""" @@ -32,5 +35,9 @@ class DocumentChunk(DataPoint): cut_type: str is_part_of: Document contains: List[Union[Entity, Event]] = None + + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 36da2e344..3e48ea02a 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,11 +1,14 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.models.EntityType import EntityType from typing import Optional - +from datetime import datetime, timezone +from pydantic import BaseModel, Field class Entity(DataPoint): name: str is_a: Optional[EntityType] = None description: str - + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index 94b9d3fb9..74634b71e 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -1,10 +1,11 @@ from typing import Any, Optional - +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError +from datetime import datetime, timezone logger = get_logger("ChunksRetriever") @@ -27,38 +28,26 @@ class ChunksRetriever(BaseRetriever): ): self.top_k = top_k - async def get_context(self, query: str) -> Any: - """ - Retrieves document chunks context based on 
the query. - - Searches for document chunks relevant to the specified query using a vector engine. - Raises a NoDataError if no data is found in the system. - - Parameters: - ----------- - - - query (str): The query string to search for relevant document chunks. - - Returns: - -------- - - - Any: A list of document chunk payloads retrieved from the search. - """ - logger.info( - f"Starting chunk retrieval for query: '{query[:100]}{'...' if len(query) > 100 else ''}'" - ) - - vector_engine = get_vector_engine() - - try: - found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) - logger.info(f"Found {len(found_chunks)} chunks from vector search") - except CollectionNotFoundError as error: - logger.error("DocumentChunk_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - chunk_payloads = [result.payload for result in found_chunks] - logger.info(f"Returning {len(chunk_payloads)} chunk payloads") + async def get_context(self, query: str) -> Any: + """Retrieves document chunks context based on the query.""" + logger.info( + f"Starting chunk retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" + ) + + vector_engine = get_vector_engine() + + try: + found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) + logger.info(f"Found {len(found_chunks)} chunks from vector search") + + # NEW: Update access timestamps + await update_node_access_timestamps(found_chunks, "DocumentChunk") + except CollectionNotFoundError as error: + logger.error("DocumentChunk_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + chunk_payloads = [result.payload for result in found_chunks] + logger.info(f"Returning {len(chunk_payloads)} chunk payloads") return chunk_payloads async def get_completion( diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 87b224946..7f996274e 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -4,6 +4,7 @@ from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError logger = get_logger("SummariesRetriever") @@ -47,20 +48,19 @@ class SummariesRetriever(BaseRetriever): f"Starting summary retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" ) - vector_engine = get_vector_engine() - - try: - summaries_results = await vector_engine.search( - "TextSummary_text", query, limit=self.top_k - ) - logger.info(f"Found {len(summaries_results)} summaries from vector search") - except CollectionNotFoundError as error: - logger.error("TextSummary_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - summary_payloads = [summary.payload for summary in summaries_results] - logger.info(f"Returning {len(summary_payloads)} summary payloads") - return summary_payloads + vector_engine = get_vector_engine() + + try: + summaries_results = await vector_engine.search( + "TextSummary_text", query, limit=self.top_k + ) + + await update_node_access_timestamps(summaries_results, "TextSummary") + + except CollectionNotFoundError as error: + raise NoDataError("No data found in the system, please add data first.") from error + + return [summary.payload for summary in summaries_results] async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None, **kwargs diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py new file mode 100644 index 000000000..ca5ed88cd --- /dev/null +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -0,0 +1,61 @@ + +"""Utilities for tracking data access in retrievers.""" + +import json +from datetime import datetime, timezone +from typing import List, Any + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.shared.logging_utils import get_logger + +logger = get_logger(__name__) + + +async def update_node_access_timestamps(items: List[Any], node_type: str): + """ + Update last_accessed_at for nodes in Kuzu graph database. 
+ + Parameters + ---------- + items : List[Any] + List of items with payload containing 'id' field (from vector search results) + node_type : str + Type of node to update (e.g., 'DocumentChunk', 'Entity', 'TextSummary') + """ + if not items: + return + + graph_engine = await get_graph_engine() + # Convert to milliseconds since epoch (matching the field format) + timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) + + for item in items: + # Extract ID from payload (vector search results have this structure) + item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") + if not item_id: + continue + + try: + # Get current node properties from Kuzu's Node table + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) WHERE n.type = $node_type RETURN n.properties as props", + {"id": str(item_id), "node_type": node_type} + ) + + if result and len(result) > 0 and result[0][0]: + # Parse existing properties JSON + props = json.loads(result[0][0]) if result[0][0] else {} + # Update last_accessed_at with millisecond timestamp + props["last_accessed_at"] = timestamp_ms + + # Write back to graph database + await graph_engine.query( + "MATCH (n:Node {id: $id}) WHERE n.type = $node_type SET n.properties = $props", + {"id": str(item_id), "node_type": node_type, "props": json.dumps(props)} + ) + except Exception as e: + logger.warning(f"Failed to update timestamp for {node_type} {item_id}: {e}") + continue + + logger.debug(f"Updated access timestamps for {len(items)} {node_type} nodes") + diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 75ed82d50..46f9a8d8b 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,5 +1,7 @@ -from typing import Union +from pydantic import BaseModel, Field +from typing import Union +from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import 
DocumentChunk from cognee.shared.CodeGraphEntities import CodeFile, CodePart @@ -17,7 +19,9 @@ class TextSummary(DataPoint): text: str made_from: DocumentChunk - + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) metadata: dict = {"index_fields": ["text"]} From 3f27c5592b58af29369125362510e96b72c56cbc Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 29 Oct 2025 20:17:27 +0530 Subject: [PATCH 02/37] feat: adding last_accessed_at field to the models and updating the retrievers to update the timestamp --- cognee/modules/retrieval/chunks_retriever.py | 48 +++++++++++-------- .../modules/retrieval/summaries_retriever.py | 28 ++++++----- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index 74634b71e..f821fc902 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -29,26 +29,34 @@ class ChunksRetriever(BaseRetriever): self.top_k = top_k async def get_context(self, query: str) -> Any: - """Retrieves document chunks context based on the query.""" - logger.info( - f"Starting chunk retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" - ) - - vector_engine = get_vector_engine() - - try: - found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) - logger.info(f"Found {len(found_chunks)} chunks from vector search") - - # NEW: Update access timestamps - await update_node_access_timestamps(found_chunks, "DocumentChunk") - except CollectionNotFoundError as error: - logger.error("DocumentChunk_text collection not found in vector database") - raise NoDataError("No data found in the system, please add data first.") from error - - chunk_payloads = [result.payload for result in found_chunks] - logger.info(f"Returning {len(chunk_payloads)} chunk payloads") - return chunk_payloads + """ + Retrieves document chunks context based on the query. + Searches for document chunks relevant to the specified query using a vector engine. + Raises a NoDataError if no data is found in the system. + Parameters: + ----------- + - query (str): The query string to search for relevant document chunks. + Returns: + -------- + - Any: A list of document chunk payloads retrieved from the search. + """ + logger.info( + f"Starting chunk retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" + ) + + vector_engine = get_vector_engine() + + try: + found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) + logger.info(f"Found {len(found_chunks)} chunks from vector search") + await update_node_access_timestamps(found_chunks, "DocumentChunk") + + except CollectionNotFoundError as error: + logger.error("DocumentChunk_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + chunk_payloads = [result.payload for result in found_chunks] + logger.info(f"Returning {len(chunk_payloads)} chunk payloads") async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 7f996274e..9ac8b096d 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -48,19 +48,23 @@ class SummariesRetriever(BaseRetriever): f"Starting summary retrieval for query: '{query[:100]}{'...' 
if len(query) > 100 else ''}'" ) - vector_engine = get_vector_engine() - - try: - summaries_results = await vector_engine.search( - "TextSummary_text", query, limit=self.top_k - ) - + vector_engine = get_vector_engine() + + try: + summaries_results = await vector_engine.search( + "TextSummary_text", query, limit=self.top_k + ) + logger.info(f"Found {len(summaries_results)} summaries from vector search") + await update_node_access_timestamps(summaries_results, "TextSummary") - - except CollectionNotFoundError as error: - raise NoDataError("No data found in the system, please add data first.") from error - - return [summary.payload for summary in summaries_results] + + except CollectionNotFoundError as error: + logger.error("TextSummary_text collection not found in vector database") + raise NoDataError("No data found in the system, please add data first.") from error + + summary_payloads = [summary.payload for summary in summaries_results] + logger.info(f"Returning {len(summary_payloads)} summary payloads") + return summary_payloads async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None, **kwargs From 5f6f0502c832d129749b453121c6f5be565044bc Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 00:00:18 +0530 Subject: [PATCH 03/37] fix: removing last_acessed_at from individual model and adding it to DataPoint --- cognee/infrastructure/engine/models/DataPoint.py | 3 +++ cognee/modules/chunking/models/DocumentChunk.py | 5 ----- cognee/modules/engine/models/Entity.py | 3 --- cognee/tasks/summarization/models.py | 3 --- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 812380eaa..3178713c8 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -43,6 +43,9 @@ class DataPoint(BaseModel): updated_at: int = Field( 
default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) ) + last_accessed_at: int = Field( + default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) + ) ontology_valid: bool = False version: int = 1 # Default version topological_rank: Optional[int] = 0 diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index c4c6a2ed3..601454802 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -35,9 +35,4 @@ class DocumentChunk(DataPoint): cut_type: str is_part_of: Document contains: List[Union[Entity, Event]] = None - - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) - metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 3e48ea02a..4083cd2e6 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -8,7 +8,4 @@ class Entity(DataPoint): name: str is_a: Optional[EntityType] = None description: str - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 46f9a8d8b..8cee2ade3 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -19,9 +19,6 @@ class TextSummary(DataPoint): text: str made_from: DocumentChunk - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) metadata: dict = {"index_fields": ["text"]} From 6f06e4a5eb1143ddcb2ad08132486630b8a2deae Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 00:17:13 +0530 Subject: [PATCH 04/37] fix: removing node_type and try except --- cognee/modules/retrieval/chunks_retriever.py | 
2 +- .../modules/retrieval/summaries_retriever.py | 2 +- .../retrieval/utils/access_tracking.py | 55 ++++++++++--------- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index f821fc902..be1f95811 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -49,7 +49,7 @@ class ChunksRetriever(BaseRetriever): try: found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) logger.info(f"Found {len(found_chunks)} chunks from vector search") - await update_node_access_timestamps(found_chunks, "DocumentChunk") + await update_node_access_timestamps(found_chunks) except CollectionNotFoundError as error: logger.error("DocumentChunk_text collection not found in vector database") diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 9ac8b096d..0df750d22 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -56,7 +56,7 @@ class SummariesRetriever(BaseRetriever): ) logger.info(f"Found {len(summaries_results)} summaries from vector search") - await update_node_access_timestamps(summaries_results, "TextSummary") + await update_node_access_timestamps(summaries_results) except CollectionNotFoundError as error: logger.error("TextSummary_text collection not found in vector database") diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index ca5ed88cd..79afd25db 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -1,4 +1,4 @@ - + """Utilities for tracking data access in retrievers.""" import json @@ -11,51 +11,54 @@ from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) -async def update_node_access_timestamps(items: 
List[Any], node_type: str): +async def update_node_access_timestamps(items: List[Any]): """ Update last_accessed_at for nodes in Kuzu graph database. + Automatically determines node type from the graph database. Parameters ---------- items : List[Any] List of items with payload containing 'id' field (from vector search results) - node_type : str - Type of node to update (e.g., 'DocumentChunk', 'Entity', 'TextSummary') """ if not items: return graph_engine = await get_graph_engine() - # Convert to milliseconds since epoch (matching the field format) timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) for item in items: - # Extract ID from payload (vector search results have this structure) + # Extract ID from payload item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if not item_id: continue - try: - # Get current node properties from Kuzu's Node table - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) WHERE n.type = $node_type RETURN n.properties as props", - {"id": str(item_id), "node_type": node_type} + # try: + # Query to get both node type and properties in one call + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.type as node_type, n.properties as props", + {"id": str(item_id)} + ) + + if result and len(result) > 0 and result[0]: + node_type = result[0][0] # First column: node_type + props_json = result[0][1] # Second column: properties + + # Parse existing properties JSON + props = json.loads(props_json) if props_json else {} + # Update last_accessed_at with millisecond timestamp + props["last_accessed_at"] = timestamp_ms + + # Write back to graph database + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": str(item_id), "props": json.dumps(props)} ) - if result and len(result) > 0 and result[0][0]: - # Parse existing properties JSON - props = json.loads(result[0][0]) if result[0][0] else {} - # Update last_accessed_at with millisecond 
timestamp - props["last_accessed_at"] = timestamp_ms + logger.debug(f"Updated access timestamp for {node_type} node {item_id}") - # Write back to graph database - await graph_engine.query( - "MATCH (n:Node {id: $id}) WHERE n.type = $node_type SET n.properties = $props", - {"id": str(item_id), "node_type": node_type, "props": json.dumps(props)} - ) - except Exception as e: - logger.warning(f"Failed to update timestamp for {node_type} {item_id}: {e}") - continue + # except Exception as e: + # logger.error(f"Failed to update timestamp for node {item_id}: {e}") + # continue - logger.debug(f"Updated access timestamps for {len(items)} {node_type} nodes") - + logger.debug(f"Updated access timestamps for {len(items)} nodes") From f1afd1f0a2a5433dc341c485b08ce33d1bc16252 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 31 Oct 2025 15:49:34 +0530 Subject: [PATCH 05/37] feat: adding cleanup function and adding update_node_acess_timestamps in completion retriever and graph_completion retriever --- .../modules/retrieval/completion_retriever.py | 3 +- .../retrieval/graph_completion_retriever.py | 13 +- cognee/tasks/cleanup/cleanup_unused_data.py | 232 ++++++++++++++++++ 3 files changed, 246 insertions(+), 2 deletions(-) create mode 100644 cognee/tasks/cleanup/cleanup_unused_data.py diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index bb568924d..fc8ef747f 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -8,6 +8,7 @@ from cognee.modules.retrieval.utils.session_cache import ( save_conversation_history, get_conversation_history, ) +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError @@ 
-65,7 +66,7 @@ class CompletionRetriever(BaseRetriever): if len(found_chunks) == 0: return "" - + await update_node_access_timestamps(found_chunks) # Combine all chunks text returned from vector search (number of chunks is determined by top_k chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks] combined_context = "\n".join(chunks_payload) diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index b7ab4edae..ac7e45e3c 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -16,6 +16,7 @@ from cognee.modules.retrieval.utils.session_cache import ( ) from cognee.shared.logging_utils import get_logger from cognee.modules.retrieval.utils.extract_uuid_from_node import extract_uuid_from_node +from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps from cognee.modules.retrieval.utils.models import CogneeUserInteraction from cognee.modules.engine.models.node_set import NodeSet from cognee.infrastructure.databases.graph import get_graph_engine @@ -138,7 +139,17 @@ class GraphCompletionRetriever(BaseGraphRetriever): return [] # context = await self.resolve_edges_to_text(triplets) - + entity_nodes = [] + seen_ids = set() + for triplet in triplets: + if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node1.id)}) + seen_ids.add(triplet.node1.id) + if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node2.id)}) + seen_ids.add(triplet.node2.id) + + await update_node_access_timestamps(entity_nodes) return triplets async def get_completion( diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py new file mode 100644 index 000000000..e97692bb4 --- /dev/null +++ 
b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -0,0 +1,232 @@ +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.shared.logging_utils import get_logger + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: int = 30, + dry_run: bool = True, + user_id: Optional[UUID] = None +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused (default: 30) + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None + ) + + # Calculate cutoff timestamp in milliseconds + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in 
unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None +) -> Dict[str, list]: + """ + Query Kuzu for nodes with old last_accessed_at timestamps. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Query all nodes with their properties + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = { + "DocumentChunk": [], + "Entity": [], + "TextSummary": [] + } + + for node_id, node_type, props_json in results: + # Only process tracked node types + if node_type not in unused_nodes: + continue + + # Parse properties JSON + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + # Check if node is unused (never accessed or accessed before cutoff) + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + # TODO: Add user_id filtering when user ownership is implemented + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + if result and len(result) > 0: + deleted_counts["associations"] += result[0][0] + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + # Delete from vector collection + if await vector_engine.has_collection(collection_name): + for node_id in node_ids: + try: + await vector_engine.delete(collection_name, {"id": str(node_id)}) + except Exception as e: + logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + + return deleted_counts From 
5080e8f8a5c20d092b917b66eb52a577fe899231 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 3 Nov 2025 00:59:04 +0530 Subject: [PATCH 06/37] feat: genarlizing getting entities from triplets --- cognee/modules/graph/utils/__init__.py | 1 + .../graph/utils/get_entity_nodes_from_triplets.py | 13 +++++++++++++ .../modules/retrieval/graph_completion_retriever.py | 12 +++--------- 3 files changed, 17 insertions(+), 9 deletions(-) create mode 100644 cognee/modules/graph/utils/get_entity_nodes_from_triplets.py diff --git a/cognee/modules/graph/utils/__init__.py b/cognee/modules/graph/utils/__init__.py index ebc648495..4c0b29d47 100644 --- a/cognee/modules/graph/utils/__init__.py +++ b/cognee/modules/graph/utils/__init__.py @@ -5,3 +5,4 @@ from .retrieve_existing_edges import retrieve_existing_edges from .convert_node_to_data_point import convert_node_to_data_point from .deduplicate_nodes_and_edges import deduplicate_nodes_and_edges from .resolve_edges_to_text import resolve_edges_to_text +from .get_entity_nodes_from_triplets import get_entity_nodes_from_triplets diff --git a/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py b/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py new file mode 100644 index 000000000..598a36854 --- /dev/null +++ b/cognee/modules/graph/utils/get_entity_nodes_from_triplets.py @@ -0,0 +1,13 @@ + +def get_entity_nodes_from_triplets(triplets): + entity_nodes = [] + seen_ids = set() + for triplet in triplets: + if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node1.id)}) + seen_ids.add(triplet.node1.id) + if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids: + entity_nodes.append({"id": str(triplet.node2.id)}) + seen_ids.add(triplet.node2.id) + + return entity_nodes diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index ac7e45e3c..122cc943f 100644 
--- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -22,6 +22,7 @@ from cognee.modules.engine.models.node_set import NodeSet from cognee.infrastructure.databases.graph import get_graph_engine from cognee.context_global_variables import session_user from cognee.infrastructure.databases.cache.config import CacheConfig +from cognee.modules.graph.utils import get_entity_nodes_from_triplets logger = get_logger("GraphCompletionRetriever") @@ -139,15 +140,8 @@ class GraphCompletionRetriever(BaseGraphRetriever): return [] # context = await self.resolve_edges_to_text(triplets) - entity_nodes = [] - seen_ids = set() - for triplet in triplets: - if hasattr(triplet, 'node1') and triplet.node1 and triplet.node1.id not in seen_ids: - entity_nodes.append({"id": str(triplet.node1.id)}) - seen_ids.add(triplet.node1.id) - if hasattr(triplet, 'node2') and triplet.node2 and triplet.node2.id not in seen_ids: - entity_nodes.append({"id": str(triplet.node2.id)}) - seen_ids.add(triplet.node2.id) + + entity_nodes = get_entity_nodes_from_triplets(triplets) await update_node_access_timestamps(entity_nodes) return triplets From 90d10e6f9af50c85fbbf282dd961719d5da7f922 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 3 Nov 2025 15:31:09 +0100 Subject: [PATCH 07/37] test: Add docs tests. Initial commit, still WIP. 
--- .github/workflows/docs_tests.yml | 18 ++++++ .../tests/docs/guides/custom_data_models.py | 38 +++++++++++++ cognee/tests/docs/guides/custom_prompts.py | 30 ++++++++++ .../docs/guides/custom_tasks_and_pipelines.py | 53 +++++++++++++++++ .../tests/docs/guides/graph_visualization.py | 13 +++++ cognee/tests/docs/guides/low_level_llm.py | 31 ++++++++++ cognee/tests/docs/guides/memify_quickstart.py | 29 ++++++++++ .../tests/docs/guides/ontology_quickstart.py | 30 ++++++++++ cognee/tests/docs/guides/s3_storage.py | 25 ++++++++ cognee/tests/docs/guides/search_basics.py | 17 ++++++ cognee/tests/docs/guides/temporal_cognify.py | 57 +++++++++++++++++++ 11 files changed, 341 insertions(+) create mode 100644 .github/workflows/docs_tests.yml create mode 100644 cognee/tests/docs/guides/custom_data_models.py create mode 100644 cognee/tests/docs/guides/custom_prompts.py create mode 100644 cognee/tests/docs/guides/custom_tasks_and_pipelines.py create mode 100644 cognee/tests/docs/guides/graph_visualization.py create mode 100644 cognee/tests/docs/guides/low_level_llm.py create mode 100644 cognee/tests/docs/guides/memify_quickstart.py create mode 100644 cognee/tests/docs/guides/ontology_quickstart.py create mode 100644 cognee/tests/docs/guides/s3_storage.py create mode 100644 cognee/tests/docs/guides/search_basics.py create mode 100644 cognee/tests/docs/guides/temporal_cognify.py diff --git a/.github/workflows/docs_tests.yml b/.github/workflows/docs_tests.yml new file mode 100644 index 000000000..b3c538668 --- /dev/null +++ b/.github/workflows/docs_tests.yml @@ -0,0 +1,18 @@ +name: Docs Test Suite +permissions: + contents: read + +on: + release: + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + RUNTIME__LOG_LEVEL: ERROR + ENV: 'dev' + +jobs: + diff --git a/cognee/tests/docs/guides/custom_data_models.py b/cognee/tests/docs/guides/custom_data_models.py new file mode 100644 index 000000000..0eb314227 --- 
/dev/null +++ b/cognee/tests/docs/guides/custom_data_models.py @@ -0,0 +1,38 @@ +import asyncio +from typing import Any +from pydantic import SkipValidation + +import cognee +from cognee.infrastructure.engine import DataPoint +from cognee.infrastructure.engine.models.Edge import Edge +from cognee.tasks.storage import add_data_points + + +class Person(DataPoint): + name: str + # Keep it simple for forward refs / mixed values + knows: SkipValidation[Any] = None # single Person or list[Person] + # Recommended: specify which fields to index for search + metadata: dict = {"index_fields": ["name"]} + + +async def main(): + # Start clean (optional in your app) + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + alice = Person(name="Alice") + bob = Person(name="Bob") + charlie = Person(name="Charlie") + + # Create relationships - field name becomes edge label + alice.knows = bob + # You can also do lists: alice.knows = [bob, charlie] + + # Optional: add weights and custom relationship types + bob.knows = (Edge(weight=0.9, relationship_type="friend_of"), charlie) + + await add_data_points([alice, bob, charlie]) + + +asyncio.run(main()) diff --git a/cognee/tests/docs/guides/custom_prompts.py b/cognee/tests/docs/guides/custom_prompts.py new file mode 100644 index 000000000..0d0a55a80 --- /dev/null +++ b/cognee/tests/docs/guides/custom_prompts.py @@ -0,0 +1,30 @@ +import asyncio +import cognee +from cognee.api.v1.search import SearchType + +custom_prompt = """ +Extract only people and cities as entities. +Connect people to cities with the relationship "lives_in". +Ignore all other entities. +""" + + +async def main(): + await cognee.add( + [ + "Alice moved to Paris in 2010, while Bob has always lived in New York.", + "Andreas was born in Venice, but later settled in Lisbon.", + "Diana and Tom were born and raised in Helsingy. 
Diana currently resides in Berlin, while Tom never moved.", + ] + ) + await cognee.cognify(custom_prompt=custom_prompt) + + res = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="Where does Alice live?", + ) + print(res) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/custom_tasks_and_pipelines.py b/cognee/tests/docs/guides/custom_tasks_and_pipelines.py new file mode 100644 index 000000000..202bb128a --- /dev/null +++ b/cognee/tests/docs/guides/custom_tasks_and_pipelines.py @@ -0,0 +1,53 @@ +import asyncio +from typing import Any, Dict, List +from pydantic import BaseModel, SkipValidation + +import cognee +from cognee.modules.engine.operations.setup import setup +from cognee.infrastructure.llm.LLMGateway import LLMGateway +from cognee.infrastructure.engine import DataPoint +from cognee.tasks.storage import add_data_points +from cognee.modules.pipelines import Task, run_pipeline + + +class Person(DataPoint): + name: str + # Optional relationships (we'll let the LLM populate this) + knows: List["Person"] = [] + # Make names searchable in the vector store + metadata: Dict[str, Any] = {"index_fields": ["name"]} + + +class People(BaseModel): + persons: List[Person] + + +async def extract_people(text: str) -> List[Person]: + system_prompt = ( + "Extract people mentioned in the text. " + "Return as `persons: Person[]` with each Person having `name` and optional `knows` relations. " + "If the text says someone knows someone set `knows` accordingly. " + "Only include facts explicitly stated." + ) + people = await LLMGateway.acreate_structured_output(text, system_prompt, People) + return people.persons + + +async def main(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await setup() + + text = "Alice knows Bob." 
+ + tasks = [ + Task(extract_people), # input: text -> output: list[Person] + Task(add_data_points), # input: list[Person] -> output: list[Person] + ] + + async for _ in run_pipeline(tasks=tasks, data=text, datasets=["people_demo"]): + pass + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/graph_visualization.py b/cognee/tests/docs/guides/graph_visualization.py new file mode 100644 index 000000000..d463cbb56 --- /dev/null +++ b/cognee/tests/docs/guides/graph_visualization.py @@ -0,0 +1,13 @@ +import asyncio +import cognee +from cognee.api.v1.visualize.visualize import visualize_graph + + +async def main(): + await cognee.add(["Alice knows Bob.", "NLP is a subfield of CS."]) + await cognee.cognify() + + await visualize_graph("./graph_after_cognify.html") + + +asyncio.run(main()) diff --git a/cognee/tests/docs/guides/low_level_llm.py b/cognee/tests/docs/guides/low_level_llm.py new file mode 100644 index 000000000..454f53f44 --- /dev/null +++ b/cognee/tests/docs/guides/low_level_llm.py @@ -0,0 +1,31 @@ +import asyncio + +from pydantic import BaseModel +from typing import List +from cognee.infrastructure.llm.LLMGateway import LLMGateway + + +class MiniEntity(BaseModel): + name: str + type: str + + +class MiniGraph(BaseModel): + nodes: List[MiniEntity] + + +async def main(): + system_prompt = ( + "Extract entities as nodes with name and type. " + "Use concise, literal values present in the text." + ) + + text = "Apple develops iPhone; Audi produces the R8." 
+ + result = await LLMGateway.acreate_structured_output(text, system_prompt, MiniGraph) + print(result) + # MiniGraph(nodes=[MiniEntity(name='Apple', type='Organization'), ...]) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/memify_quickstart.py b/cognee/tests/docs/guides/memify_quickstart.py new file mode 100644 index 000000000..040654350 --- /dev/null +++ b/cognee/tests/docs/guides/memify_quickstart.py @@ -0,0 +1,29 @@ +import asyncio +import cognee +from cognee import SearchType + + +async def main(): + # 1) Add two short chats and build a graph + await cognee.add( + [ + "We follow PEP8. Add type hints and docstrings.", + "Releases should not be on Friday. Susan must review PRs.", + ], + dataset_name="rules_demo", + ) + await cognee.cognify(datasets=["rules_demo"]) # builds graph + + # 2) Enrich the graph (uses default memify tasks) + await cognee.memify(dataset="rules_demo") + + # 3) Query the new coding rules + rules = await cognee.search( + query_type=SearchType.CODING_RULES, + query_text="List coding rules", + node_name=["coding_agent_rules"], + ) + print("Rules:", rules) + + +asyncio.run(main()) diff --git a/cognee/tests/docs/guides/ontology_quickstart.py b/cognee/tests/docs/guides/ontology_quickstart.py new file mode 100644 index 000000000..2784dab19 --- /dev/null +++ b/cognee/tests/docs/guides/ontology_quickstart.py @@ -0,0 +1,30 @@ +import asyncio +import cognee + + +async def main(): + texts = ["Audi produces the R8 and e-tron.", "Apple develops iPhone and MacBook."] + + await cognee.add(texts) + # or: await cognee.add("/path/to/folder/of/files") + + import os + from cognee.modules.ontology.ontology_config import Config + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + + ontology_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "ontology_input_example/basic_ontology.owl" + ) + + # Create full config structure manually + config: Config = { + 
"ontology_config": { + "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_path) + } + } + + await cognee.cognify(config=config) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/s3_storage.py b/cognee/tests/docs/guides/s3_storage.py new file mode 100644 index 000000000..1044e05b4 --- /dev/null +++ b/cognee/tests/docs/guides/s3_storage.py @@ -0,0 +1,25 @@ +import asyncio +import cognee + + +async def main(): + # Single file + await cognee.add("s3://cognee-temp/2024-11-04.md") + + # Folder/prefix (recursively expands) + await cognee.add("s3://cognee-temp") + + # Mixed list + await cognee.add( + [ + "s3://cognee-temp/2024-11-04.md", + "Some inline text to ingest", + ] + ) + + # Process the data + await cognee.cognify() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/docs/guides/search_basics.py b/cognee/tests/docs/guides/search_basics.py new file mode 100644 index 000000000..67d0c938d --- /dev/null +++ b/cognee/tests/docs/guides/search_basics.py @@ -0,0 +1,17 @@ +import asyncio +import cognee + + +async def main(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + text = "First rule of coding: Do not talk about coding." + + # Make sure you've already run cognee.cognify(...) so the graph has content + answers = await cognee.search(query_text="What are the main themes in my data?") + for answer in answers: + print(answer) + + +asyncio.run(main()) diff --git a/cognee/tests/docs/guides/temporal_cognify.py b/cognee/tests/docs/guides/temporal_cognify.py new file mode 100644 index 000000000..34c1ee33c --- /dev/null +++ b/cognee/tests/docs/guides/temporal_cognify.py @@ -0,0 +1,57 @@ +import asyncio +import cognee + + +async def main(): + text = """ + In 1998 the project launched. In 2001 version 1.0 shipped. In 2004 the team merged + with another group. In 2010 support for v1 ended. 
+ """ + + await cognee.add(text, dataset_name="timeline_demo") + + await cognee.cognify(datasets=["timeline_demo"], temporal_cognify=True) + + from cognee.api.v1.search import SearchType + + # Before / after queries + result = await cognee.search( + query_type=SearchType.TEMPORAL, query_text="What happened before 2000?", top_k=10 + ) + + assert result != [] + + result = await cognee.search( + query_type=SearchType.TEMPORAL, query_text="What happened after 2010?", top_k=10 + ) + + assert result != [] + + # Between queries + result = await cognee.search( + query_type=SearchType.TEMPORAL, query_text="Events between 2001 and 2004", top_k=10 + ) + + assert result != [] + + # Scoped descriptions + result = await cognee.search( + query_type=SearchType.TEMPORAL, + query_text="Key project milestones between 1998 and 2010", + top_k=10, + ) + + assert result != [] + + result = await cognee.search( + query_type=SearchType.TEMPORAL, + query_text="What happened after 2004?", + datasets=["timeline_demo"], + top_k=10, + ) + + assert result != [] + + +if __name__ == "__main__": + asyncio.run(main()) From d34fd9237bf41c6b421bd556541b50ea68246e45 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 4 Nov 2025 22:04:32 +0530 Subject: [PATCH 08/37] feat: adding last_acessed in the Data model --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 30 ++++++ cognee/modules/data/models/Data.py | 1 + .../retrieval/utils/access_tracking.py | 102 ++++++++++++------ 3 files changed, 100 insertions(+), 33 deletions(-) create mode 100644 alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py new file mode 100644 index 000000000..0ccefa63b --- /dev/null +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -0,0 +1,30 @@ +"""add_last_accessed_to_data + +Revision ID: e1ec1dcb50b6 +Revises: 211ab850ef3d +Create Date: 2025-11-04 21:45:52.642322 + +""" 
+from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'e1ec1dcb50b6' +down_revision: Union[str, None] = '211ab850ef3d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + # Optionally initialize with created_at values for existing records + op.execute("UPDATE data SET last_accessed = created_at") + + +def downgrade() -> None: + op.drop_column('data', 'last_accessed') diff --git a/cognee/modules/data/models/Data.py b/cognee/modules/data/models/Data.py index ef228f2e1..27ab7481e 100644 --- a/cognee/modules/data/models/Data.py +++ b/cognee/modules/data/models/Data.py @@ -36,6 +36,7 @@ class Data(Base): data_size = Column(Integer, nullable=True) # File size in bytes created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) + last_accessed = Column(DateTime(timezone=True), nullable=True) datasets = relationship( "Dataset", diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 79afd25db..621e09e27 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -1,20 +1,27 @@ - """Utilities for tracking data access in retrievers.""" import json from datetime import datetime, timezone from typing import List, Any +from uuid import UUID from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger +from sqlalchemy import update logger = get_logger(__name__) async def 
update_node_access_timestamps(items: List[Any]): """ - Update last_accessed_at for nodes in Kuzu graph database. - Automatically determines node type from the graph database. + Update last_accessed_at for nodes in graph database and corresponding Data records in SQL. + + This function: + 1. Updates last_accessed_at in the graph database nodes (in properties JSON) + 2. Traverses to find origin TextDocument nodes + 3. Updates last_accessed in the SQL Data table for those documents Parameters ---------- @@ -26,39 +33,68 @@ async def update_node_access_timestamps(items: List[Any]): graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) + timestamp_dt = datetime.now(timezone.utc) + # Extract node IDs + node_ids = [] for item in items: - # Extract ID from payload item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") - if not item_id: - continue - - # try: - # Query to get both node type and properties in one call - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.type as node_type, n.properties as props", - {"id": str(item_id)} - ) - - if result and len(result) > 0 and result[0]: - node_type = result[0][0] # First column: node_type - props_json = result[0][1] # Second column: properties - - # Parse existing properties JSON - props = json.loads(props_json) if props_json else {} - # Update last_accessed_at with millisecond timestamp - props["last_accessed_at"] = timestamp_ms - - # Write back to graph database - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": str(item_id), "props": json.dumps(props)} + if item_id: + node_ids.append(str(item_id)) + + if not node_ids: + return + + try: + # Step 1: Batch update graph nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} ) - logger.debug(f"Updated access timestamp for {node_type} node {item_id}") + if 
result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms - # except Exception as e: - # logger.error(f"Failed to update timestamp for node {item_id}: {e}") - # continue - - logger.debug(f"Updated access timestamps for {len(items)} nodes") + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + + logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") + + # Step 2: Find origin TextDocument nodes + origin_query = """ + UNWIND $node_ids AS node_id + MATCH (n:Node {id: node_id}) + OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) + WHERE (e.relationship_name = 'contains' OR e.relationship_name = 'made_from') + AND chunk.type = 'DocumentChunk' + OPTIONAL MATCH (chunk)-[e2:EDGE]->(doc:Node) + WHERE e2.relationship_name = 'is_part_of' + AND doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] + RETURN DISTINCT doc.id as doc_id + """ + + result = await graph_engine.query(origin_query, {"node_ids": node_ids}) + + # Extract document IDs + doc_ids = [row[0] for row in result if row and row[0]] if result else [] + + # Step 3: Update SQL Data table + if doc_ids: + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + stmt = update(Data).where( + Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) + ).values(last_accessed=timestamp_dt) + + await session.execute(stmt) + await session.commit() + + logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL") + + except Exception as e: + logger.error(f"Failed to update timestamps: {e}") + raise From 3c0e915812a4ffb8662419647572c6229ed963a9 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 12:25:51 +0530 Subject: [PATCH 09/37] fix: removing hard relations --- .../modules/retrieval/utils/access_tracking.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 
deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 621e09e27..36c0b7f50 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -20,7 +20,7 @@ async def update_node_access_timestamps(items: List[Any]): This function: 1. Updates last_accessed_at in the graph database nodes (in properties JSON) - 2. Traverses to find origin TextDocument nodes + 2. Traverses to find origin TextDocument nodes (without hardcoded relationship names) 3. Updates last_accessed in the SQL Data table for those documents Parameters @@ -64,23 +64,21 @@ async def update_node_access_timestamps(items: List[Any]): logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") - # Step 2: Find origin TextDocument nodes + # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) origin_query = """ UNWIND $node_ids AS node_id MATCH (n:Node {id: node_id}) OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) - WHERE (e.relationship_name = 'contains' OR e.relationship_name = 'made_from') - AND chunk.type = 'DocumentChunk' - OPTIONAL MATCH (chunk)-[e2:EDGE]->(doc:Node) - WHERE e2.relationship_name = 'is_part_of' - AND doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] + WHERE chunk.type = 'DocumentChunk' + OPTIONAL MATCH (chunk)-[e2:EDGE]-(doc:Node) + WHERE doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] RETURN DISTINCT doc.id as doc_id """ result = await graph_engine.query(origin_query, {"node_ids": node_ids}) - # Extract document IDs - doc_ids = [row[0] for row in result if row and row[0]] if result else [] + # Extract and deduplicate document IDs + doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else [] # Step 3: Update SQL Data table if doc_ids: From 9041a804ecc2d0be1903c2de0ac875f32fcc553c Mon Sep 17 
00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 18:32:49 +0530 Subject: [PATCH 10/37] fix: add text_doc flag --- cognee/tasks/cleanup/cleanup_unused_data.py | 520 ++++++++++++-------- 1 file changed, 312 insertions(+), 208 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index e97692bb4..c9c711fe2 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,232 +1,336 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID - -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.shared.logging_utils import get_logger - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. 
+""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: int = 30, + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused (default: 30) + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes directly from graph + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} 
({cutoff_timestamp_ms}ms)") + + # Find unused nodes + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: int = 30, - dry_run: bool = True, +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused (default: 30) + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) - user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - - Returns - ------- - Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None - ) - - # Calculate cutoff timestamp in milliseconds - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") - - # Find unused nodes - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) - - return { - "status": "completed", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - 
"associations": deleted_counts["associations"] - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None -) -> Dict[str, list]: - """ - Query Kuzu for nodes with old last_accessed_at timestamps. - - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + If True, only report what would be deleted user_id : UUID, optional Filter by user ID if provided Returns ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs + Dict[str, Any] + Cleanup results """ - graph_engine = await get_graph_engine() + db_engine = get_relational_engine() - # Query all nodes with their properties - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = { - "DocumentChunk": [], - "Entity": [], - "TextSummary": [] - } - - for node_id, node_type, props_json in results: - # Only process tracked node types - if node_type not in unused_nodes: - continue - - # Parse properties JSON - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - # Check if node is unused (never accessed or accessed before cutoff) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - # TODO: Add user_id filtering when user ownership is implemented - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) ) - if result and len(result) > 0: - deleted_counts["associations"] += result[0][0] + ) + + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + + result = await session.execute(query) + unused_data = result.all() - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) + logger.info(f"Found {len(unused_data)} unused documents in SQL") - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } + if dry_run: + return { + "status": "dry_run", + "unused_count": len(unused_data), 
+ "deleted_count": { + "data_items": 0, + "documents": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: try: - # Delete from vector collection - if await vector_engine.has_collection(collection_name): - for node_id in node_ids: - try: - await vector_engine.delete(collection_name, {"id": str(node_id)}) - except Exception as e: - logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") + logger.error(f"Failed to delete document {data.id}: {e}") + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None +) -> Dict[str, list]: + """ + Query Kuzu for nodes with old last_accessed_at timestamps. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Query all nodes with their properties + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = { + "DocumentChunk": [], + "Entity": [], + "TextSummary": [] + } + + for node_id, node_type, props_json in results: + # Only process tracked node types + if node_type not in unused_nodes: + continue + + # Parse properties JSON + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + # Check if node is unused (never accessed or accessed before cutoff) + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + if result and len(result) > 0: + deleted_counts["associations"] += result[0][0] + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + # Delete from vector collection + if await vector_engine.has_collection(collection_name): + for node_id in node_ids: + try: + await vector_engine.delete(collection_name, {"id": str(node_id)}) + except Exception as e: + logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 
ff263c0132b170b3c03961606db56c2a174d2b90 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 18:40:58 +0530 Subject: [PATCH 11/37] fix: add column check in migration --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index 0ccefa63b..267e11fb2 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -17,14 +17,30 @@ down_revision: Union[str, None] = '211ab850ef3d' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + def upgrade() -> None: - op.add_column('data', - sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) - ) - # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = created_at") + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if not last_accessed_column: + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + # Optionally initialize with created_at values for existing records + op.execute("UPDATE data SET last_accessed = created_at") def downgrade() -> None: - op.drop_column('data', 'last_accessed') + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if last_accessed_column: + op.drop_column('data', 'last_accessed') From c5f0c4af87ff13bf8e3cbe0f4e9163ece44c3094 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 5 Nov 2025 20:22:17 +0530 Subject: [PATCH 12/37] fix: add text_doc flag --- 
cognee/modules/retrieval/utils/access_tracking.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 36c0b7f50..65d597a93 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -67,12 +67,9 @@ async def update_node_access_timestamps(items: List[Any]): # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) origin_query = """ UNWIND $node_ids AS node_id - MATCH (n:Node {id: node_id}) - OPTIONAL MATCH (n)-[e:EDGE]-(chunk:Node) - WHERE chunk.type = 'DocumentChunk' - OPTIONAL MATCH (chunk)-[e2:EDGE]-(doc:Node) - WHERE doc.type IN ['TextDocument', 'PdfDocument', 'AudioDocument', 'ImageDocument', 'UnstructuredDocument'] - RETURN DISTINCT doc.id as doc_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id """ result = await graph_engine.query(origin_query, {"node_ids": node_ids}) From fdf037b3d0117bd29f0c541ed027895c070678df Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Thu, 6 Nov 2025 23:00:56 +0530 Subject: [PATCH 13/37] fix: min to days --- cognee/tasks/cleanup/cleanup_unused_data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index c9c711fe2..4df622a2c 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -23,7 +23,7 @@ logger = get_logger(__name__) async def cleanup_unused_data( - minutes_threshold: int = 30, + days_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None, text_doc: bool = False @@ -33,8 +33,8 @@ async def cleanup_unused_data( Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused 
(default: 30) + days_threshold : int + days since last access to consider data unused dry_run : bool If True, only report what would be deleted without actually deleting (default: True) user_id : UUID, optional @@ -50,14 +50,14 @@ async def cleanup_unused_data( """ logger.info( "Starting cleanup task", - minutes_threshold=minutes_threshold, + days_threshold=days_threshold, dry_run=dry_run, user_id=str(user_id) if user_id else None, text_doc=text_doc ) # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold) if text_doc: # SQL-based approach: Find unused TextDocuments and use cognee.delete() From 84c8e07ddd980af7c11b89c7e510b38e5c44f119 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 7 Nov 2025 12:03:17 +0530 Subject: [PATCH 14/37] fix: remove uneccessary imports --- cognee/modules/chunking/models/DocumentChunk.py | 2 -- cognee/modules/engine/models/Entity.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index a9fb08a9e..e2b216a9b 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,7 +1,5 @@ from typing import List, Union -from pydantic import BaseModel, Field -from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine.models.Edge import Edge from cognee.modules.data.processing.document_types import Document diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 4083cd2e6..a34a6503c 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,8 +1,6 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.engine.models.EntityType import EntityType from typing import Optional -from datetime import 
datetime, timezone -from pydantic import BaseModel, Field class Entity(DataPoint): name: str From 84bd2f38f7513c244ed1040937a1e5a5297cec2e Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Fri, 7 Nov 2025 12:12:46 +0530 Subject: [PATCH 15/37] fix: remove uneccessary imports --- cognee/tasks/summarization/models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 8cee2ade3..8420cfaa5 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,7 +1,5 @@ -from pydantic import BaseModel, Field from typing import Union -from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import DocumentChunk from cognee.shared.CodeGraphEntities import CodeFile, CodePart From d351c9a009d12a8a8a4869afa7aee38c61482e21 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 10 Nov 2025 21:58:01 +0530 Subject: [PATCH 16/37] fix: return chunk payload --- cognee/modules/retrieval/chunks_retriever.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/modules/retrieval/chunks_retriever.py b/cognee/modules/retrieval/chunks_retriever.py index be1f95811..b7a90238a 100644 --- a/cognee/modules/retrieval/chunks_retriever.py +++ b/cognee/modules/retrieval/chunks_retriever.py @@ -57,6 +57,7 @@ class ChunksRetriever(BaseRetriever): chunk_payloads = [result.payload for result in found_chunks] logger.info(f"Returning {len(chunk_payloads)} chunk payloads") + return chunk_payloads async def get_completion( self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None From ac3300760b7a521aebe452d041bb7ceaa35f8052 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 12 Nov 2025 12:26:28 +0100 Subject: [PATCH 17/37] test: add search tests docs --- cognee/tests/docs/guides/search_basics.py | 46 +++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git 
a/cognee/tests/docs/guides/search_basics.py b/cognee/tests/docs/guides/search_basics.py index 67d0c938d..09dee3f92 100644 --- a/cognee/tests/docs/guides/search_basics.py +++ b/cognee/tests/docs/guides/search_basics.py @@ -1,17 +1,57 @@ import asyncio import cognee +from cognee.modules.search.types import SearchType, CombinedSearchResult + async def main(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - text = "First rule of coding: Do not talk about coding." + text = """ + Natural language processing (NLP) is an interdisciplinary + subfield of computer science and information retrieval. + First rule of coding: Do not talk about coding. + """ + + text2 = """ + Sandwiches are best served toasted with cheese, ham, mayo, + lettuce, mustard, and salt & pepper. + """ + + await cognee.add(text, dataset_name="NLP_coding") + await cognee.add(text2, dataset_name="Sandwiches") + await cognee.add(text2) + + await cognee.cognify() # Make sure you've already run cognee.cognify(...) 
so the graph has content answers = await cognee.search(query_text="What are the main themes in my data?") - for answer in answers: - print(answer) + assert len(answers) > 0 + answers = await cognee.search( + query_text="List coding guidelines", + query_type=SearchType.CODING_RULES, + ) + assert len(answers) > 0 + + answers = await cognee.search( + query_text="Give me a confident answer: What is NLP?", + system_prompt="Answer succinctly and state confidence at the end.", + ) + assert len(answers) > 0 + + answers = await cognee.search( + query_text="Tell me about NLP", + only_context=True, + ) + assert len(answers) > 0 + + answers = await cognee.search( + query_text="Quarterly financial highlights", + datasets=["NLP_coding", "Sandwiches"], + use_combined_context=True, + ) + assert isinstance(answers, CombinedSearchResult) asyncio.run(main()) From 503bdc34f38e18e2ec3dccb6e47aaff669702f55 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 12 Nov 2025 13:20:23 +0100 Subject: [PATCH 18/37] test: add tests to workflows --- .github/workflows/docs_tests.yml | 276 ++++++++++++++++++++++++++++- .github/workflows/release_test.yml | 5 + 2 files changed, 274 insertions(+), 7 deletions(-) diff --git a/.github/workflows/docs_tests.yml b/.github/workflows/docs_tests.yml index b3c538668..7f7282bb2 100644 --- a/.github/workflows/docs_tests.yml +++ b/.github/workflows/docs_tests.yml @@ -1,18 +1,280 @@ -name: Docs Test Suite +name: Docs Tests + permissions: contents: read on: - release: workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + workflow_call: + secrets: + LLM_PROVIDER: + required: true + LLM_MODEL: + required: true + LLM_ENDPOINT: + required: true + LLM_API_KEY: + required: true + LLM_API_VERSION: + required: true + EMBEDDING_PROVIDER: + required: true + EMBEDDING_MODEL: + required: true + EMBEDDING_ENDPOINT: + required: true + EMBEDDING_API_KEY: + required: true + EMBEDDING_API_VERSION: + required: true 
env: - RUNTIME__LOG_LEVEL: ERROR ENV: 'dev' jobs: + test-search-basics: + name: Test Search Basics + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Search Basics Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/search_basics.py + + test-temporal-cognify: + name: Test Temporal Cognify + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Temporal Cognify Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/temporal_cognify.py + + test-ontology-quickstart: + name: Test Temporal Cognify + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Temporal Cognify Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ 
secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/temporal_cognify.py + + test-s3-storage: + name: Test S3 Docs Guide + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + extra-dependencies: "aws" + + - name: Run S3 Docs Guide Test + env: + ENABLE_BACKEND_ACCESS_CONTROL: True + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + STORAGE_BACKEND: s3 + AWS_REGION: eu-west-1 + AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} + run: uv run python ./cognee/tests/docs/guides/s3_storage.py + + test-graph-visualization: + name: Test Graph Visualization + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Graph Visualization Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + 
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/graph_visualization.py + + test-low-level-llm: + name: Test Low Level LLM + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Low Level LLM Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/low_level_llm.py + + test-memify-quickstart: + name: Test Memify Quickstart + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Memify Quickstart Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/memify_quickstart.py + + test-custom-data-models: + name: Test Custom Data Models + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Custom Data Models Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ 
secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/custom_data_models.py + + test-custom-tasks-and-pipelines: + name: Test Custom Tasks and Pipelines + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Custom Tasks and Pipelines Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/custom_tasks_and_pipelines.py + + test-custom-prompts: + name: Test Custom Prompts + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Custom Prompts Test + env: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/docs/guides/custom_prompts.py \ No newline at end of file diff --git 
a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 6ac3ca515..89540fcfb 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -14,4 +14,9 @@ jobs: load-tests: name: Load Tests uses: ./.github/workflows/load_tests.yml + secrets: inherit + + docs-tests: + name: Docs Tests + uses: ./.github/workflows/docs_tests.yml secrets: inherit \ No newline at end of file From 1e56d6dc389e1f33c08a7ee897689a941a7b8a9f Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 12 Nov 2025 13:42:53 +0100 Subject: [PATCH 19/37] chore: ruff format --- cognee/tests/docs/guides/search_basics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tests/docs/guides/search_basics.py b/cognee/tests/docs/guides/search_basics.py index 09dee3f92..f1847ad4b 100644 --- a/cognee/tests/docs/guides/search_basics.py +++ b/cognee/tests/docs/guides/search_basics.py @@ -54,4 +54,5 @@ async def main(): ) assert isinstance(answers, CombinedSearchResult) + asyncio.run(main()) From 7bd7079aac9fcb003bcc20e118bc65d066e9029c Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 18 Nov 2025 22:17:23 +0530 Subject: [PATCH 20/37] fix: vecto_engine.delte_data_points --- cognee/tasks/cleanup/cleanup_unused_data.py | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 4df622a2c..fd4b68204 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -315,22 +315,21 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: "TextSummary": "TextSummary_text" } - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - # Delete from vector collection - if await 
vector_engine.has_collection(collection_name): - for node_id in node_ids: - try: - await vector_engine.delete(collection_name, {"id": str(node_id)}) - except Exception as e: - logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}") - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + + try: + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") return deleted_counts From 5fac3b40b94e4c81a7d9828ca9d2d84ab5e82bc1 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 18 Nov 2025 22:26:59 +0530 Subject: [PATCH 21/37] fix: test file for cleanup unused data --- cognee/tests/test_cleanup_unused_data.py | 244 +++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 cognee/tests/test_cleanup_unused_data.py diff --git a/cognee/tests/test_cleanup_unused_data.py b/cognee/tests/test_cleanup_unused_data.py new file mode 100644 index 000000000..c21b9f5ea --- /dev/null +++ b/cognee/tests/test_cleanup_unused_data.py @@ -0,0 +1,244 @@ +import os +import pathlib +import cognee +from datetime import datetime, timezone, timedelta +from uuid import UUID +from sqlalchemy import select, update +from cognee.modules.data.models import Data, DatasetData +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType + +logger = get_logger() + + +async def test_textdocument_cleanup_with_sql(): + """ + 
End-to-end test for TextDocument cleanup based on last_accessed timestamps. + + Tests: + 1. Add and cognify a document + 2. Perform search to populate last_accessed timestamp + 3. Verify last_accessed is set in SQL Data table + 4. Manually age the timestamp beyond cleanup threshold + 5. Run cleanup with text_doc=True + 6. Verify document was deleted from all databases (relational, graph, and vector) + """ + # Setup test directories + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") + ).resolve() + ) + + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) + + # Initialize database + from cognee.modules.engine.operations.setup import setup + + # Clean slate + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") + + # Step 1: Add and cognify a test document + dataset_name = "test_cleanup_dataset" + test_text = """ + Machine learning is a subset of artificial intelligence that enables systems to learn + and improve from experience without being explicitly programmed. Deep learning uses + neural networks with multiple layers to process data. + """ + + await setup() + user = await get_default_user() + await cognee.add([test_text], dataset_name=dataset_name, user=user) + + cognify_result = await cognee.cognify([dataset_name], user=user) + + # Extract dataset_id from cognify result (ds_id is already a UUID) + dataset_id = None + for ds_id, pipeline_result in cognify_result.items(): + dataset_id = ds_id # Don't wrap in UUID() - it's already a UUID object + break + + assert dataset_id is not None, "Failed to get dataset_id from cognify result" + logger.info(f"โœ… Document added and cognified. 
Dataset ID: {dataset_id}") + + # Step 2: Perform search to trigger last_accessed update + logger.info("Triggering search to update last_accessed...") + search_results = await cognee.search( + query_type=SearchType.CHUNKS, + query_text="machine learning", + datasets=[dataset_name], + user=user + ) + logger.info(f"โœ… Search completed, found {len(search_results)} results") + + # Step 3: Verify last_accessed was set in SQL Data table + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Get the Data record for this dataset + result = await session.execute( + select(Data, DatasetData) + .join(DatasetData, Data.id == DatasetData.data_id) + .where(DatasetData.dataset_id == dataset_id) + ) + data_records = result.all() + assert len(data_records) > 0, "No Data records found for the dataset" + data_record = data_records[0][0] + data_id = data_record.id + + # Verify last_accessed is set (should be set by search operation) + assert data_record.last_accessed is not None, ( + "last_accessed should be set after search operation" + ) + + original_last_accessed = data_record.last_accessed + logger.info(f"โœ… last_accessed verified: {original_last_accessed}") + + # Step 4: Manually age the timestamp to be older than cleanup threshold + days_threshold = 30 + aged_timestamp = datetime.now(timezone.utc) - timedelta(days=days_threshold + 10) + + async with db_engine.get_async_session() as session: + stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) + await session.execute(stmt) + await session.commit() + + # Query in a NEW session to avoid cached values + async with db_engine.get_async_session() as session: + result = await session.execute(select(Data).where(Data.id == data_id)) + updated_data = result.scalar_one_or_none() + + # Make both timezone-aware for comparison + retrieved_timestamp = updated_data.last_accessed + if retrieved_timestamp.tzinfo is None: + # If database returned naive datetime, make it 
UTC-aware + retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + + assert retrieved_timestamp == aged_timestamp, ( + f"Timestamp should be updated to aged value. " + f"Expected: {aged_timestamp}, Got: {retrieved_timestamp}" + ) + + # Step 5: Test cleanup with text_doc=True + from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data + + # First do a dry run + logger.info("Testing dry run with text_doc=True...") + dry_run_result = await cleanup_unused_data( + days_threshold=30, + dry_run=True, + user_id=user.id, + text_doc=True + ) + + assert dry_run_result['status'] == 'dry_run', "Status should be 'dry_run'" + assert dry_run_result['unused_count'] > 0, ( + "Should find at least one unused document" + ) + logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") + + # Now run actual cleanup + logger.info("Executing cleanup with text_doc=True...") + cleanup_result = await cleanup_unused_data( + days_threshold=30, + dry_run=False, + user_id=user.id, + text_doc=True + ) + + assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" + assert cleanup_result["deleted_count"]["documents"] > 0, ( + "At least one document should be deleted" + ) + logger.info(f"โœ… Cleanup completed. 
Deleted {cleanup_result['deleted_count']['documents']} documents") + + # Step 6: Verify the document was actually deleted from SQL + async with db_engine.get_async_session() as session: + deleted_data = ( + await session.execute(select(Data).where(Data.id == data_id)) + ).scalar_one_or_none() + + assert deleted_data is None, ( + "Data record should be deleted after cleanup" + ) + logger.info("โœ… Confirmed: Data record was deleted from SQL database") + + # Verify the dataset-data link was also removed + async with db_engine.get_async_session() as session: + dataset_data_link = ( + await session.execute( + select(DatasetData).where( + DatasetData.data_id == data_id, + DatasetData.dataset_id == dataset_id + ) + ) + ).scalar_one_or_none() + + assert dataset_data_link is None, ( + "DatasetData link should be deleted after cleanup" + ) + logger.info("โœ… Confirmed: DatasetData link was deleted") + + # Verify graph nodes were cleaned up + from cognee.infrastructure.databases.graph import get_graph_engine + + graph_engine = await get_graph_engine() + + # Try to find the TextDocument node - it should not exist + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n", + {"id": str(data_id)} + ) + + assert len(result) == 0, ( + "TextDocument node should be deleted from graph database" + ) + logger.info("โœ… Confirmed: TextDocument node was deleted from graph database") + + # Verify vector database was cleaned up + from cognee.infrastructure.databases.vector import get_vector_engine + + vector_engine = get_vector_engine() + + # Check each collection that should have been cleaned up + vector_collections = [ + "DocumentChunk_text", + "Entity_name", + "TextSummary_text" + ] + + for collection_name in vector_collections: + if await vector_engine.has_collection(collection_name): + # Try to retrieve the deleted data points + try: + results = await vector_engine.retrieve(collection_name, [str(data_id)]) + assert len(results) == 0, ( + f"Data points should be 
deleted from {collection_name} collection" + ) + logger.info(f"โœ… Confirmed: {collection_name} collection is clean") + except Exception as e: + # Collection might be empty or not exist, which is fine + logger.info(f"โœ… Confirmed: {collection_name} collection is empty or doesn't exist") + pass + + logger.info("โœ… Confirmed: Vector database entries were deleted") + + logger.info("๐ŸŽ‰ All cleanup tests passed!") + + return True + + +if __name__ == "__main__": + import asyncio + success = asyncio.run(test_textdocument_cleanup_with_sql()) + exit(0 if success else 1) From 43290af1b23d24d6ab8b5d57c243abe1cee8787e Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 19 Nov 2025 21:00:16 +0530 Subject: [PATCH 22/37] fix: set last_acessed to current timestamp --- alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index 267e11fb2..a16c99e9f 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -34,7 +34,7 @@ def upgrade() -> None: sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) ) # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = created_at") + op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") def downgrade() -> None: From b52c1a1e25e6edffe112462836ab315b36bec567 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 24 Nov 2025 12:50:39 +0530 Subject: [PATCH 23/37] fix: flag to enable and disable last_accessed --- .../e1ec1dcb50b6_add_last_accessed_to_data.py | 88 ++++++++++--------- .../retrieval/utils/access_tracking.py | 7 +- cognee/tasks/cleanup/cleanup_unused_data.py | 40 ++++++++- 3 files changed, 90 insertions(+), 45 deletions(-) diff --git 
a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py index a16c99e9f..f1a36ae59 100644 --- a/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +++ b/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py @@ -1,46 +1,52 @@ -"""add_last_accessed_to_data - -Revision ID: e1ec1dcb50b6 -Revises: 211ab850ef3d -Create Date: 2025-11-04 21:45:52.642322 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = 'e1ec1dcb50b6' -down_revision: Union[str, None] = '211ab850ef3d' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - -def _get_column(inspector, table, name, schema=None): - for col in inspector.get_columns(table, schema=schema): - if col["name"] == name: - return col - return None +"""add_last_accessed_to_data + +Revision ID: e1ec1dcb50b6 +Revises: 211ab850ef3d +Create Date: 2025-11-04 21:45:52.642322 + +""" +import os +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa -def upgrade() -> None: - conn = op.get_bind() - insp = sa.inspect(conn) - - last_accessed_column = _get_column(insp, "data", "last_accessed") - if not last_accessed_column: - op.add_column('data', - sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) - ) - # Optionally initialize with created_at values for existing records - op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") +# revision identifiers, used by Alembic. 
+revision: str = 'e1ec1dcb50b6' +down_revision: Union[str, None] = '211ab850ef3d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None -def downgrade() -> None: - conn = op.get_bind() - insp = sa.inspect(conn) - - last_accessed_column = _get_column(insp, "data", "last_accessed") - if last_accessed_column: +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if not last_accessed_column: + # Always create the column for schema consistency + op.add_column('data', + sa.Column('last_accessed', sa.DateTime(timezone=True), nullable=True) + ) + + # Only initialize existing records if feature is enabled + enable_last_accessed = os.getenv("ENABLE_LAST_ACCESSED", "false").lower() == "true" + if enable_last_accessed: + op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP") + + +def downgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + last_accessed_column = _get_column(insp, "data", "last_accessed") + if last_accessed_column: op.drop_column('data', 'last_accessed') diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 65d597a93..6df0284ec 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID - +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -27,7 +27,10 @@ async def update_node_access_timestamps(items: List[Any]): ---------- items : 
List[Any] List of items with payload containing 'id' field (from vector search results) - """ + """ + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + return + if not items: return diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index fd4b68204..175452a0a 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -10,7 +10,7 @@ import json from datetime import datetime, timezone, timedelta from typing import Optional, Dict, Any from uuid import UUID - +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.relational import get_relational_engine @@ -47,7 +47,43 @@ async def cleanup_unused_data( ------- Dict[str, Any] Cleanup results with status, counts, and timestamp - """ + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." + ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." 
+ ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + logger.info( "Starting cleanup task", days_threshold=days_threshold, From 5cb6510205742e7a5abf2afe23d2527b229931d0 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Mon, 24 Nov 2025 13:12:46 +0530 Subject: [PATCH 24/37] fix: import --- cognee/tasks/cleanup/cleanup_unused_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 175452a0a..a90d96b5c 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -18,6 +18,7 @@ from cognee.modules.data.models import Data, DatasetData from cognee.shared.logging_utils import get_logger from sqlalchemy import select, or_ import cognee +import sqlalchemy as sa logger = get_logger(__name__) From 12ce80005ceccafac38a63da458e6df376776b61 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 26 Nov 2025 17:32:50 +0530 Subject: [PATCH 25/37] fix: generalized queries --- .../retrieval/utils/access_tracking.py | 147 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 778 ++++++++++-------- 2 files changed, 516 insertions(+), 409 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 6df0284ec..12a66f8bc 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -13,24 +13,10 @@ from sqlalchemy import update logger = get_logger(__name__) - async def update_node_access_timestamps(items: List[Any]): - """ - Update last_accessed_at for nodes in graph database and corresponding Data records in SQL. - - This function: - 1. Updates last_accessed_at in the graph database nodes (in properties JSON) - 2. 
Traverses to find origin TextDocument nodes (without hardcoded relationship names) - 3. Updates last_accessed in the SQL Data table for those documents - - Parameters - ---------- - items : List[Any] - List of items with payload containing 'id' field (from vector search results) - """ if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return @@ -49,50 +35,95 @@ async def update_node_access_timestamps(items: List[Any]): return try: - # Step 1: Batch update graph nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) + # Detect database provider and use appropriate queries + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) + elif provider == "neo4j": + await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) + elif provider == "neptune": + await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) + else: + logger.warning(f"Unsupported graph provider: {provider}") + return - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - - logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes") - - # Step 2: Find origin TextDocument nodes (without hardcoded relationship names) - origin_query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - - result = await graph_engine.query(origin_query, {"node_ids": node_ids}) - - # Extract and deduplicate document IDs - doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else [] - - # Step 3: 
Update SQL Data table + # Find origin documents and update SQL + doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) if doc_ids: - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - stmt = update(Data).where( - Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) - ).values(last_accessed=timestamp_dt) - - await session.execute(stmt) - await session.commit() - - logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL") - + await _update_sql_records(doc_ids, timestamp_dt) + except Exception as e: logger.error(f"Failed to update timestamps: {e}") - raise + raise + +async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): + """Kuzu-specific node updates""" + for node_id in node_ids: + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + +async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): + """Neo4j-specific node updates""" + for node_id in node_ids: + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): + """Neptune-specific node updates""" + for node_id in node_ids: + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents(graph_engine, node_ids, provider): + """Find origin documents with provider-specific queries""" + if provider == "kuzu": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 
'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + elif provider == "neo4j": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + elif provider == "neptune": + query = """ + UNWIND $node_ids AS node_id + MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) + WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] + RETURN DISTINCT doc.id + """ + + result = await graph_engine.query(query, {"node_ids": node_ids}) + return list(set([row[0] for row in result if row and row[0]])) if result else [] + +async def _update_sql_records(doc_ids, timestamp_dt): + """Update SQL Data table (same for all providers)""" + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + stmt = update(Data).where( + Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) + ).values(last_accessed=timestamp_dt) + + await session.execute(stmt) + await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index a90d96b5c..b89c939a8 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,372 +1,448 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. 
-""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries)) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. 
+
+    Parameters
+    ----------
+    minutes_threshold : int
+        Minutes since last access to consider data unused
+    dry_run : bool
+        If True, only report what would be deleted without actually deleting (default: True)
+    user_id : UUID, optional
+        Limit cleanup to specific user's data (default: None)
+    text_doc : bool
+        If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete()
+        for proper whole-document deletion (default: False)
+
+    Returns
+    -------
+    Dict[str, Any]
+        Cleanup results with status, counts, and timestamp
+    """
+    # Check 1: Environment variable must be enabled
+    if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true":
+        logger.warning(
+            "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled."
+        )
+        return {
+            "status": "skipped",
+            "reason": "ENABLE_LAST_ACCESSED not enabled",
+            "unused_count": 0,
+            "deleted_count": {},
+            "cleanup_date": datetime.now(timezone.utc).isoformat()
+        }
+
+    # Check 2: Verify tracking has actually been running
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        # Count records with non-NULL last_accessed
+        tracked_count = await session.execute(
+            select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None))
+        )
+        tracked_records = tracked_count.scalar()
+
+    if tracked_records == 0:
+        logger.warning(
+            "Cleanup skipped: No records have been tracked yet. "
+            "ENABLE_LAST_ACCESSED may have been recently enabled. "
+            "Wait for retrievers to update timestamps before running cleanup." 
+ ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes directly from graph + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Detect database provider and find unused nodes + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return { + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes with provider-specific logic + deleted_counts = await _delete_unused_nodes(unused_nodes, provider) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 
deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - days_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). Parameters ---------- - days_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) - user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) - - Returns - ------- - Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
- ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." - ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - days_threshold=days_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) - - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") - - # Find unused nodes - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, 
- "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) - - return { - "status": "completed", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
- - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) - ) - ) - - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) - - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: - try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") - except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": 
deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None -) -> Dict[str, list]: - """ - Query Kuzu for nodes with old last_accessed_at timestamps. - - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + If True, only report what would be deleted user_id : UUID, optional Filter by user ID if provided Returns ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs + Dict[str, Any] + Cleanup results """ - graph_engine = await get_graph_engine() + db_engine = get_relational_engine() - # Query all nodes with their properties - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = { - "DocumentChunk": [], - "Entity": [], - "TextSummary": [] - } - - for node_id, node_type, props_json in results: - # Only process tracked node types - if node_type not in unused_nodes: - continue - - # Parse properties JSON - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - # Check if node is unused (never accessed or accessed before cutoff) - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) ) - if result and len(result) > 0: - deleted_counts["associations"] += result[0][0] - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue + ) - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) + result = await session.execute(query) + unused_data = result.all() - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: + return { + "status": "dry_run", + "unused_count": len(unused_data), + 
"deleted_count": { + "data_items": 0, + "documents": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - + + +async def _find_unused_nodes( + cutoff_timestamp_ms: int, + user_id: Optional[UUID] = None, + provider: str = "kuzu" +) -> Dict[str, list]: + """ + Find unused nodes with provider-specific queries. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + user_id : UUID, optional + Filter by user ID if provided + provider : str + Graph database provider (kuzu, neo4j, neptune) + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + if provider == "kuzu": + return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) + elif provider == "neo4j": + return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) + elif provider == "neptune": + return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) + else: + logger.warning(f"Unsupported graph provider: {provider}") + return {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + +async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): + """Kuzu-specific unused node detection""" + query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for node_id, node_type, props_json in results: + if node_type not in unused_nodes: + continue + + if props_json: + try: + props = json.loads(props_json) + last_accessed = props.get("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties for node {node_id}") + continue + + return unused_nodes + + +async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): + """Neo4j-specific unused node detection""" + query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for row in results: 
+ node_id = row["n"]["id"] + node_type = row["n"]["type"] + last_accessed = row["n"].get("last_accessed_at") + + if node_type not in unused_nodes: + continue + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + + return unused_nodes + + +async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): + """Neptune-specific unused node detection""" + query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" + results = await graph_engine.query(query) + + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + for row in results: + node_id = row["n"]["id"] + node_type = row["n"]["type"] + last_accessed = row["n"].get("last_accessed_at") + + if node_type not in unused_nodes: + continue + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node_id) + logger.debug( + f"Found unused {node_type}", + node_id=node_id, + last_accessed=last_accessed + ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + provider : str + Graph database provider (kuzu, neo4j, neptune) + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + if provider == "kuzu": + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + elif provider == "neo4j": + result = await graph_engine.query( + "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + elif provider == "neptune": + result = await graph_engine.query( + "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", + {"id": node_id} + ) + + if result and len(result) > 0: + count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] + deleted_counts["associations"] += count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) + + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") 
+ + try: + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) + except Exception as e: + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 6a4d31356bb613e5cf74e7972445f804796ee6d4 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 18:55:47 +0530 Subject: [PATCH 26/37] fix: using graph projection instead of conditions --- .../retrieval/utils/access_tracking.py | 156 ++-- cognee/tasks/cleanup/cleanup_unused_data.py | 759 ++++++++---------- 2 files changed, 418 insertions(+), 497 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 12a66f8bc..935c47157 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,118 +4,116 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data from cognee.shared.logging_utils import get_logger from sqlalchemy import update +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Detect database 
provider and use appropriate queries - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - if provider == "kuzu": - await _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neo4j": - await _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms) - elif provider == "neptune": - await _update_neptune_nodes(graph_engine, node_ids, timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return + try: + # Update nodes using graph projection ( database-agnostic approach + await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) # Find origin documents and update SQL - doc_ids = await _find_origin_documents(graph_engine, node_ids, provider) + doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - + except Exception as e: logger.error(f"Failed to update timestamps: {e}") raise -async def _update_kuzu_nodes(graph_engine, node_ids, timestamp_ms): - """Kuzu-specific node updates""" - for node_id in node_ids: - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - -async def _update_neo4j_nodes(graph_engine, node_ids, timestamp_ms): - """Neo4j-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - -async def _update_neptune_nodes(graph_engine, node_ids, timestamp_ms): - """Neptune-specific node updates""" - for node_id in node_ids: - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, 
"timestamp": timestamp_ms} - ) - -async def _find_origin_documents(graph_engine, node_ids, provider): - """Find origin documents with provider-specific queries""" - if provider == "kuzu": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neo4j": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:__Node__ {id: node_id})-[e:EDGE]-(doc:__Node__) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ - elif provider == "neptune": - query = """ - UNWIND $node_ids AS node_id - MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node) - WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document'] - RETURN DISTINCT doc.id - """ +async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): + """Update nodes using graph projection - works with any graph database""" + # Project the graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] + ) - result = await graph_engine.query(query, {"node_ids": node_ids}) - return list(set([row[0] for row in result if row and row[0]])) if result else [] + # Update each node's last_accessed_at property + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Update the node in the database + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} + ) + + if result and result[0]: + props = json.loads(result[0][0]) if result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: 
$id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + +async def _find_origin_documents_via_projection(graph_engine, node_ids): + """Find origin documents using graph projection instead of DB queries""" + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type"], + edge_properties_to_project=["relationship_name"] + ) + + # Find origin documents by traversing the in-memory graph + doc_ids = set() + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node and node.get_attribute("type") == "DocumentChunk": + # Traverse edges to find connected documents + for edge in node.get_skeleton_edges(): + # Get the neighbor node + neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else edge.get_source_node() + if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: + doc_ids.add(neighbor.id) + + return list(doc_ids) async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" @@ -124,6 +122,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index b89c939a8..c70b97a00 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,448 
+1,371 @@ -""" -Task for automatically deleting unused data from the memify pipeline. - -This task identifies and removes data (chunks, entities, summaries)) that hasn't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. -""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa - -logger = get_logger(__name__) +""" +Task for automatically deleting unused data from the memify pipeline. + +This task identifies and removes data (chunks, entities, summaries)) that hasn't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization. 
+""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None, + text_doc: bool = False +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + days since last access to consider data unused + dry_run : bool + If True, only report what would be delete without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + text_doc : bool + If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion (default: False) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
+ ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None, + text_doc=text_doc + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) + + if text_doc: + # SQL-based approach: Find unused TextDocuments and use cognee.delete() + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) + else: + # Graph-based approach: Find unused nodes using projection (database-agnostic) + cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) + logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + + # Find unused nodes using graph projection + unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) + + total_unused = sum(len(nodes) for nodes in unused_nodes.values()) + logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) + + if dry_run: + return 
{ + "status": "dry_run", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": 0, + "entities": 0, + "summaries": 0, + "associations": 0 + }, + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "chunks": len(unused_nodes["DocumentChunk"]), + "entities": len(unused_nodes["Entity"]), + "summaries": len(unused_nodes["TextSummary"]) + } + } + + # Delete unused nodes (provider-agnostic deletion) + deleted_counts = await _delete_unused_nodes(unused_nodes) + + logger.info("Cleanup completed", deleted_counts=deleted_counts) + + return { + "status": "completed", + "unused_count": total_unused, + "deleted_count": { + "data_items": 0, + "chunks": deleted_counts["DocumentChunk"], + "entities": deleted_counts["Entity"], + "summaries": deleted_counts["TextSummary"], + "associations": deleted_counts["associations"] + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = False +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
Parameters ---------- - minutes_threshold : int - days since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." - ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." 
- ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes directly from graph - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) + ) + ) - # Detect database provider and find unused nodes - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id, provider) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": 
len(unused_nodes["TextSummary"]) - } - } - - # Delete unused nodes with provider-specific logic - deleted_counts = await _delete_unused_nodes(unused_nodes, provider) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
+ "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } + + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None + + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") + + logger.info("Cleanup completed", deleted_count=deleted_count) + + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() +async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: + """ + Find unused nodes using graph projection - database-agnostic approach. 
+ + Parameters + ---------- + cutoff_timestamp_ms : int + Cutoff timestamp in milliseconds since epoch + + Returns + ------- + Dict[str, list] + Dictionary mapping node types to lists of unused node IDs + """ + graph_engine = await get_graph_engine() + + # Project the entire graph with necessary properties + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id", "type", "last_accessed_at"], + edge_properties_to_project=[] + ) - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) + unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} + + # Get all nodes from the projected graph + all_nodes = memory_fragment.get_nodes() + + for node in all_nodes: + node_type = node.get_attribute("type") + if node_type not in unused_nodes: + continue + + # Check last_accessed_at property + last_accessed = node.get_attribute("last_accessed_at") + + if last_accessed is None or last_accessed < cutoff_timestamp_ms: + unused_nodes[node_type].append(node.id) + logger.debug( + f"Found unused {node_type}", + node_id=node.id, + last_accessed=last_accessed ) + + return unused_nodes + + +async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: + """ + Delete unused nodes from graph and vector databases. 
+ + Parameters + ---------- + unused_nodes : Dict[str, list] + Dictionary mapping node types to lists of node IDs to delete + + Returns + ------- + Dict[str, int] + Count of deleted items by type + """ + graph_engine = await get_graph_engine() + vector_engine = get_vector_engine() + + deleted_counts = { + "DocumentChunk": 0, + "Entity": 0, + "TextSummary": 0, + "associations": 0 + } + + # Count associations before deletion (using graph projection for consistency) + if any(unused_nodes.values()): + memory_fragment = CogneeGraph() + await memory_fragment.project_graph_from_db( + graph_engine, + node_properties_to_project=["id"], + edge_properties_to_project=[] ) - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + # Count edges connected to these nodes + for node_id in node_ids: + node = memory_fragment.get_node(node_id) + if node: + # Count edges from the in-memory graph + edge_count = len(node.get_skeleton_edges()) + deleted_counts["associations"] += edge_count + + # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) + for node_type, node_ids in unused_nodes.items(): + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") + + # Delete nodes in batches (database-agnostic) + await graph_engine.delete_nodes(node_ids) + deleted_counts[node_type] = len(node_ids) - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document 
using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: + # Delete from vector database + vector_collections = { + "DocumentChunk": "DocumentChunk_text", + "Entity": "Entity_name", + "TextSummary": "TextSummary_text" + } + + + for node_type, collection_name in vector_collections.items(): + node_ids = unused_nodes[node_type] + if not node_ids: + continue + + logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") + try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + if await vector_engine.has_collection(collection_name): + await vector_engine.delete_data_points( + collection_name, + [str(node_id) for node_id in node_ids] + ) except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes( - cutoff_timestamp_ms: int, - user_id: Optional[UUID] = None, - provider: str = "kuzu" -) -> Dict[str, list]: - """ - Find unused nodes with provider-specific queries. 
- - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch - user_id : UUID, optional - Filter by user ID if provided - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() - - if provider == "kuzu": - return await _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms) - elif provider == "neo4j": - return await _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms) - elif provider == "neptune": - return await _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms) - else: - logger.warning(f"Unsupported graph provider: {provider}") - return {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - -async def _find_unused_nodes_kuzu(graph_engine, cutoff_timestamp_ms): - """Kuzu-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.properties" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for node_id, node_type, props_json in results: - if node_type not in unused_nodes: - continue - - if props_json: - try: - props = json.loads(props_json) - last_accessed = props.get("last_accessed_at") - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - except json.JSONDecodeError: - logger.warning(f"Failed to parse properties for node {node_id}") - continue - - return unused_nodes - - -async def _find_unused_nodes_neo4j(graph_engine, cutoff_timestamp_ms): - """Neo4j-specific unused node detection""" - query = "MATCH (n:__Node__) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for row in results: 
- node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _find_unused_nodes_neptune(graph_engine, cutoff_timestamp_ms): - """Neptune-specific unused node detection""" - query = "MATCH (n:Node) RETURN n.id, n.type, n.last_accessed_at" - results = await graph_engine.query(query) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - for row in results: - node_id = row["n"]["id"] - node_type = row["n"]["type"] - last_accessed = row["n"].get("last_accessed_at") - - if node_type not in unused_nodes: - continue - - if last_accessed is None or last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node_id) - logger.debug( - f"Found unused {node_type}", - node_id=node_id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list], provider: str) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. 
- - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - provider : str - Graph database provider (kuzu, neo4j, neptune) - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges connected to these nodes - for node_id in node_ids: - if provider == "kuzu": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neo4j": - result = await graph_engine.query( - "MATCH (n:__Node__ {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - elif provider == "neptune": - result = await graph_engine.query( - "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)", - {"id": node_id} - ) - - if result and len(result) > 0: - count = result[0][0] if provider == "kuzu" else result[0]["count_count(r)"] - deleted_counts["associations"] += count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - } - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") 
- - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - + logger.error(f"Error deleting from vector collection {collection_name}: {e}") + return deleted_counts From 5f00abf3e4f3b913ae67391d487104ea3b9ae872 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Tue, 2 Dec 2025 22:25:03 +0530 Subject: [PATCH 27/37] fix: fallback and document deletion --- .../retrieval/utils/access_tracking.py | 73 +++++++++++-------- cognee/tasks/cleanup/cleanup_unused_data.py | 41 +++++++---- 2 files changed, 68 insertions(+), 46 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index 935c47157..c7b06ee17 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -36,16 +36,22 @@ async def update_node_access_timestamps(items: List[Any]): return try: - # Update nodes using graph projection ( database-agnostic approach + # Try to update nodes in graph database (may fail for unsupported DBs) await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) + except Exception as e: + logger.warning( + f"Failed to update node timestamps in graph database: {e}. " + "Will update document-level timestamps in SQL instead." 
+ ) - # Find origin documents and update SQL + # Always try to find origin documents and update SQL + # This ensures document-level tracking works even if graph updates fail + try: doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: await _update_sql_records(doc_ids, timestamp_dt) - except Exception as e: - logger.error(f"Failed to update timestamps: {e}") + logger.error(f"Failed to update SQL timestamps: {e}") raise async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): @@ -59,37 +65,42 @@ async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): ) # Update each node's last_accessed_at property + provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() + for node_id in node_ids: node = memory_fragment.get_node(node_id) if node: - # Update the node in the database - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - - if provider == "kuzu": - # Kuzu stores properties as JSON - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} + try: + # Update the node in the database + if provider == "kuzu": + # Kuzu stores properties as JSON + result = await graph_engine.query( + "MATCH (n:Node {id: $id}) RETURN n.properties", + {"id": node_id} ) - elif provider == "neo4j": - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - elif provider == "neptune": - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) + + if result and result[0]: + props = json.loads(result[0][0]) if 
result[0][0] else {} + props["last_accessed_at"] = timestamp_ms + + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.properties = $props", + {"id": node_id, "props": json.dumps(props)} + ) + elif provider == "neo4j": + await graph_engine.query( + "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + elif provider == "neptune": + await graph_engine.query( + "MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp", + {"id": node_id, "timestamp": timestamp_ms} + ) + except Exception as e: + # Log but continue with other nodes + logger.debug(f"Failed to update node {node_id}: {e}") + continue async def _find_origin_documents_via_projection(graph_engine, node_ids): """Find origin documents using graph projection instead of DB queries""" diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index c70b97a00..3894635dd 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,9 +1,9 @@ """ Task for automatically deleting unused data from the memify pipeline. -This task identifies and removes data (chunks, entities, summaries)) that hasn't +This task identifies and removes entire documents that haven't been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization. +efficiency and storage optimization through whole-document removal. """ import json @@ -28,22 +28,26 @@ async def cleanup_unused_data( minutes_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None, - text_doc: bool = False + text_doc: bool = True, # Changed default to True for document-level cleanup + node_level: bool = False # New parameter for explicit node-level cleanup ) -> Dict[str, Any]: """ Identify and remove unused data from the memify pipeline. 
- + Parameters ---------- minutes_threshold : int - days since last access to consider data unused + Minutes since last access to consider data unused dry_run : bool - If True, only report what would be delete without actually deleting (default: True) + If True, only report what would be deleted without actually deleting (default: True) user_id : UUID, optional Limit cleanup to specific user's data (default: None) text_doc : bool - If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion (default: False) + If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete() + for proper whole-document deletion + node_level : bool + If True, perform chaotic node-level deletion of unused chunks, entities, and summaries + (default: False - deprecated in favor of document-level cleanup) Returns ------- @@ -91,17 +95,19 @@ async def cleanup_unused_data( minutes_threshold=minutes_threshold, dry_run=dry_run, user_id=str(user_id) if user_id else None, - text_doc=text_doc + text_doc=text_doc, + node_level=node_level ) # Calculate cutoff timestamp cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - if text_doc: - # SQL-based approach: Find unused TextDocuments and use cognee.delete() - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - else: - # Graph-based approach: Find unused nodes using projection (database-agnostic) + if node_level: + # Deprecated: Node-level approach (chaotic) + logger.warning( + "Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. " + "Consider using document-level cleanup (default) instead." 
+ ) cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") @@ -147,6 +153,9 @@ async def cleanup_unused_data( }, "cleanup_date": datetime.now(timezone.utc).isoformat() } + else: + # Default: Document-level approach (recommended) + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) async def _cleanup_via_sql( @@ -243,6 +252,7 @@ async def _cleanup_via_sql( async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: """ Find unused nodes using graph projection - database-agnostic approach. + NOTE: This function is deprecated as it leads to fragmented knowledge graphs. Parameters ---------- @@ -291,6 +301,7 @@ async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[st async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: """ Delete unused nodes from graph and vector databases. + NOTE: This function is deprecated as it leads to fragmented knowledge graphs. 
Parameters ---------- @@ -325,7 +336,7 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: if not node_ids: continue - # Count edges connected to these nodes + # Count edges from the in-memory graph for node_id in node_ids: node = memory_fragment.get_node(node_id) if node: From 829a6f0d04bcfec6e9c9f94219a29d6ab5cd909d Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Wed, 10 Dec 2025 22:41:01 +0530 Subject: [PATCH 28/37] fix: only document level deletion --- .../retrieval/utils/access_tracking.py | 80 +-- cognee/tasks/cleanup/cleanup_unused_data.py | 521 ++++++------------ cognee/tests/test_cleanup_unused_data.py | 388 ++++++------- 3 files changed, 333 insertions(+), 656 deletions(-) diff --git a/cognee/modules/retrieval/utils/access_tracking.py b/cognee/modules/retrieval/utils/access_tracking.py index c7b06ee17..54fd043b9 100644 --- a/cognee/modules/retrieval/utils/access_tracking.py +++ b/cognee/modules/retrieval/utils/access_tracking.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from typing import List, Any from uuid import UUID -import os +import os from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -14,38 +14,28 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph logger = get_logger(__name__) + async def update_node_access_timestamps(items: List[Any]): if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": return - + if not items: return - + graph_engine = await get_graph_engine() - timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000) timestamp_dt = datetime.now(timezone.utc) - + # Extract node IDs node_ids = [] for item in items: item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id") if item_id: node_ids.append(str(item_id)) - + if not node_ids: return - - try: - # Try to update nodes in graph 
database (may fail for unsupported DBs) - await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms) - except Exception as e: - logger.warning( - f"Failed to update node timestamps in graph database: {e}. " - "Will update document-level timestamps in SQL instead." - ) - - # Always try to find origin documents and update SQL - # This ensures document-level tracking works even if graph updates fail + + # Focus on document-level tracking via projection try: doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids) if doc_ids: @@ -54,53 +44,6 @@ async def update_node_access_timestamps(items: List[Any]): logger.error(f"Failed to update SQL timestamps: {e}") raise -async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms): - """Update nodes using graph projection - works with any graph database""" - # Project the graph with necessary properties - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id"], - edge_properties_to_project=[] - ) - - # Update each node's last_accessed_at property - provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower() - - for node_id in node_ids: - node = memory_fragment.get_node(node_id) - if node: - try: - # Update the node in the database - if provider == "kuzu": - # Kuzu stores properties as JSON - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n.properties", - {"id": node_id} - ) - - if result and result[0]: - props = json.loads(result[0][0]) if result[0][0] else {} - props["last_accessed_at"] = timestamp_ms - - await graph_engine.query( - "MATCH (n:Node {id: $id}) SET n.properties = $props", - {"id": node_id, "props": json.dumps(props)} - ) - elif provider == "neo4j": - await graph_engine.query( - "MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - elif provider == "neptune": - await graph_engine.query( - "MATCH 
(n:Node {id: $id}) SET n.last_accessed_at = $timestamp", - {"id": node_id, "timestamp": timestamp_ms} - ) - except Exception as e: - # Log but continue with other nodes - logger.debug(f"Failed to update node {node_id}: {e}") - continue async def _find_origin_documents_via_projection(graph_engine, node_ids): """Find origin documents using graph projection instead of DB queries""" @@ -111,7 +54,7 @@ async def _find_origin_documents_via_projection(graph_engine, node_ids): node_properties_to_project=["id", "type"], edge_properties_to_project=["relationship_name"] ) - + # Find origin documents by traversing the in-memory graph doc_ids = set() for node_id in node_ids: @@ -123,9 +66,10 @@ async def _find_origin_documents_via_projection(graph_engine, node_ids): neighbor = edge.get_destination_node() if edge.get_source_node().id == node_id else edge.get_source_node() if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]: doc_ids.add(neighbor.id) - + return list(doc_ids) + async def _update_sql_records(doc_ids, timestamp_dt): """Update SQL Data table (same for all providers)""" db_engine = get_relational_engine() @@ -133,6 +77,6 @@ async def _update_sql_records(doc_ids, timestamp_dt): stmt = update(Data).where( Data.id.in_([UUID(doc_id) for doc_id in doc_ids]) ).values(last_accessed=timestamp_dt) - + await session.execute(stmt) await session.commit() diff --git a/cognee/tasks/cleanup/cleanup_unused_data.py b/cognee/tasks/cleanup/cleanup_unused_data.py index 3894635dd..34cde1b6f 100644 --- a/cognee/tasks/cleanup/cleanup_unused_data.py +++ b/cognee/tasks/cleanup/cleanup_unused_data.py @@ -1,382 +1,187 @@ -""" -Task for automatically deleting unused data from the memify pipeline. +""" +Task for automatically deleting unused data from the memify pipeline. 
+ +This task identifies and removes entire documents that haven't +been accessed by retrievers for a specified period, helping maintain system +efficiency and storage optimization through whole-document removal. +""" + +import json +from datetime import datetime, timezone, timedelta +from typing import Optional, Dict, Any +from uuid import UUID +import os +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.data.models import Data, DatasetData +from cognee.shared.logging_utils import get_logger +from sqlalchemy import select, or_ +import cognee +import sqlalchemy as sa +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph + +logger = get_logger(__name__) + + +async def cleanup_unused_data( + minutes_threshold: Optional[int], + dry_run: bool = True, + user_id: Optional[UUID] = None +) -> Dict[str, Any]: + """ + Identify and remove unused data from the memify pipeline. + + Parameters + ---------- + minutes_threshold : int + Minutes since last access to consider data unused + dry_run : bool + If True, only report what would be deleted without actually deleting (default: True) + user_id : UUID, optional + Limit cleanup to specific user's data (default: None) + + Returns + ------- + Dict[str, Any] + Cleanup results with status, counts, and timestamp + """ + # Check 1: Environment variable must be enabled + if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": + logger.warning( + "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." 
+ ) + return { + "status": "skipped", + "reason": "ENABLE_LAST_ACCESSED not enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + # Check 2: Verify tracking has actually been running + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + # Count records with non-NULL last_accessed + tracked_count = await session.execute( + select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) + ) + tracked_records = tracked_count.scalar() + + if tracked_records == 0: + logger.warning( + "Cleanup skipped: No records have been tracked yet. " + "ENABLE_LAST_ACCESSED may have been recently enabled. " + "Wait for retrievers to update timestamps before running cleanup." + ) + return { + "status": "skipped", + "reason": "No tracked records found - tracking may be newly enabled", + "unused_count": 0, + "deleted_count": {}, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } + + logger.info( + "Starting cleanup task", + minutes_threshold=minutes_threshold, + dry_run=dry_run, + user_id=str(user_id) if user_id else None + ) + + # Calculate cutoff timestamp + cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) -This task identifies and removes entire documents that haven't -been accessed by retrievers for a specified period, helping maintain system -efficiency and storage optimization through whole-document removal. 
-""" - -import json -from datetime import datetime, timezone, timedelta -from typing import Optional, Dict, Any -from uuid import UUID -import os -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.data.models import Data, DatasetData -from cognee.shared.logging_utils import get_logger -from sqlalchemy import select, or_ -import cognee -import sqlalchemy as sa -from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph - -logger = get_logger(__name__) + # Document-level approach (recommended) + return await _cleanup_via_sql(cutoff_date, dry_run, user_id) -async def cleanup_unused_data( - minutes_threshold: Optional[int], - dry_run: bool = True, - user_id: Optional[UUID] = None, - text_doc: bool = True, # Changed default to True for document-level cleanup - node_level: bool = False # New parameter for explicit node-level cleanup +async def _cleanup_via_sql( + cutoff_date: datetime, + dry_run: bool, + user_id: Optional[UUID] = None ) -> Dict[str, Any]: """ - Identify and remove unused data from the memify pipeline. - + SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
+ Parameters ---------- - minutes_threshold : int - Minutes since last access to consider data unused + cutoff_date : datetime + Cutoff date for last_accessed filtering dry_run : bool - If True, only report what would be deleted without actually deleting (default: True) + If True, only report what would be deleted user_id : UUID, optional - Limit cleanup to specific user's data (default: None) - text_doc : bool - If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete() - for proper whole-document deletion - node_level : bool - If True, perform chaotic node-level deletion of unused chunks, entities, and summaries - (default: False - deprecated in favor of document-level cleanup) + Filter by user ID if provided Returns ------- Dict[str, Any] - Cleanup results with status, counts, and timestamp - """ - # Check 1: Environment variable must be enabled - if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true": - logger.warning( - "Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled." - ) - return { - "status": "skipped", - "reason": "ENABLE_LAST_ACCESSED not enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - # Check 2: Verify tracking has actually been running - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Count records with non-NULL last_accessed - tracked_count = await session.execute( - select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None)) - ) - tracked_records = tracked_count.scalar() - - if tracked_records == 0: - logger.warning( - "Cleanup skipped: No records have been tracked yet. " - "ENABLE_LAST_ACCESSED may have been recently enabled. " - "Wait for retrievers to update timestamps before running cleanup." 
- ) - return { - "status": "skipped", - "reason": "No tracked records found - tracking may be newly enabled", - "unused_count": 0, - "deleted_count": {}, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - logger.info( - "Starting cleanup task", - minutes_threshold=minutes_threshold, - dry_run=dry_run, - user_id=str(user_id) if user_id else None, - text_doc=text_doc, - node_level=node_level - ) + Cleanup results + """ + db_engine = get_relational_engine() - # Calculate cutoff timestamp - cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold) - - if node_level: - # Deprecated: Node-level approach (chaotic) - logger.warning( - "Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. " - "Consider using document-level cleanup (default) instead." - ) - cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000) - logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)") + async with db_engine.get_async_session() as session: + # Query for Data records with old last_accessed timestamps + query = select(Data, DatasetData).join( + DatasetData, Data.id == DatasetData.data_id + ).where( + or_( + Data.last_accessed < cutoff_date, + Data.last_accessed.is_(None) + ) + ) - # Find unused nodes using graph projection - unused_nodes = await _find_unused_nodes_via_projection(cutoff_timestamp_ms) - - total_unused = sum(len(nodes) for nodes in unused_nodes.values()) - logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()}) - - if dry_run: - return { - "status": "dry_run", - "unused_count": total_unused, - "deleted_count": { - "data_items": 0, - "chunks": 0, - "entities": 0, - "summaries": 0, - "associations": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "chunks": len(unused_nodes["DocumentChunk"]), - "entities": len(unused_nodes["Entity"]), - "summaries": len(unused_nodes["TextSummary"]) - } - } - - # Delete 
unused nodes (provider-agnostic deletion) - deleted_counts = await _delete_unused_nodes(unused_nodes) - - logger.info("Cleanup completed", deleted_counts=deleted_counts) + if user_id: + from cognee.modules.data.models import Dataset + query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( + Dataset.owner_id == user_id + ) + result = await session.execute(query) + unused_data = result.all() + + logger.info(f"Found {len(unused_data)} unused documents in SQL") + + if dry_run: return { - "status": "completed", - "unused_count": total_unused, + "status": "dry_run", + "unused_count": len(unused_data), "deleted_count": { "data_items": 0, - "chunks": deleted_counts["DocumentChunk"], - "entities": deleted_counts["Entity"], - "summaries": deleted_counts["TextSummary"], - "associations": deleted_counts["associations"] + "documents": 0 }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - else: - # Default: Document-level approach (recommended) - return await _cleanup_via_sql(cutoff_date, dry_run, user_id) - - -async def _cleanup_via_sql( - cutoff_date: datetime, - dry_run: bool, - user_id: Optional[UUID] = None -) -> Dict[str, Any]: - """ - SQL-based cleanup: Query Data table for unused documents and use cognee.delete(). 
- - Parameters - ---------- - cutoff_date : datetime - Cutoff date for last_accessed filtering - dry_run : bool - If True, only report what would be deleted - user_id : UUID, optional - Filter by user ID if provided - - Returns - ------- - Dict[str, Any] - Cleanup results - """ - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - # Query for Data records with old last_accessed timestamps - query = select(Data, DatasetData).join( - DatasetData, Data.id == DatasetData.data_id - ).where( - or_( - Data.last_accessed < cutoff_date, - Data.last_accessed.is_(None) - ) - ) - - if user_id: - from cognee.modules.data.models import Dataset - query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where( - Dataset.owner_id == user_id - ) - - result = await session.execute(query) - unused_data = result.all() - - logger.info(f"Found {len(unused_data)} unused documents in SQL") - - if dry_run: - return { - "status": "dry_run", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": 0, - "documents": 0 - }, - "cleanup_date": datetime.now(timezone.utc).isoformat(), - "preview": { - "documents": len(unused_data) - } - } - - # Delete each document using cognee.delete() - deleted_count = 0 - from cognee.modules.users.methods import get_default_user - user = await get_default_user() if user_id is None else None - - for data, dataset_data in unused_data: - try: - await cognee.delete( - data_id=data.id, - dataset_id=dataset_data.dataset_id, - mode="hard", # Use hard mode to also remove orphaned entities - user=user - ) - deleted_count += 1 - logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") - except Exception as e: - logger.error(f"Failed to delete document {data.id}: {e}") - - logger.info("Cleanup completed", deleted_count=deleted_count) - - return { - "status": "completed", - "unused_count": len(unused_data), - "deleted_count": { - "data_items": deleted_count, - "documents": 
deleted_count - }, - "cleanup_date": datetime.now(timezone.utc).isoformat() - } - - -async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]: - """ - Find unused nodes using graph projection - database-agnostic approach. - NOTE: This function is deprecated as it leads to fragmented knowledge graphs. + "cleanup_date": datetime.now(timezone.utc).isoformat(), + "preview": { + "documents": len(unused_data) + } + } - Parameters - ---------- - cutoff_timestamp_ms : int - Cutoff timestamp in milliseconds since epoch + # Delete each document using cognee.delete() + deleted_count = 0 + from cognee.modules.users.methods import get_default_user + user = await get_default_user() if user_id is None else None - Returns - ------- - Dict[str, list] - Dictionary mapping node types to lists of unused node IDs - """ - graph_engine = await get_graph_engine() + for data, dataset_data in unused_data: + try: + await cognee.delete( + data_id=data.id, + dataset_id=dataset_data.dataset_id, + mode="hard", # Use hard mode to also remove orphaned entities + user=user + ) + deleted_count += 1 + logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}") + except Exception as e: + logger.error(f"Failed to delete document {data.id}: {e}") - # Project the entire graph with necessary properties - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id", "type", "last_accessed_at"], - edge_properties_to_project=[] - ) - - unused_nodes = {"DocumentChunk": [], "Entity": [], "TextSummary": []} - - # Get all nodes from the projected graph - all_nodes = memory_fragment.get_nodes() - - for node in all_nodes: - node_type = node.get_attribute("type") - if node_type not in unused_nodes: - continue - - # Check last_accessed_at property - last_accessed = node.get_attribute("last_accessed_at") + logger.info("Cleanup completed", deleted_count=deleted_count) - if last_accessed is None or 
last_accessed < cutoff_timestamp_ms: - unused_nodes[node_type].append(node.id) - logger.debug( - f"Found unused {node_type}", - node_id=node.id, - last_accessed=last_accessed - ) - - return unused_nodes - - -async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]: - """ - Delete unused nodes from graph and vector databases. - NOTE: This function is deprecated as it leads to fragmented knowledge graphs. - - Parameters - ---------- - unused_nodes : Dict[str, list] - Dictionary mapping node types to lists of node IDs to delete - - Returns - ------- - Dict[str, int] - Count of deleted items by type - """ - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - deleted_counts = { - "DocumentChunk": 0, - "Entity": 0, - "TextSummary": 0, - "associations": 0 - } - - # Count associations before deletion (using graph projection for consistency) - if any(unused_nodes.values()): - memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db( - graph_engine, - node_properties_to_project=["id"], - edge_properties_to_project=[] - ) - - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - # Count edges from the in-memory graph - for node_id in node_ids: - node = memory_fragment.get_node(node_id) - if node: - # Count edges from the in-memory graph - edge_count = len(node.get_skeleton_edges()) - deleted_counts["associations"] += edge_count - - # Delete from graph database (uses DETACH DELETE, so edges are automatically removed) - for node_type, node_ids in unused_nodes.items(): - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database") - - # Delete nodes in batches (database-agnostic) - await graph_engine.delete_nodes(node_ids) - deleted_counts[node_type] = len(node_ids) - - # Delete from vector database - vector_collections = { - "DocumentChunk": "DocumentChunk_text", - "Entity": "Entity_name", - "TextSummary": "TextSummary_text" - 
} - - - for node_type, collection_name in vector_collections.items(): - node_ids = unused_nodes[node_type] - if not node_ids: - continue - - logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database") - - try: - if await vector_engine.has_collection(collection_name): - await vector_engine.delete_data_points( - collection_name, - [str(node_id) for node_id in node_ids] - ) - except Exception as e: - logger.error(f"Error deleting from vector collection {collection_name}: {e}") - - return deleted_counts + return { + "status": "completed", + "unused_count": len(unused_data), + "deleted_count": { + "data_items": deleted_count, + "documents": deleted_count + }, + "cleanup_date": datetime.now(timezone.utc).isoformat() + } diff --git a/cognee/tests/test_cleanup_unused_data.py b/cognee/tests/test_cleanup_unused_data.py index c21b9f5ea..e738dcba0 100644 --- a/cognee/tests/test_cleanup_unused_data.py +++ b/cognee/tests/test_cleanup_unused_data.py @@ -1,244 +1,172 @@ -import os -import pathlib -import cognee -from datetime import datetime, timezone, timedelta -from uuid import UUID -from sqlalchemy import select, update -from cognee.modules.data.models import Data, DatasetData -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.modules.users.methods import get_default_user -from cognee.shared.logging_utils import get_logger -from cognee.modules.search.types import SearchType - -logger = get_logger() - - -async def test_textdocument_cleanup_with_sql(): - """ - End-to-end test for TextDocument cleanup based on last_accessed timestamps. 
+import os +import pathlib +import cognee +from datetime import datetime, timezone, timedelta +from uuid import UUID +from sqlalchemy import select, update +from cognee.modules.data.models import Data, DatasetData +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType - Tests: - 1. Add and cognify a document - 2. Perform search to populate last_accessed timestamp - 3. Verify last_accessed is set in SQL Data table - 4. Manually age the timestamp beyond cleanup threshold - 5. Run cleanup with text_doc=True - 6. Verify document was deleted from all databases (relational, graph, and vector) - """ - # Setup test directories - data_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") - ).resolve() - ) - cognee_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") - ).resolve() - ) +logger = get_logger() - cognee.config.data_root_directory(data_directory_path) - cognee.config.system_root_directory(cognee_directory_path) - # Initialize database - from cognee.modules.engine.operations.setup import setup +async def test_textdocument_cleanup_with_sql(): + """ + End-to-end test for TextDocument cleanup based on last_accessed timestamps. + """ + # Enable last accessed tracking BEFORE any cognee operations + os.environ["ENABLE_LAST_ACCESSED"] = "true" - # Clean slate - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") - - # Step 1: Add and cognify a test document - dataset_name = "test_cleanup_dataset" - test_text = """ - Machine learning is a subset of artificial intelligence that enables systems to learn - and improve from experience without being explicitly programmed. 
Deep learning uses - neural networks with multiple layers to process data. - """ - - await setup() - user = await get_default_user() - await cognee.add([test_text], dataset_name=dataset_name, user=user) - - cognify_result = await cognee.cognify([dataset_name], user=user) - - # Extract dataset_id from cognify result (ds_id is already a UUID) - dataset_id = None - for ds_id, pipeline_result in cognify_result.items(): - dataset_id = ds_id # Don't wrap in UUID() - it's already a UUID object - break - - assert dataset_id is not None, "Failed to get dataset_id from cognify result" - logger.info(f"โœ… Document added and cognified. Dataset ID: {dataset_id}") - - # Step 2: Perform search to trigger last_accessed update - logger.info("Triggering search to update last_accessed...") - search_results = await cognee.search( - query_type=SearchType.CHUNKS, - query_text="machine learning", - datasets=[dataset_name], - user=user - ) - logger.info(f"โœ… Search completed, found {len(search_results)} results") - - # Step 3: Verify last_accessed was set in SQL Data table - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - # Get the Data record for this dataset - result = await session.execute( - select(Data, DatasetData) - .join(DatasetData, Data.id == DatasetData.data_id) - .where(DatasetData.dataset_id == dataset_id) - ) - data_records = result.all() - assert len(data_records) > 0, "No Data records found for the dataset" - data_record = data_records[0][0] - data_id = data_record.id + # Setup test directories + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup") + ).resolve() + ) - # Verify last_accessed is set (should be set by search operation) - assert data_record.last_accessed is not None, ( - "last_accessed should be set after 
search operation" - ) + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) - original_last_accessed = data_record.last_accessed - logger.info(f"โœ… last_accessed verified: {original_last_accessed}") - - # Step 4: Manually age the timestamp to be older than cleanup threshold - days_threshold = 30 - aged_timestamp = datetime.now(timezone.utc) - timedelta(days=days_threshold + 10) - - async with db_engine.get_async_session() as session: - stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) - await session.execute(stmt) - await session.commit() - - # Query in a NEW session to avoid cached values - async with db_engine.get_async_session() as session: - result = await session.execute(select(Data).where(Data.id == data_id)) - updated_data = result.scalar_one_or_none() + # Initialize database + from cognee.modules.engine.operations.setup import setup - # Make both timezone-aware for comparison - retrieved_timestamp = updated_data.last_accessed - if retrieved_timestamp.tzinfo is None: - # If database returned naive datetime, make it UTC-aware - retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + # Clean slate + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) - assert retrieved_timestamp == aged_timestamp, ( - f"Timestamp should be updated to aged value. 
" - f"Expected: {aged_timestamp}, Got: {retrieved_timestamp}" - ) - - # Step 5: Test cleanup with text_doc=True - from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data - - # First do a dry run - logger.info("Testing dry run with text_doc=True...") - dry_run_result = await cleanup_unused_data( - days_threshold=30, - dry_run=True, - user_id=user.id, - text_doc=True - ) - - assert dry_run_result['status'] == 'dry_run', "Status should be 'dry_run'" - assert dry_run_result['unused_count'] > 0, ( - "Should find at least one unused document" - ) - logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") - - # Now run actual cleanup - logger.info("Executing cleanup with text_doc=True...") - cleanup_result = await cleanup_unused_data( - days_threshold=30, - dry_run=False, - user_id=user.id, - text_doc=True - ) - - assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" - assert cleanup_result["deleted_count"]["documents"] > 0, ( - "At least one document should be deleted" - ) - logger.info(f"โœ… Cleanup completed. 
Deleted {cleanup_result['deleted_count']['documents']} documents") - - # Step 6: Verify the document was actually deleted from SQL - async with db_engine.get_async_session() as session: - deleted_data = ( - await session.execute(select(Data).where(Data.id == data_id)) - ).scalar_one_or_none() + logger.info("๐Ÿงช Testing TextDocument cleanup based on last_accessed") - assert deleted_data is None, ( - "Data record should be deleted after cleanup" - ) - logger.info("โœ… Confirmed: Data record was deleted from SQL database") - - # Verify the dataset-data link was also removed - async with db_engine.get_async_session() as session: - dataset_data_link = ( - await session.execute( - select(DatasetData).where( - DatasetData.data_id == data_id, - DatasetData.dataset_id == dataset_id - ) - ) - ).scalar_one_or_none() + # Step 1: Add and cognify a test document + dataset_name = "test_cleanup_dataset" + test_text = """ + Machine learning is a subset of artificial intelligence that enables systems to learn + and improve from experience without being explicitly programmed. Deep learning uses + neural networks with multiple layers to process data. + """ - assert dataset_data_link is None, ( - "DatasetData link should be deleted after cleanup" - ) - logger.info("โœ… Confirmed: DatasetData link was deleted") + await setup() + user = await get_default_user() + await cognee.add([test_text], dataset_name=dataset_name, user=user) + + cognify_result = await cognee.cognify([dataset_name], user=user) + + # Extract dataset_id from cognify result + dataset_id = None + for ds_id, pipeline_result in cognify_result.items(): + dataset_id = ds_id + break + + assert dataset_id is not None, "Failed to get dataset_id from cognify result" + logger.info(f"โœ… Document added and cognified. 
Dataset ID: {dataset_id}") + + # Step 2: Perform search to trigger last_accessed update + logger.info("Triggering search to update last_accessed...") + search_results = await cognee.search( + query_type=SearchType.CHUNKS, + query_text="machine learning", + datasets=[dataset_name], + user=user + ) + logger.info(f"โœ… Search completed, found {len(search_results)} results") + assert len(search_results) > 0, "Search should return results" + + # Step 3: Verify last_accessed was set and get data_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + result = await session.execute( + select(Data, DatasetData) + .join(DatasetData, Data.id == DatasetData.data_id) + .where(DatasetData.dataset_id == dataset_id) + ) + data_records = result.all() + assert len(data_records) > 0, "No Data records found for the dataset" + data_record = data_records[0][0] + data_id = data_record.id + + # Verify last_accessed is set + assert data_record.last_accessed is not None, ( + "last_accessed should be set after search operation" + ) + + original_last_accessed = data_record.last_accessed + logger.info(f"โœ… last_accessed verified: {original_last_accessed}") + + # Step 4: Manually age the timestamp + minutes_threshold = 30 + aged_timestamp = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold + 10) + + async with db_engine.get_async_session() as session: + stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp) + await session.execute(stmt) + await session.commit() + + # Verify timestamp was updated + async with db_engine.get_async_session() as session: + result = await session.execute(select(Data).where(Data.id == data_id)) + updated_data = result.scalar_one_or_none() + assert updated_data is not None, "Data record should exist" + retrieved_timestamp = updated_data.last_accessed + if retrieved_timestamp.tzinfo is None: + retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc) + assert 
retrieved_timestamp == aged_timestamp, ( + f"Timestamp should be updated to aged value" + ) + + # Step 5: Test cleanup (document-level is now the default) + from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data + + # First do a dry run + logger.info("Testing dry run...") + dry_run_result = await cleanup_unused_data( + minutes_threshold=10, + dry_run=True, + user_id=user.id + ) + + # Debug: Print the actual result + logger.info(f"Dry run result: {dry_run_result}") - # Verify graph nodes were cleaned up - from cognee.infrastructure.databases.graph import get_graph_engine + assert dry_run_result['status'] == 'dry_run', f"Status should be 'dry_run', got: {dry_run_result['status']}" + assert dry_run_result['unused_count'] > 0, ( + "Should find at least one unused document" + ) + logger.info(f"โœ… Dry run found {dry_run_result['unused_count']} unused documents") + + # Now run actual cleanup + logger.info("Executing cleanup...") + cleanup_result = await cleanup_unused_data( + minutes_threshold=30, + dry_run=False, + user_id=user.id + ) + + assert cleanup_result["status"] == "completed", "Cleanup should complete successfully" + assert cleanup_result["deleted_count"]["documents"] > 0, ( + "At least one document should be deleted" + ) + logger.info(f"โœ… Cleanup completed. 
Deleted {cleanup_result['deleted_count']['documents']} documents") + + # Step 6: Verify deletion + async with db_engine.get_async_session() as session: + deleted_data = ( + await session.execute(select(Data).where(Data.id == data_id)) + ).scalar_one_or_none() + assert deleted_data is None, "Data record should be deleted" + logger.info("โœ… Confirmed: Data record was deleted") + + logger.info("๐ŸŽ‰ All cleanup tests passed!") + return True - graph_engine = await get_graph_engine() - # Try to find the TextDocument node - it should not exist - result = await graph_engine.query( - "MATCH (n:Node {id: $id}) RETURN n", - {"id": str(data_id)} - ) - - assert len(result) == 0, ( - "TextDocument node should be deleted from graph database" - ) - logger.info("โœ… Confirmed: TextDocument node was deleted from graph database") - - # Verify vector database was cleaned up - from cognee.infrastructure.databases.vector import get_vector_engine - - vector_engine = get_vector_engine() - - # Check each collection that should have been cleaned up - vector_collections = [ - "DocumentChunk_text", - "Entity_name", - "TextSummary_text" - ] - - for collection_name in vector_collections: - if await vector_engine.has_collection(collection_name): - # Try to retrieve the deleted data points - try: - results = await vector_engine.retrieve(collection_name, [str(data_id)]) - assert len(results) == 0, ( - f"Data points should be deleted from {collection_name} collection" - ) - logger.info(f"โœ… Confirmed: {collection_name} collection is clean") - except Exception as e: - # Collection might be empty or not exist, which is fine - logger.info(f"โœ… Confirmed: {collection_name} collection is empty or doesn't exist") - pass - - logger.info("โœ… Confirmed: Vector database entries were deleted") - - logger.info("๐ŸŽ‰ All cleanup tests passed!") - - return True - - -if __name__ == "__main__": - import asyncio - success = asyncio.run(test_textdocument_cleanup_with_sql()) +if __name__ == "__main__": + import 
asyncio + success = asyncio.run(test_textdocument_cleanup_with_sql()) exit(0 if success else 1) From 2485c3f5f0c2b25572213fe7638467859679c8d2 Mon Sep 17 00:00:00 2001 From: chinu0609 Date: Thu, 11 Dec 2025 12:48:06 +0530 Subject: [PATCH 29/37] fix: only document level deletion --- cognee/infrastructure/engine/models/DataPoint.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 3178713c8..812380eaa 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -43,9 +43,6 @@ class DataPoint(BaseModel): updated_at: int = Field( default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) ) - last_accessed_at: int = Field( - default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000) - ) ontology_valid: bool = False version: int = 1 # Default version topological_rank: Optional[int] = 0 From cd60ae31740acc9444f5aaf61fd7720deb2a5c51 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 11 Dec 2025 15:25:44 +0100 Subject: [PATCH 30/37] test: remove docs tests. 
add trigger to docs repo --- .github/workflows/docs_tests.yml | 280 ------------------ .github/workflows/release_test.yml | 23 +- .../tests/docs/guides/custom_data_models.py | 38 --- cognee/tests/docs/guides/custom_prompts.py | 30 -- .../docs/guides/custom_tasks_and_pipelines.py | 53 ---- .../tests/docs/guides/graph_visualization.py | 13 - cognee/tests/docs/guides/low_level_llm.py | 31 -- cognee/tests/docs/guides/memify_quickstart.py | 29 -- .../tests/docs/guides/ontology_quickstart.py | 30 -- cognee/tests/docs/guides/s3_storage.py | 25 -- cognee/tests/docs/guides/search_basics.py | 58 ---- cognee/tests/docs/guides/temporal_cognify.py | 57 ---- 12 files changed, 16 insertions(+), 651 deletions(-) delete mode 100644 .github/workflows/docs_tests.yml delete mode 100644 cognee/tests/docs/guides/custom_data_models.py delete mode 100644 cognee/tests/docs/guides/custom_prompts.py delete mode 100644 cognee/tests/docs/guides/custom_tasks_and_pipelines.py delete mode 100644 cognee/tests/docs/guides/graph_visualization.py delete mode 100644 cognee/tests/docs/guides/low_level_llm.py delete mode 100644 cognee/tests/docs/guides/memify_quickstart.py delete mode 100644 cognee/tests/docs/guides/ontology_quickstart.py delete mode 100644 cognee/tests/docs/guides/s3_storage.py delete mode 100644 cognee/tests/docs/guides/search_basics.py delete mode 100644 cognee/tests/docs/guides/temporal_cognify.py diff --git a/.github/workflows/docs_tests.yml b/.github/workflows/docs_tests.yml deleted file mode 100644 index 7f7282bb2..000000000 --- a/.github/workflows/docs_tests.yml +++ /dev/null @@ -1,280 +0,0 @@ -name: Docs Tests - -permissions: - contents: read - -on: - workflow_dispatch: - workflow_call: - secrets: - LLM_PROVIDER: - required: true - LLM_MODEL: - required: true - LLM_ENDPOINT: - required: true - LLM_API_KEY: - required: true - LLM_API_VERSION: - required: true - EMBEDDING_PROVIDER: - required: true - EMBEDDING_MODEL: - required: true - EMBEDDING_ENDPOINT: - required: true - 
EMBEDDING_API_KEY: - required: true - EMBEDDING_API_VERSION: - required: true - -env: - ENV: 'dev' - -jobs: - test-search-basics: - name: Test Search Basics - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Search Basics Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/search_basics.py - - test-temporal-cognify: - name: Test Temporal Cognify - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Temporal Cognify Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/temporal_cognify.py - - test-ontology-quickstart: - name: Test Temporal Cognify - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Temporal Cognify Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - 
LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/temporal_cognify.py - - test-s3-storage: - name: Test S3 Docs Guide - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - extra-dependencies: "aws" - - - name: Run S3 Docs Guide Test - env: - ENABLE_BACKEND_ACCESS_CONTROL: True - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - STORAGE_BACKEND: s3 - AWS_REGION: eu-west-1 - AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} - run: uv run python ./cognee/tests/docs/guides/s3_storage.py - - test-graph-visualization: - name: Test Graph Visualization - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Graph Visualization Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - 
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/graph_visualization.py - - test-low-level-llm: - name: Test Low Level LLM - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Low Level LLM Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/low_level_llm.py - - test-memify-quickstart: - name: Test Memify Quickstart - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Memify Quickstart Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/memify_quickstart.py - - test-custom-data-models: - name: Test Custom Data Models - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Custom Data Models Test - env: - 
LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/custom_data_models.py - - test-custom-tasks-and-pipelines: - name: Test Custom Tasks and Pipelines - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Custom Tasks and Pipelines Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./cognee/tests/docs/guides/custom_tasks_and_pipelines.py - - test-custom-prompts: - name: Test Custom Prompts - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Custom Prompts Test - env: - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python 
./cognee/tests/docs/guides/custom_prompts.py \ No newline at end of file diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 89540fcfb..c6dd68484 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -5,18 +5,27 @@ permissions: contents: read on: + push: + branches: + - feature/cog-3213-docs-set-up-guide-script-tests workflow_dispatch: pull_request: branches: - main jobs: - load-tests: - name: Load Tests - uses: ./.github/workflows/load_tests.yml - secrets: inherit +# load-tests: +# name: Load Tests +# uses: ./.github/workflows/load_tests.yml +# secrets: inherit docs-tests: - name: Docs Tests - uses: ./.github/workflows/docs_tests.yml - secrets: inherit \ No newline at end of file + runs-on: ubuntu-22.04 + steps: + - name: Trigger docs tests + run: | + curl -sS -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.DOCS_REPO_PAT_TOKEN }}" \ + https://api.github.com/repos/your-org/repo-b/dispatches \ + -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' diff --git a/cognee/tests/docs/guides/custom_data_models.py b/cognee/tests/docs/guides/custom_data_models.py deleted file mode 100644 index 0eb314227..000000000 --- a/cognee/tests/docs/guides/custom_data_models.py +++ /dev/null @@ -1,38 +0,0 @@ -import asyncio -from typing import Any -from pydantic import SkipValidation - -import cognee -from cognee.infrastructure.engine import DataPoint -from cognee.infrastructure.engine.models.Edge import Edge -from cognee.tasks.storage import add_data_points - - -class Person(DataPoint): - name: str - # Keep it simple for forward refs / mixed values - knows: SkipValidation[Any] = None # single Person or list[Person] - # Recommended: specify which fields to index for search - metadata: dict = {"index_fields": ["name"]} - - -async def main(): - # Start clean (optional in your app) - await cognee.prune.prune_data() - await 
cognee.prune.prune_system(metadata=True) - - alice = Person(name="Alice") - bob = Person(name="Bob") - charlie = Person(name="Charlie") - - # Create relationships - field name becomes edge label - alice.knows = bob - # You can also do lists: alice.knows = [bob, charlie] - - # Optional: add weights and custom relationship types - bob.knows = (Edge(weight=0.9, relationship_type="friend_of"), charlie) - - await add_data_points([alice, bob, charlie]) - - -asyncio.run(main()) diff --git a/cognee/tests/docs/guides/custom_prompts.py b/cognee/tests/docs/guides/custom_prompts.py deleted file mode 100644 index 0d0a55a80..000000000 --- a/cognee/tests/docs/guides/custom_prompts.py +++ /dev/null @@ -1,30 +0,0 @@ -import asyncio -import cognee -from cognee.api.v1.search import SearchType - -custom_prompt = """ -Extract only people and cities as entities. -Connect people to cities with the relationship "lives_in". -Ignore all other entities. -""" - - -async def main(): - await cognee.add( - [ - "Alice moved to Paris in 2010, while Bob has always lived in New York.", - "Andreas was born in Venice, but later settled in Lisbon.", - "Diana and Tom were born and raised in Helsingy. 
Diana currently resides in Berlin, while Tom never moved.", - ] - ) - await cognee.cognify(custom_prompt=custom_prompt) - - res = await cognee.search( - query_type=SearchType.GRAPH_COMPLETION, - query_text="Where does Alice live?", - ) - print(res) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/custom_tasks_and_pipelines.py b/cognee/tests/docs/guides/custom_tasks_and_pipelines.py deleted file mode 100644 index 202bb128a..000000000 --- a/cognee/tests/docs/guides/custom_tasks_and_pipelines.py +++ /dev/null @@ -1,53 +0,0 @@ -import asyncio -from typing import Any, Dict, List -from pydantic import BaseModel, SkipValidation - -import cognee -from cognee.modules.engine.operations.setup import setup -from cognee.infrastructure.llm.LLMGateway import LLMGateway -from cognee.infrastructure.engine import DataPoint -from cognee.tasks.storage import add_data_points -from cognee.modules.pipelines import Task, run_pipeline - - -class Person(DataPoint): - name: str - # Optional relationships (we'll let the LLM populate this) - knows: List["Person"] = [] - # Make names searchable in the vector store - metadata: Dict[str, Any] = {"index_fields": ["name"]} - - -class People(BaseModel): - persons: List[Person] - - -async def extract_people(text: str) -> List[Person]: - system_prompt = ( - "Extract people mentioned in the text. " - "Return as `persons: Person[]` with each Person having `name` and optional `knows` relations. " - "If the text says someone knows someone set `knows` accordingly. " - "Only include facts explicitly stated." - ) - people = await LLMGateway.acreate_structured_output(text, system_prompt, People) - return people.persons - - -async def main(): - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await setup() - - text = "Alice knows Bob." 
- - tasks = [ - Task(extract_people), # input: text -> output: list[Person] - Task(add_data_points), # input: list[Person] -> output: list[Person] - ] - - async for _ in run_pipeline(tasks=tasks, data=text, datasets=["people_demo"]): - pass - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/graph_visualization.py b/cognee/tests/docs/guides/graph_visualization.py deleted file mode 100644 index d463cbb56..000000000 --- a/cognee/tests/docs/guides/graph_visualization.py +++ /dev/null @@ -1,13 +0,0 @@ -import asyncio -import cognee -from cognee.api.v1.visualize.visualize import visualize_graph - - -async def main(): - await cognee.add(["Alice knows Bob.", "NLP is a subfield of CS."]) - await cognee.cognify() - - await visualize_graph("./graph_after_cognify.html") - - -asyncio.run(main()) diff --git a/cognee/tests/docs/guides/low_level_llm.py b/cognee/tests/docs/guides/low_level_llm.py deleted file mode 100644 index 454f53f44..000000000 --- a/cognee/tests/docs/guides/low_level_llm.py +++ /dev/null @@ -1,31 +0,0 @@ -import asyncio - -from pydantic import BaseModel -from typing import List -from cognee.infrastructure.llm.LLMGateway import LLMGateway - - -class MiniEntity(BaseModel): - name: str - type: str - - -class MiniGraph(BaseModel): - nodes: List[MiniEntity] - - -async def main(): - system_prompt = ( - "Extract entities as nodes with name and type. " - "Use concise, literal values present in the text." - ) - - text = "Apple develops iPhone; Audi produces the R8." 
- - result = await LLMGateway.acreate_structured_output(text, system_prompt, MiniGraph) - print(result) - # MiniGraph(nodes=[MiniEntity(name='Apple', type='Organization'), ...]) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/memify_quickstart.py b/cognee/tests/docs/guides/memify_quickstart.py deleted file mode 100644 index 040654350..000000000 --- a/cognee/tests/docs/guides/memify_quickstart.py +++ /dev/null @@ -1,29 +0,0 @@ -import asyncio -import cognee -from cognee import SearchType - - -async def main(): - # 1) Add two short chats and build a graph - await cognee.add( - [ - "We follow PEP8. Add type hints and docstrings.", - "Releases should not be on Friday. Susan must review PRs.", - ], - dataset_name="rules_demo", - ) - await cognee.cognify(datasets=["rules_demo"]) # builds graph - - # 2) Enrich the graph (uses default memify tasks) - await cognee.memify(dataset="rules_demo") - - # 3) Query the new coding rules - rules = await cognee.search( - query_type=SearchType.CODING_RULES, - query_text="List coding rules", - node_name=["coding_agent_rules"], - ) - print("Rules:", rules) - - -asyncio.run(main()) diff --git a/cognee/tests/docs/guides/ontology_quickstart.py b/cognee/tests/docs/guides/ontology_quickstart.py deleted file mode 100644 index 2784dab19..000000000 --- a/cognee/tests/docs/guides/ontology_quickstart.py +++ /dev/null @@ -1,30 +0,0 @@ -import asyncio -import cognee - - -async def main(): - texts = ["Audi produces the R8 and e-tron.", "Apple develops iPhone and MacBook."] - - await cognee.add(texts) - # or: await cognee.add("/path/to/folder/of/files") - - import os - from cognee.modules.ontology.ontology_config import Config - from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver - - ontology_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "ontology_input_example/basic_ontology.owl" - ) - - # Create full config structure manually - config: Config = { - 
"ontology_config": { - "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_path) - } - } - - await cognee.cognify(config=config) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/s3_storage.py b/cognee/tests/docs/guides/s3_storage.py deleted file mode 100644 index 1044e05b4..000000000 --- a/cognee/tests/docs/guides/s3_storage.py +++ /dev/null @@ -1,25 +0,0 @@ -import asyncio -import cognee - - -async def main(): - # Single file - await cognee.add("s3://cognee-temp/2024-11-04.md") - - # Folder/prefix (recursively expands) - await cognee.add("s3://cognee-temp") - - # Mixed list - await cognee.add( - [ - "s3://cognee-temp/2024-11-04.md", - "Some inline text to ingest", - ] - ) - - # Process the data - await cognee.cognify() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/docs/guides/search_basics.py b/cognee/tests/docs/guides/search_basics.py deleted file mode 100644 index f1847ad4b..000000000 --- a/cognee/tests/docs/guides/search_basics.py +++ /dev/null @@ -1,58 +0,0 @@ -import asyncio -import cognee - -from cognee.modules.search.types import SearchType, CombinedSearchResult - - -async def main(): - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - text = """ - Natural language processing (NLP) is an interdisciplinary - subfield of computer science and information retrieval. - First rule of coding: Do not talk about coding. - """ - - text2 = """ - Sandwiches are best served toasted with cheese, ham, mayo, - lettuce, mustard, and salt & pepper. - """ - - await cognee.add(text, dataset_name="NLP_coding") - await cognee.add(text2, dataset_name="Sandwiches") - await cognee.add(text2) - - await cognee.cognify() - - # Make sure you've already run cognee.cognify(...) 
so the graph has content - answers = await cognee.search(query_text="What are the main themes in my data?") - assert len(answers) > 0 - - answers = await cognee.search( - query_text="List coding guidelines", - query_type=SearchType.CODING_RULES, - ) - assert len(answers) > 0 - - answers = await cognee.search( - query_text="Give me a confident answer: What is NLP?", - system_prompt="Answer succinctly and state confidence at the end.", - ) - assert len(answers) > 0 - - answers = await cognee.search( - query_text="Tell me about NLP", - only_context=True, - ) - assert len(answers) > 0 - - answers = await cognee.search( - query_text="Quarterly financial highlights", - datasets=["NLP_coding", "Sandwiches"], - use_combined_context=True, - ) - assert isinstance(answers, CombinedSearchResult) - - -asyncio.run(main()) diff --git a/cognee/tests/docs/guides/temporal_cognify.py b/cognee/tests/docs/guides/temporal_cognify.py deleted file mode 100644 index 34c1ee33c..000000000 --- a/cognee/tests/docs/guides/temporal_cognify.py +++ /dev/null @@ -1,57 +0,0 @@ -import asyncio -import cognee - - -async def main(): - text = """ - In 1998 the project launched. In 2001 version 1.0 shipped. In 2004 the team merged - with another group. In 2010 support for v1 ended. 
- """ - - await cognee.add(text, dataset_name="timeline_demo") - - await cognee.cognify(datasets=["timeline_demo"], temporal_cognify=True) - - from cognee.api.v1.search import SearchType - - # Before / after queries - result = await cognee.search( - query_type=SearchType.TEMPORAL, query_text="What happened before 2000?", top_k=10 - ) - - assert result != [] - - result = await cognee.search( - query_type=SearchType.TEMPORAL, query_text="What happened after 2010?", top_k=10 - ) - - assert result != [] - - # Between queries - result = await cognee.search( - query_type=SearchType.TEMPORAL, query_text="Events between 2001 and 2004", top_k=10 - ) - - assert result != [] - - # Scoped descriptions - result = await cognee.search( - query_type=SearchType.TEMPORAL, - query_text="Key project milestones between 1998 and 2010", - top_k=10, - ) - - assert result != [] - - result = await cognee.search( - query_type=SearchType.TEMPORAL, - query_text="What happened after 2004?", - datasets=["timeline_demo"], - top_k=10, - ) - - assert result != [] - - -if __name__ == "__main__": - asyncio.run(main()) From 41edeb0cf890e0d0b733bcd4befb03b870e70cbc Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 11 Dec 2025 16:01:26 +0100 Subject: [PATCH 31/37] test: change target repo name --- .github/workflows/release_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index c6dd68484..3fef0732a 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -27,5 +27,5 @@ jobs: curl -sS -X POST \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer ${{ secrets.DOCS_REPO_PAT_TOKEN }}" \ - https://api.github.com/repos/your-org/repo-b/dispatches \ + https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From 
0f4cf15d588e5dfa672d680e5258de284d308367 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 11 Dec 2025 16:24:47 +0100 Subject: [PATCH 32/37] test: fix docs test trigger --- .github/workflows/release_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 3fef0732a..76ce3b09d 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -24,8 +24,9 @@ jobs: steps: - name: Trigger docs tests run: | - curl -sS -X POST \ + curl -L -X POST \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer ${{ secrets.DOCS_REPO_PAT_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From e92d8f57b56823e0a1a4bf5ccf6734cdda01d56f Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:14:14 +0100 Subject: [PATCH 33/37] feat: add comunity test trigger --- .github/workflows/release_test.yml | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 76ce3b09d..be57c7fbf 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -19,14 +19,28 @@ jobs: # uses: ./.github/workflows/load_tests.yml # secrets: inherit - docs-tests: +# docs-tests: +# runs-on: ubuntu-22.04 +# steps: +# - name: Trigger docs tests +# run: | +# curl -L -X POST \ +# -H "Accept: application/vnd.github+json" \ +# -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ +# -H "X-GitHub-Api-Version: 2022-11-28" \ +# https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ +# -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' + + trigger-community-test-suite: + needs: 
release-pypi-package + if: ${{ inputs.flavour == 'main' }} runs-on: ubuntu-22.04 steps: - - name: Trigger docs tests + - name: Trigger community tests run: | curl -L -X POST \ -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.DOCS_REPO_PAT_TOKEN }}" \ + -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ + https://api.github.com/repos/topoteretes/cognee-community/dispatches \ -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From 601f74db4fda3c1bc3603d03bfbe22be7c8d6a24 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:15:43 +0100 Subject: [PATCH 34/37] test: remove dependency from community trigger --- .github/workflows/release_test.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index be57c7fbf..dcb709ead 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -14,6 +14,18 @@ on: - main jobs: + trigger-community-test-suite: + if: ${{ inputs.flavour == 'main' }} + runs-on: ubuntu-22.04 + steps: + - name: Trigger community tests + run: | + curl -L -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/topoteretes/cognee-community/dispatches \ + -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' # load-tests: # name: Load Tests # uses: ./.github/workflows/load_tests.yml @@ -30,17 +42,3 @@ jobs: # -H "X-GitHub-Api-Version: 2022-11-28" \ # https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ # -d 
'{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' - - trigger-community-test-suite: - needs: release-pypi-package - if: ${{ inputs.flavour == 'main' }} - runs-on: ubuntu-22.04 - steps: - - name: Trigger community tests - run: | - curl -L -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/topoteretes/cognee-community/dispatches \ - -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From a5a7ae2564abd90c0bf9b51b3abfc2a24a067a8f Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:16:46 +0100 Subject: [PATCH 35/37] test: remove if --- .github/workflows/release_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index dcb709ead..08595a01e 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -15,7 +15,6 @@ on: jobs: trigger-community-test-suite: - if: ${{ inputs.flavour == 'main' }} runs-on: ubuntu-22.04 steps: - name: Trigger community tests From 6958b4edd462615e2e973d7cabd369181c030eba Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:50:03 +0100 Subject: [PATCH 36/37] feat: add the triggers to release, after pypi publishing --- .github/workflows/release.yml | 28 ++++++++++++++++++++++++++++ .github/workflows/release_test.yml | 30 ++++-------------------------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 84601edf7..26ccce1f0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -136,3 +136,31 @@ jobs: flavour=${{ inputs.flavour }} cache-from: type=registry,ref=cognee/cognee:buildcache cache-to: type=registry,ref=cognee/cognee:buildcache,mode=max + + 
trigger-docs-test-suite: + needs: release-pypi-package + if: ${{ inputs.flavour == 'main' }} + runs-on: ubuntu-22.04 + steps: + - name: Trigger docs tests + run: | + curl -L -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ + -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' + + trigger-community-test-suite: + needs: release-pypi-package + if: ${{ inputs.flavour == 'main' }} + runs-on: ubuntu-22.04 + steps: + - name: Trigger community tests + run: | + curl -L -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/topoteretes/cognee-community/dispatches \ + -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' \ No newline at end of file diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 08595a01e..6090a1217 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -14,30 +14,8 @@ on: - main jobs: - trigger-community-test-suite: - runs-on: ubuntu-22.04 - steps: - - name: Trigger community tests - run: | - curl -L -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/topoteretes/cognee-community/dispatches \ - -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' -# load-tests: -# name: Load Tests -# uses: ./.github/workflows/load_tests.yml -# secrets: inherit + load-tests: + name: Load Tests + uses: ./.github/workflows/load_tests.yml + secrets: inherit -# docs-tests: -# runs-on: ubuntu-22.04 -# 
steps: -# - name: Trigger docs tests -# run: | -# curl -L -X POST \ -# -H "Accept: application/vnd.github+json" \ -# -H "Authorization: Bearer ${{ secrets.REPO_DISPATCH_PAT_TOKEN }}" \ -# -H "X-GitHub-Api-Version: 2022-11-28" \ -# https://api.github.com/repos/topoteretes/cognee-docs/dispatches \ -# -d '{"event_type":"new-main-release","client_payload":{"caller_repo":"'"${GITHUB_REPOSITORY}"'"}}' From 431a83247fff487357a253cbddb00e779e8bda9b Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Dec 2025 13:50:43 +0100 Subject: [PATCH 37/37] chore: remove unnecessary 'on push' setting --- .github/workflows/release_test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 6090a1217..b31b431a4 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -5,9 +5,6 @@ permissions: contents: read on: - push: - branches: - - feature/cog-3213-docs-set-up-guide-script-tests workflow_dispatch: pull_request: branches: