This commit is contained in:
Chinmay Bhosale 2026-01-20 16:27:41 +00:00 committed by GitHub
commit cf40657a9c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 372 additions and 301 deletions

View file

@ -216,6 +216,11 @@ TOKENIZERS_PARALLELISM="false"
# LITELLM Logging Level. Set to quiet down logging # LITELLM Logging Level. Set to quiet down logging
LITELLM_LOG="ERROR" LITELLM_LOG="ERROR"
# Enable or disable the last accessed timestamp tracking and cleanup functionality.
ENABLE_LAST_ACCESSED="false"
# Set this environment variable to disable sending telemetry data # Set this environment variable to disable sending telemetry data
# TELEMETRY_DISABLED=1 # TELEMETRY_DISABLED=1

View file

@ -1,5 +1,4 @@
from typing import Any, Optional from typing import Any, Optional
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.base_retriever import BaseRetriever
@ -51,7 +50,6 @@ class ChunksRetriever(BaseRetriever):
"DocumentChunk_text", query, limit=self.top_k, include_payload=True "DocumentChunk_text", query, limit=self.top_k, include_payload=True
) )
logger.info(f"Found {len(found_chunks)} chunks from vector search") logger.info(f"Found {len(found_chunks)} chunks from vector search")
await update_node_access_timestamps(found_chunks)
except CollectionNotFoundError as error: except CollectionNotFoundError as error:
logger.error("DocumentChunk_text collection not found in vector database") logger.error("DocumentChunk_text collection not found in vector database")

View file

@ -8,7 +8,6 @@ from cognee.modules.retrieval.utils.session_cache import (
save_conversation_history, save_conversation_history,
get_conversation_history, get_conversation_history,
) )
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.base_retriever import BaseRetriever
from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.modules.retrieval.exceptions.exceptions import NoDataError
from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
@ -68,7 +67,6 @@ class CompletionRetriever(BaseRetriever):
if len(found_chunks) == 0: if len(found_chunks) == 0:
return "" return ""
await update_node_access_timestamps(found_chunks)
# Combine all chunks text returned from vector search (number of chunks is determined by top_k # Combine all chunks text returned from vector search (number of chunks is determined by top_k
chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks] chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks]
combined_context = "\n".join(chunks_payload) combined_context = "\n".join(chunks_payload)

View file

@ -16,7 +16,6 @@ from cognee.modules.retrieval.utils.session_cache import (
) )
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from cognee.modules.retrieval.utils.extract_uuid_from_node import extract_uuid_from_node from cognee.modules.retrieval.utils.extract_uuid_from_node import extract_uuid_from_node
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
from cognee.modules.retrieval.utils.models import CogneeUserInteraction from cognee.modules.retrieval.utils.models import CogneeUserInteraction
from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.engine.models.node_set import NodeSet
from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.graph import get_graph_engine
@ -149,7 +148,6 @@ class GraphCompletionRetriever(BaseGraphRetriever):
entity_nodes = get_entity_nodes_from_triplets(triplets) entity_nodes = get_entity_nodes_from_triplets(triplets)
await update_node_access_timestamps(entity_nodes)
return triplets return triplets
async def convert_retrieved_objects_to_context(self, triplets: List[Edge]): async def convert_retrieved_objects_to_context(self, triplets: List[Edge]):

View file

@ -4,7 +4,6 @@ from cognee.shared.logging_utils import get_logger
from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.base_retriever import BaseRetriever
from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.modules.retrieval.exceptions.exceptions import NoDataError
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError
logger = get_logger("SummariesRetriever") logger = get_logger("SummariesRetriever")
@ -56,8 +55,6 @@ class SummariesRetriever(BaseRetriever):
) )
logger.info(f"Found {len(summaries_results)} summaries from vector search") logger.info(f"Found {len(summaries_results)} summaries from vector search")
await update_node_access_timestamps(summaries_results)
except CollectionNotFoundError as error: except CollectionNotFoundError as error:
logger.error("TextSummary_text collection not found in vector database") logger.error("TextSummary_text collection not found in vector database")
raise NoDataError("No data found in the system, please add data first.") from error raise NoDataError("No data found in the system, please add data first.") from error

View file

@ -1,88 +1,87 @@
"""Utilities for tracking data access in retrievers.""" """Utilities for tracking data access in retrievers."""
import json import json
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import List, Any from typing import List, Any
from uuid import UUID from uuid import UUID
import os import os
from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data from cognee.modules.data.models import Data
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from sqlalchemy import update from sqlalchemy import update
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
logger = get_logger(__name__) logger = get_logger(__name__)
async def update_node_access_timestamps(items: List[Any]):
    """Stamp the documents behind the retrieved ``items`` as accessed "now".

    The call is a no-op unless the ``ENABLE_LAST_ACCESSED`` environment variable
    equals ``"true"`` (case-insensitive), ``items`` is non-empty, and at least
    one node id can be extracted from ``items``.

    Args:
        items: Retrieved objects. Supported shapes are objects with an ``id``
            attribute, objects with a ``payload`` mapping holding ``"id"``,
            plain dicts holding ``"id"``, and edge-like objects exposing
            ``get_source_node()`` / ``get_destination_node()``.

    Raises:
        Exception: Whatever the graph projection or the SQL update raises; the
            error is logged first and then re-raised unchanged.
    """
    if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true":
        return

    if not items:
        return

    node_ids = _extract_node_ids(items)
    if not node_ids:
        return

    # Acquire the graph engine only once we know there is something to look up.
    graph_engine = await get_graph_engine()
    timestamp_dt = datetime.now(timezone.utc)

    # Focus on document-level tracking via projection
    try:
        doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids)
        if doc_ids:
            await _update_sql_records(doc_ids, timestamp_dt)
    except Exception as e:
        logger.error(f"Failed to update SQL timestamps: {e}")
        raise


def _raw_ids_of(item):
    """Return the raw node id(s) a single retrieved item refers to (may be empty)."""
    # Graph nodes / scored results expose the id directly.
    direct_id = getattr(item, "id", None)
    if direct_id is not None:
        return [direct_id]

    # Original retriever format: the id lives inside the payload mapping.
    payload = getattr(item, "payload", None)
    if payload is not None and payload.get("id"):
        return [payload.get("id")]

    if isinstance(item, dict) and item.get("id"):
        return [item.get("id")]

    # Edge-like items carry no id of their own; use both endpoints instead.
    # NOTE(review): assumes search contexts are CogneeGraph Edge objects, whose
    # endpoint accessors are the ones used in the projection helper - confirm.
    source_getter = getattr(item, "get_source_node", None)
    destination_getter = getattr(item, "get_destination_node", None)
    if callable(source_getter) and callable(destination_getter):
        endpoints = (source_getter(), destination_getter())
        return [
            endpoint.id for endpoint in endpoints if getattr(endpoint, "id", None) is not None
        ]

    return []


def _extract_node_ids(items):
    """Collect the distinct node ids (as strings, first-seen order) found in ``items``."""
    ordered_unique = dict.fromkeys(str(raw_id) for item in items for raw_id in _raw_ids_of(item))
    return list(ordered_unique)


async def _find_origin_documents_via_projection(graph_engine, node_ids):
    """Find origin documents using graph projection instead of DB queries.

    Only nodes whose projected ``type`` is ``"DocumentChunk"`` are followed;
    every direct neighbour of type ``"TextDocument"`` or ``"Document"`` counts
    as an origin document.

    Returns:
        A list of the distinct ids of those neighbouring document nodes.
    """
    # Project the entire graph with necessary properties
    # NOTE(review): this projects the whole graph on every call; likely costly
    # on large graphs - consider a targeted neighbourhood query.
    memory_fragment = CogneeGraph()
    await memory_fragment.project_graph_from_db(
        graph_engine,
        node_properties_to_project=["id", "type"],
        edge_properties_to_project=["relationship_name"],
    )

    # Find origin documents by traversing the in-memory graph
    doc_ids = set()
    for node_id in node_ids:
        node = memory_fragment.get_node(node_id)
        if not node or node.get_attribute("type") != "DocumentChunk":
            continue

        # Traverse edges to find connected documents
        for edge in node.get_skeleton_edges():
            # The neighbour is whichever endpoint is not the chunk itself.
            source_node = edge.get_source_node()
            neighbor = edge.get_destination_node() if source_node.id == node_id else source_node
            if neighbor and neighbor.get_attribute("type") in ("TextDocument", "Document"):
                doc_ids.add(neighbor.id)

    return list(doc_ids)


async def _update_sql_records(doc_ids, timestamp_dt):
    """Update SQL Data table (same for all providers).

    Sets ``last_accessed`` to ``timestamp_dt`` on every ``Data`` row whose id is
    in ``doc_ids``. Ids may be ``UUID`` instances or their string form.
    """
    if not doc_ids:
        return

    # UUID() rejects UUID instances, so only parse ids that are not UUIDs yet.
    record_ids = [
        doc_id if isinstance(doc_id, UUID) else UUID(str(doc_id)) for doc_id in doc_ids
    ]

    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        stmt = update(Data).where(Data.id.in_(record_ids)).values(last_accessed=timestamp_dt)

        await session.execute(stmt)
        await session.commit()

View file

@ -13,8 +13,9 @@ from cognee.context_global_variables import backend_access_control_enabled
from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.engine.models.node_set import NodeSet
from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
from cognee.modules.search.types import ( from cognee.modules.search.types import (
SearchResult, SearchResultDataset,
SearchType, SearchResult,
SearchType,
) )
from cognee.modules.search.operations import log_query, log_result from cognee.modules.search.operations import log_query, log_result
from cognee.modules.users.models import User from cognee.modules.users.models import User
@ -26,6 +27,7 @@ from cognee import __version__ as cognee_version
from .get_search_type_tools import get_search_type_tools from .get_search_type_tools import get_search_type_tools
from .no_access_control_search import no_access_control_search from .no_access_control_search import no_access_control_search
from ..utils.prepare_search_result import prepare_search_result from ..utils.prepare_search_result import prepare_search_result
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
logger = get_logger() logger = get_logger()
@ -43,10 +45,11 @@ async def search(
save_interaction: bool = False, save_interaction: bool = False,
last_k: Optional[int] = None, last_k: Optional[int] = None,
only_context: bool = False, only_context: bool = False,
use_combined_context: bool = False,
session_id: Optional[str] = None, session_id: Optional[str] = None,
wide_search_top_k: Optional[int] = 100, wide_search_top_k: Optional[int] = 100,
triplet_distance_penalty: Optional[float] = 3.5, triplet_distance_penalty: Optional[float] = 3.5,
verbose=False, verbose: bool = False,
) -> List[SearchResult]: ) -> List[SearchResult]:
""" """
@ -73,9 +76,11 @@ async def search(
}, },
) )
actual_accessed_items = [] # Collect all accessed items here
# Use search function filtered by permissions if access control is enabled # Use search function filtered by permissions if access control is enabled
if backend_access_control_enabled(): if backend_access_control_enabled():
search_results = await authorized_search( raw_search_results = await authorized_search(
query_type=query_type, query_type=query_type,
query_text=query_text, query_text=query_text,
user=user, user=user,
@ -92,8 +97,19 @@ async def search(
wide_search_top_k=wide_search_top_k, wide_search_top_k=wide_search_top_k,
triplet_distance_penalty=triplet_distance_penalty, triplet_distance_penalty=triplet_distance_penalty,
) )
if use_combined_context:
# raw_search_results is (completion, context, datasets)
_, context_data, _ = raw_search_results
if isinstance(context_data, list): # Expecting a list of Edge or similar
actual_accessed_items.extend(context_data)
# If context_data is a string, it's already textual and might not map to specific nodes for timestamp updates
else:
for result_tuple in raw_search_results:
_, context_data, _ = result_tuple
if isinstance(context_data, list): # Expecting a list of Edge or similar
actual_accessed_items.extend(context_data)
else: else:
search_results = [ raw_search_results = [
await no_access_control_search( await no_access_control_search(
query_type=query_type, query_type=query_type,
query_text=query_text, query_text=query_text,
@ -110,6 +126,15 @@ async def search(
triplet_distance_penalty=triplet_distance_penalty, triplet_distance_penalty=triplet_distance_penalty,
) )
] ]
# In this case, raw_search_results is a list containing a single tuple
if raw_search_results:
_, context_data, _ = raw_search_results[0]
if isinstance(context_data, list): # Expecting a list of Edge or similar
actual_accessed_items.extend(context_data)
# Call the update_node_access_timestamps function here
# Pass the collected actual_accessed_items
await update_node_access_timestamps(actual_accessed_items)
send_telemetry( send_telemetry(
"cognee.search EXECUTION COMPLETED", "cognee.search EXECUTION COMPLETED",
@ -120,6 +145,8 @@ async def search(
}, },
) )
search_results = raw_search_results
await log_result( await log_result(
query.id, query.id,
json.dumps( json.dumps(
@ -130,48 +157,65 @@ async def search(
user.id, user.id,
) )
# This is for maintaining backwards compatibility if use_combined_context:
if backend_access_control_enabled(): # Note: combined context search must always be verbose and return a CombinedSearchResult with graphs info
return_value = [] prepared_search_results = await prepare_search_result(
for search_result in search_results: search_results[0] if isinstance(search_results, list) else search_results
prepared_search_results = await prepare_search_result(search_result) )
result = prepared_search_results["result"]
graphs = prepared_search_results["graphs"]
context = prepared_search_results["context"]
datasets = prepared_search_results["datasets"]
result = prepared_search_results["result"] return CombinedSearchResult(
graphs = prepared_search_results["graphs"] result=result,
context = prepared_search_results["context"] graphs=graphs,
datasets = prepared_search_results["datasets"] context=context,
datasets=[
if only_context: SearchResultDataset(
search_result_dict = { id=dataset.id,
"search_result": [context] if context else None, name=dataset.name,
"dataset_id": datasets[0].id, )
"dataset_name": datasets[0].name, for dataset in datasets
"dataset_tenant_id": datasets[0].tenant_id, ],
} )
if verbose:
# Include graphs only in verbose mode
search_result_dict["graphs"] = graphs
return_value.append(search_result_dict)
else:
search_result_dict = {
"search_result": [result] if result else None,
"dataset_id": datasets[0].id,
"dataset_name": datasets[0].name,
"dataset_tenant_id": datasets[0].tenant_id,
}
if verbose:
# Include graphs only in verbose mode
search_result_dict["graphs"] = graphs
return_value.append(search_result_dict)
return return_value
else: else:
return_value = [] return_value = []
if only_context: if only_context:
for search_result in search_results: for search_result in search_results:
prepared_search_results = await prepare_search_result(search_result) prepared_search_results = await prepare_search_result(search_result)
return_value.append(prepared_search_results["context"])
result = prepared_search_results["result"]
graphs = prepared_search_results["graphs"]
context = prepared_search_results["context"]
datasets = prepared_search_results["datasets"]
if only_context:
search_result_dict = {
"search_result": [context] if context else None,
"dataset_id": datasets[0].id,
"dataset_name": datasets[0].name,
"dataset_tenant_id": datasets[0].tenant_id,
}
if verbose:
# Include graphs only in verbose mode
search_result_dict["graphs"] = graphs
return_value.append(search_result_dict)
else:
search_result_dict = {
"search_result": [result] if result else None,
"dataset_id": datasets[0].id,
"dataset_name": datasets[0].name,
"dataset_tenant_id": datasets[0].tenant_id,
}
if verbose:
# Include graphs only in verbose mode
search_result_dict["graphs"] = graphs
return_value.append(search_result_dict)
return return_value
else: else:
for search_result in search_results: for search_result in search_results:
result, context, datasets = search_result result, context, datasets = search_result

View file

@ -1,165 +1,197 @@
import os import os
import pathlib import pathlib
import cognee import cognee
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
from uuid import UUID from uuid import UUID
from sqlalchemy import select, update from sqlalchemy import select, update
from cognee.modules.data.models import Data, DatasetData from cognee.modules.data.models import Data, DatasetData
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.users.methods import get_default_user from cognee.modules.users.methods import get_default_user
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from cognee.modules.search.types import SearchType from cognee.modules.search.types import SearchType
logger = get_logger() logger = get_logger()
async def test_all_search_types_cleanup():
    """
    End-to-end test for TextDocument cleanup based on last_accessed timestamps
    across all search types.

    Flow: add and cognify one document, run every listed search type and check
    that each one sets ``Data.last_accessed``, age the timestamps, then run
    ``cleanup_unused_data`` (dry run, then for real) and verify the records are
    gone. A search that raises is logged and skipped; a failed check is not.

    Returns:
        True when every step passed (failures surface as AssertionError).
    """
    # Enable last accessed tracking BEFORE any cognee operations
    os.environ["ENABLE_LAST_ACCESSED"] = "true"

    # Setup test directories
    data_directory_path = str(
        pathlib.Path(
            os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup")
        ).resolve()
    )
    cognee_directory_path = str(
        pathlib.Path(
            os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup")
        ).resolve()
    )

    cognee.config.data_root_directory(data_directory_path)
    cognee.config.system_root_directory(cognee_directory_path)

    # Initialize database
    from cognee.modules.engine.operations.setup import setup

    # Clean slate
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    logger.info("🧪 Testing TextDocument cleanup for all search types")

    # Step 1: Add and cognify a test document
    dataset_name = "test_cleanup_dataset"
    test_text = """
    Machine learning is a subset of artificial intelligence that enables systems to learn
    and improve from experience without being explicitly programmed. Deep learning uses
    neural networks with multiple layers to process data.
    """

    await setup()
    user = await get_default_user()
    await cognee.add([test_text], dataset_name=dataset_name, user=user)

    cognify_result = await cognee.cognify([dataset_name], user=user)

    # The cognify result is keyed by dataset id; the first key is the one we need.
    dataset_id = next(iter(cognify_result), None)

    assert dataset_id is not None, "Failed to get dataset_id from cognify result"
    logger.info(f"✅ Document added and cognified. Dataset ID: {dataset_id}")

    # All available search types to test (excluding CODE)
    search_types_to_test = [
        SearchType.CHUNKS,
        SearchType.SUMMARIES,
        SearchType.RAG_COMPLETION,
        SearchType.GRAPH_COMPLETION,
        SearchType.GRAPH_SUMMARY_COMPLETION,
        SearchType.GRAPH_COMPLETION_COT,
        SearchType.GRAPH_COMPLETION_CONTEXT_EXTENSION,
        SearchType.FEELING_LUCKY,
        SearchType.CHUNKS_LEXICAL,
    ]

    # Skip search types that require special data or permissions
    skip_types = {
        SearchType.CYPHER,  # Requires ALLOW_CYPHER_QUERY=true
        SearchType.NATURAL_LANGUAGE,  # Requires ALLOW_CYPHER_QUERY=true
        SearchType.FEEDBACK,  # Requires previous search interaction
        SearchType.TEMPORAL,  # Requires temporal data
        SearchType.CODING_RULES,  # Requires coding rules data
    }

    db_engine = get_relational_engine()
    # Ids of every Data row that belongs to the dataset under test.
    dataset_data_ids = select(DatasetData.data_id).where(DatasetData.dataset_id == dataset_id)
    tested_data_ids = []

    # Step 2: Test each search type
    for search_type in search_types_to_test:
        if search_type in skip_types:
            logger.info(f"⏭️ Skipping {search_type.value} (requires special setup)")
            continue

        logger.info(f"🔍 Testing {search_type.value} search...")

        # Clear the timestamp first; otherwise the value written by an earlier
        # search type would satisfy the check for this one.
        # NOTE(review): assumes Data.last_accessed is nullable - confirm.
        async with db_engine.get_async_session() as session:
            await session.execute(
                update(Data).where(Data.id.in_(dataset_data_ids)).values(last_accessed=None)
            )
            await session.commit()

        # Only the search itself is best-effort. The checks below stay outside
        # the try so a failed assertion is never downgraded to a warning.
        try:
            # Perform search to trigger last_accessed update
            search_results = await cognee.search(
                query_type=search_type,
                query_text="machine learning",
                datasets=[dataset_name],
                user=user,
            )
        except Exception as e:
            logger.warning(f"⚠️ {search_type.value} search failed: {str(e)}")
            continue

        logger.info(f"{search_type.value} search completed, found {len(search_results)} results")

        # Verify last_accessed was set
        async with db_engine.get_async_session() as session:
            result = await session.execute(
                select(Data, DatasetData)
                .join(DatasetData, Data.id == DatasetData.data_id)
                .where(DatasetData.dataset_id == dataset_id)
            )
            data_records = result.all()

        assert len(data_records) > 0, "No Data records found for the dataset"
        data_record = data_records[0][0]
        data_id = data_record.id

        # Verify last_accessed is set
        assert data_record.last_accessed is not None, (
            f"last_accessed should be set after {search_type.value} search operation"
        )

        original_last_accessed = data_record.last_accessed
        logger.info(f"{search_type.value} last_accessed verified: {original_last_accessed}")

        if data_id not in tested_data_ids:
            tested_data_ids.append(data_id)

    # Without this guard the deletion check at the end would loop over nothing.
    assert tested_data_ids, "No search type completed, so last_accessed was never verified"

    # Step 3: Test cleanup with aged timestamps
    from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data

    minutes_threshold = 30
    aged_timestamp = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold + 10)

    # Age all tested data records
    for data_id in tested_data_ids:
        async with db_engine.get_async_session() as session:
            stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp)
            await session.execute(stmt)
            await session.commit()

    # First do a dry run
    logger.info("Testing dry run...")
    dry_run_result = await cleanup_unused_data(minutes_threshold=10, dry_run=True, user_id=user.id)
    logger.info(f"Dry run result: {dry_run_result}")

    assert dry_run_result["status"] == "dry_run", (
        f"Status should be 'dry_run', got: {dry_run_result['status']}"
    )
    assert dry_run_result["unused_count"] > 0, "Should find at least one unused document"
    logger.info(f"✅ Dry run found {dry_run_result['unused_count']} unused documents")

    # Now run actual cleanup
    logger.info("Executing cleanup...")
    cleanup_result = await cleanup_unused_data(minutes_threshold=30, dry_run=False, user_id=user.id)

    assert cleanup_result["status"] == "completed", "Cleanup should complete successfully"
    assert cleanup_result["deleted_count"]["documents"] > 0, (
        "At least one document should be deleted"
    )
    logger.info(
        f"✅ Cleanup completed. Deleted {cleanup_result['deleted_count']['documents']} documents"
    )

    # Step 4: Verify deletion
    for data_id in tested_data_ids:
        async with db_engine.get_async_session() as session:
            deleted_data = (
                await session.execute(select(Data).where(Data.id == data_id))
            ).scalar_one_or_none()
            assert deleted_data is None, f"Data record {data_id} should be deleted"

    logger.info("✅ Confirmed: All tested data records were deleted")
    logger.info("🎉 All cleanup tests passed for all search types!")
    return True
if __name__ == "__main__":
    import asyncio

    # Mirror the test outcome in the process exit status. SystemExit is raised
    # directly because the bare `exit` builtin is only injected by the `site`
    # module and is meant for interactive use.
    success = asyncio.run(test_all_search_types_cleanup())
    raise SystemExit(0 if success else 1)