This commit is contained in:
hajdul88 2025-12-19 10:25:24 +01:00
parent 8a490b1c16
commit 4b71995a70
10 changed files with 486 additions and 504 deletions

View file

@ -1,52 +1,51 @@
"""add_last_accessed_to_data """add_last_accessed_to_data
Revision ID: e1ec1dcb50b6 Revision ID: e1ec1dcb50b6
Revises: 211ab850ef3d Revises: 211ab850ef3d
Create Date: 2025-11-04 21:45:52.642322 Create Date: 2025-11-04 21:45:52.642322
""" """
import os
from typing import Sequence, Union import os
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = 'e1ec1dcb50b6' # revision identifiers, used by Alembic.
down_revision: Union[str, None] = '211ab850ef3d' revision: str = "e1ec1dcb50b6"
branch_labels: Union[str, Sequence[str], None] = None down_revision: Union[str, None] = "211ab850ef3d"
depends_on: Union[str, Sequence[str], None] = None branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def _get_column(inspector, table, name, schema=None):
for col in inspector.get_columns(table, schema=schema): def _get_column(inspector, table, name, schema=None):
if col["name"] == name: for col in inspector.get_columns(table, schema=schema):
return col if col["name"] == name:
return None return col
return None
def upgrade() -> None:
    """Add a nullable ``last_accessed`` timestamp column to the ``data`` table.

    The column is created unconditionally (when missing) so the schema stays
    consistent; existing rows are back-filled with the current time only when
    the ENABLE_LAST_ACCESSED feature flag is turned on.
    """
    bind = op.get_bind()
    inspector = sa.inspect(bind)

    # Always create the column for schema consistency.
    if not _get_column(inspector, "data", "last_accessed"):
        op.add_column(
            "data",
            sa.Column("last_accessed", sa.DateTime(timezone=True), nullable=True),
        )

    # Only initialize existing records if the feature is enabled.
    if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() == "true":
        op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP")
def downgrade() -> None:
    """Drop the ``last_accessed`` column from ``data`` when it exists."""
    bind = op.get_bind()
    inspector = sa.inspect(bind)

    if _get_column(inspector, "data", "last_accessed"):
        op.drop_column("data", "last_accessed")

View file

@ -2,6 +2,7 @@ from cognee.infrastructure.engine import DataPoint
from cognee.modules.engine.models.EntityType import EntityType from cognee.modules.engine.models.EntityType import EntityType
from typing import Optional from typing import Optional
class Entity(DataPoint): class Entity(DataPoint):
name: str name: str
is_a: Optional[EntityType] = None is_a: Optional[EntityType] = None

View file

@ -1,13 +1,12 @@
def get_entity_nodes_from_triplets(triplets):
    """Collect the unique endpoint nodes of *triplets* as ``{"id": str}`` dicts.

    Each triplet may expose ``node1`` and ``node2`` attributes; every distinct
    node id is emitted exactly once, in first-seen order.
    """
    entity_nodes = []
    seen_ids = set()
    for triplet in triplets:
        for endpoint in ("node1", "node2"):
            node = getattr(triplet, endpoint, None)
            if node and node.id not in seen_ids:
                entity_nodes.append({"id": str(node.id)})
                seen_ids.add(node.id)
    return entity_nodes

View file

@ -5,7 +5,7 @@ from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.base_retriever import BaseRetriever
from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.modules.retrieval.exceptions.exceptions import NoDataError
from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError
from datetime import datetime, timezone from datetime import datetime, timezone
logger = get_logger("ChunksRetriever") logger = get_logger("ChunksRetriever")
@ -28,7 +28,7 @@ class ChunksRetriever(BaseRetriever):
): ):
self.top_k = top_k self.top_k = top_k
async def get_context(self, query: str) -> Any: async def get_context(self, query: str) -> Any:
""" """
Retrieves document chunks context based on the query. Retrieves document chunks context based on the query.
Searches for document chunks relevant to the specified query using a vector engine. Searches for document chunks relevant to the specified query using a vector engine.

View file

@ -148,8 +148,8 @@ class GraphCompletionRetriever(BaseGraphRetriever):
# context = await self.resolve_edges_to_text(triplets) # context = await self.resolve_edges_to_text(triplets)
entity_nodes = get_entity_nodes_from_triplets(triplets) entity_nodes = get_entity_nodes_from_triplets(triplets)
await update_node_access_timestamps(entity_nodes) await update_node_access_timestamps(entity_nodes)
return triplets return triplets
async def convert_retrieved_objects_to_context(self, triplets: List[Edge]): async def convert_retrieved_objects_to_context(self, triplets: List[Edge]):

View file

@ -55,9 +55,9 @@ class SummariesRetriever(BaseRetriever):
"TextSummary_text", query, limit=self.top_k "TextSummary_text", query, limit=self.top_k
) )
logger.info(f"Found {len(summaries_results)} summaries from vector search") logger.info(f"Found {len(summaries_results)} summaries from vector search")
await update_node_access_timestamps(summaries_results) await update_node_access_timestamps(summaries_results)
except CollectionNotFoundError as error: except CollectionNotFoundError as error:
logger.error("TextSummary_text collection not found in vector database") logger.error("TextSummary_text collection not found in vector database")
raise NoDataError("No data found in the system, please add data first.") from error raise NoDataError("No data found in the system, please add data first.") from error

View file

@ -1,82 +1,88 @@
"""Utilities for tracking data access in retrievers.""" """Utilities for tracking data access in retrievers."""
import json import json
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import List, Any from typing import List, Any
from uuid import UUID from uuid import UUID
import os import os
from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data from cognee.modules.data.models import Data
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from sqlalchemy import update from sqlalchemy import update
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
logger = get_logger(__name__) logger = get_logger(__name__)
async def update_node_access_timestamps(items: List[Any]):
    """Record "last accessed" timestamps for the documents behind *items*.

    No-op unless the ENABLE_LAST_ACCESSED env flag is "true" and *items* is
    non-empty. Node ids are extracted from each item (vector-search hits carry
    a ``.payload`` dict; plain dicts are read directly), mapped to their origin
    documents via a graph projection, and the matching SQL ``Data`` rows get
    ``last_accessed`` set to the current UTC time. Failures are logged and
    re-raised.
    """
    if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true":
        return
    if not items:
        return

    graph_engine = await get_graph_engine()
    accessed_at = datetime.now(timezone.utc)

    # Items may be vector-search results (with .payload) or plain dicts.
    raw_ids = (
        item.payload.get("id") if hasattr(item, "payload") else item.get("id")
        for item in items
    )
    node_ids = [str(item_id) for item_id in raw_ids if item_id]

    if not node_ids:
        return

    # Document-level tracking: resolve chunks to documents, then touch SQL rows.
    try:
        doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids)
        if doc_ids:
            await _update_sql_records(doc_ids, accessed_at)
    except Exception as e:
        logger.error(f"Failed to update SQL timestamps: {e}")
        raise
async def _find_origin_documents_via_projection(graph_engine, node_ids):
    """Resolve DocumentChunk node ids to the ids of their origin documents.

    Projects the graph into memory once, then walks each chunk's skeleton
    edges looking for neighbouring TextDocument/Document nodes, instead of
    issuing per-node database queries.
    """
    memory_fragment = CogneeGraph()
    await memory_fragment.project_graph_from_db(
        graph_engine,
        node_properties_to_project=["id", "type"],
        edge_properties_to_project=["relationship_name"],
    )

    doc_ids = set()
    for node_id in node_ids:
        node = memory_fragment.get_node(node_id)
        # Only chunk nodes have an origin document to report.
        if not node or node.get_attribute("type") != "DocumentChunk":
            continue
        for edge in node.get_skeleton_edges():
            # The neighbour is whichever edge endpoint is not this node.
            source = edge.get_source_node()
            neighbor = edge.get_destination_node() if source.id == node_id else source
            if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]:
                doc_ids.add(neighbor.id)

    return list(doc_ids)
async def _update_sql_records(doc_ids, timestamp_dt):
    """Set ``last_accessed`` to *timestamp_dt* on every ``Data`` row in *doc_ids*.

    Works the same for all relational providers; ids are coerced to UUIDs.
    """
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        target_uuids = [UUID(doc_id) for doc_id in doc_ids]
        statement = (
            update(Data)
            .where(Data.id.in_(target_uuids))
            .values(last_accessed=timestamp_dt)
        )
        await session.execute(statement)
        await session.commit()

View file

@ -1,187 +1,172 @@
""" """
Task for automatically deleting unused data from the memify pipeline. Task for automatically deleting unused data from the memify pipeline.
This task identifies and removes entire documents that haven't This task identifies and removes entire documents that haven't
been accessed by retrievers for a specified period, helping maintain system been accessed by retrievers for a specified period, helping maintain system
efficiency and storage optimization through whole-document removal. efficiency and storage optimization through whole-document removal.
""" """
import json import json
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
from typing import Optional, Dict, Any from typing import Optional, Dict, Any
from uuid import UUID from uuid import UUID
import os import os
from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data, DatasetData from cognee.modules.data.models import Data, DatasetData
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from sqlalchemy import select, or_ from sqlalchemy import select, or_
import cognee import cognee
import sqlalchemy as sa import sqlalchemy as sa
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
logger = get_logger(__name__) logger = get_logger(__name__)
async def cleanup_unused_data(
    minutes_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None
) -> Dict[str, Any]:
    """
    Identify and remove unused data from the memify pipeline.

    Parameters
    ----------
    minutes_threshold : int
        Minutes since last access to consider data unused
    dry_run : bool
        If True, only report what would be deleted without actually deleting (default: True)
    user_id : UUID, optional
        Limit cleanup to specific user's data (default: None)

    Returns
    -------
    Dict[str, Any]
        Cleanup results with status, counts, and timestamp
    """

    def _skipped(reason: str) -> Dict[str, Any]:
        # Common shape for the early-exit "skipped" results.
        return {
            "status": "skipped",
            "reason": reason,
            "unused_count": 0,
            "deleted_count": {},
            "cleanup_date": datetime.now(timezone.utc).isoformat(),
        }

    # Check 1: the feature flag must be enabled, otherwise nothing was tracked.
    if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true":
        logger.warning("Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled.")
        return _skipped("ENABLE_LAST_ACCESSED not enabled")

    # Check 2: verify tracking has actually been running — at least one row
    # must carry a non-NULL last_accessed value.
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        tracked_count = await session.execute(
            select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None))
        )
        tracked_records = tracked_count.scalar()

    if tracked_records == 0:
        logger.warning(
            "Cleanup skipped: No records have been tracked yet. "
            "ENABLE_LAST_ACCESSED may have been recently enabled. "
            "Wait for retrievers to update timestamps before running cleanup."
        )
        return _skipped("No tracked records found - tracking may be newly enabled")

    logger.info(
        "Starting cleanup task",
        minutes_threshold=minutes_threshold,
        dry_run=dry_run,
        user_id=str(user_id) if user_id else None,
    )

    # Anything not accessed since this instant is considered unused.
    cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold)

    # Document-level approach (recommended).
    return await _cleanup_via_sql(cutoff_date, dry_run, user_id)
async def _cleanup_via_sql(
    cutoff_date: datetime, dry_run: bool, user_id: Optional[UUID] = None
) -> Dict[str, Any]:
    """
    SQL-based cleanup: Query Data table for unused documents and use cognee.delete().

    Parameters
    ----------
    cutoff_date : datetime
        Cutoff date for last_accessed filtering
    dry_run : bool
        If True, only report what would be deleted
    user_id : UUID, optional
        Filter by user ID if provided

    Returns
    -------
    Dict[str, Any]
        Cleanup results
    """
    db_engine = get_relational_engine()

    async with db_engine.get_async_session() as session:
        # Rows whose last_accessed is older than the cutoff, or never set.
        stale_filter = or_(Data.last_accessed < cutoff_date, Data.last_accessed.is_(None))
        query = (
            select(Data, DatasetData)
            .join(DatasetData, Data.id == DatasetData.data_id)
            .where(stale_filter)
        )

        if user_id:
            # Imported lazily to avoid a circular import at module load time.
            from cognee.modules.data.models import Dataset

            query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where(
                Dataset.owner_id == user_id
            )

        unused_data = (await session.execute(query)).all()

        logger.info(f"Found {len(unused_data)} unused documents in SQL")

        if dry_run:
            return {
                "status": "dry_run",
                "unused_count": len(unused_data),
                "deleted_count": {"data_items": 0, "documents": 0},
                "cleanup_date": datetime.now(timezone.utc).isoformat(),
                "preview": {"documents": len(unused_data)},
            }

        from cognee.modules.users.methods import get_default_user

        # Without an explicit user filter, deletions run as the default user.
        user = await get_default_user() if user_id is None else None

        # Delete each stale document individually so one failure cannot
        # abort the whole sweep.
        deleted_count = 0
        for data, dataset_data in unused_data:
            try:
                await cognee.delete(
                    data_id=data.id,
                    dataset_id=dataset_data.dataset_id,
                    mode="hard",  # Use hard mode to also remove orphaned entities
                    user=user,
                )
                deleted_count += 1
                logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}")
            except Exception as e:
                logger.error(f"Failed to delete document {data.id}: {e}")

        logger.info("Cleanup completed", deleted_count=deleted_count)

        return {
            "status": "completed",
            "unused_count": len(unused_data),
            "deleted_count": {"data_items": deleted_count, "documents": deleted_count},
            "cleanup_date": datetime.now(timezone.utc).isoformat(),
        }

View file

@ -1,4 +1,3 @@
from typing import Union from typing import Union
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
from cognee.modules.chunking.models import DocumentChunk from cognee.modules.chunking.models import DocumentChunk

View file

@ -1,172 +1,165 @@
import os import os
import pathlib import pathlib
import cognee import cognee
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
from uuid import UUID from uuid import UUID
from sqlalchemy import select, update from sqlalchemy import select, update
from cognee.modules.data.models import Data, DatasetData from cognee.modules.data.models import Data, DatasetData
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.users.methods import get_default_user from cognee.modules.users.methods import get_default_user
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from cognee.modules.search.types import SearchType from cognee.modules.search.types import SearchType
logger = get_logger() logger = get_logger()
async def test_textdocument_cleanup_with_sql():
    """
    End-to-end test for TextDocument cleanup based on last_accessed timestamps.

    Flow: add + cognify a document, search to stamp last_accessed, manually
    age the timestamp, then run cleanup (dry run, then for real) and verify
    the Data row is gone.
    """
    # Enable last accessed tracking BEFORE any cognee operations
    os.environ["ENABLE_LAST_ACCESSED"] = "true"

    # Setup test directories
    data_directory_path = str(
        pathlib.Path(
            os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_cleanup")
        ).resolve()
    )
    cognee_directory_path = str(
        pathlib.Path(
            os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_cleanup")
        ).resolve()
    )
    cognee.config.data_root_directory(data_directory_path)
    cognee.config.system_root_directory(cognee_directory_path)

    # Initialize database
    from cognee.modules.engine.operations.setup import setup

    # Clean slate
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    logger.info("🧪 Testing TextDocument cleanup based on last_accessed")

    # Step 1: Add and cognify a test document
    dataset_name = "test_cleanup_dataset"
    test_text = """
    Machine learning is a subset of artificial intelligence that enables systems to learn
    and improve from experience without being explicitly programmed. Deep learning uses
    neural networks with multiple layers to process data.
    """

    await setup()
    user = await get_default_user()

    await cognee.add([test_text], dataset_name=dataset_name, user=user)
    cognify_result = await cognee.cognify([dataset_name], user=user)

    # Extract dataset_id from cognify result (fix: take the first key directly
    # instead of a loop with an unused pipeline_result variable)
    dataset_id = next(iter(cognify_result), None)
    assert dataset_id is not None, "Failed to get dataset_id from cognify result"
    logger.info(f"✅ Document added and cognified. Dataset ID: {dataset_id}")

    # Step 2: Perform search to trigger last_accessed update
    logger.info("Triggering search to update last_accessed...")
    search_results = await cognee.search(
        query_type=SearchType.CHUNKS,
        query_text="machine learning",
        datasets=[dataset_name],
        user=user,
    )
    logger.info(f"✅ Search completed, found {len(search_results)} results")
    assert len(search_results) > 0, "Search should return results"

    # Step 3: Verify last_accessed was set and get data_id
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        result = await session.execute(
            select(Data, DatasetData)
            .join(DatasetData, Data.id == DatasetData.data_id)
            .where(DatasetData.dataset_id == dataset_id)
        )
        data_records = result.all()

        assert len(data_records) > 0, "No Data records found for the dataset"
        data_record = data_records[0][0]
        data_id = data_record.id

        # Verify last_accessed is set
        assert data_record.last_accessed is not None, (
            "last_accessed should be set after search operation"
        )
        original_last_accessed = data_record.last_accessed
        logger.info(f"✅ last_accessed verified: {original_last_accessed}")

    # Step 4: Manually age the timestamp past the cleanup threshold
    minutes_threshold = 30
    aged_timestamp = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold + 10)

    async with db_engine.get_async_session() as session:
        stmt = update(Data).where(Data.id == data_id).values(last_accessed=aged_timestamp)
        await session.execute(stmt)
        await session.commit()

    # Verify timestamp was updated (some backends return naive datetimes)
    async with db_engine.get_async_session() as session:
        result = await session.execute(select(Data).where(Data.id == data_id))
        updated_data = result.scalar_one_or_none()
        assert updated_data is not None, "Data record should exist"
        retrieved_timestamp = updated_data.last_accessed
        if retrieved_timestamp.tzinfo is None:
            retrieved_timestamp = retrieved_timestamp.replace(tzinfo=timezone.utc)
        # fix: dropped the pointless f-prefix on a message with no placeholders
        assert retrieved_timestamp == aged_timestamp, "Timestamp should be updated to aged value"

    # Step 5: Test cleanup (document-level is now the default)
    from cognee.tasks.cleanup.cleanup_unused_data import cleanup_unused_data

    # First do a dry run
    logger.info("Testing dry run...")
    dry_run_result = await cleanup_unused_data(minutes_threshold=10, dry_run=True, user_id=user.id)

    # Debug: Print the actual result
    logger.info(f"Dry run result: {dry_run_result}")

    assert dry_run_result["status"] == "dry_run", (
        f"Status should be 'dry_run', got: {dry_run_result['status']}"
    )
    assert dry_run_result["unused_count"] > 0, "Should find at least one unused document"
    logger.info(f"✅ Dry run found {dry_run_result['unused_count']} unused documents")

    # Now run actual cleanup
    logger.info("Executing cleanup...")
    cleanup_result = await cleanup_unused_data(minutes_threshold=30, dry_run=False, user_id=user.id)

    assert cleanup_result["status"] == "completed", "Cleanup should complete successfully"
    assert cleanup_result["deleted_count"]["documents"] > 0, (
        "At least one document should be deleted"
    )
    logger.info(
        f"✅ Cleanup completed. Deleted {cleanup_result['deleted_count']['documents']} documents"
    )

    # Step 6: Verify deletion
    async with db_engine.get_async_session() as session:
        deleted_data = (
            await session.execute(select(Data).where(Data.id == data_id))
        ).scalar_one_or_none()
        assert deleted_data is None, "Data record should be deleted"
    logger.info("✅ Confirmed: Data record was deleted")

    logger.info("🎉 All cleanup tests passed!")
    return True
if __name__ == "__main__":
    import asyncio

    success = asyncio.run(test_textdocument_cleanup_with_sql())
    # fix: raise SystemExit instead of calling exit() — the exit() builtin is
    # injected by the `site` module and is not guaranteed to exist (e.g. under
    # `python -S` or in frozen interpreters); SystemExit always works.
    raise SystemExit(0 if success else 1)