fix: fallback and document deletion
This commit is contained in:
parent
6a4d31356b
commit
5f00abf3e4
2 changed files with 68 additions and 46 deletions
|
|
@ -36,16 +36,22 @@ async def update_node_access_timestamps(items: List[Any]):
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Update nodes using graph projection ( database-agnostic approach
|
# Try to update nodes in graph database (may fail for unsupported DBs)
|
||||||
await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms)
|
await _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Failed to update node timestamps in graph database: {e}. "
|
||||||
|
"Will update document-level timestamps in SQL instead."
|
||||||
|
)
|
||||||
|
|
||||||
# Find origin documents and update SQL
|
# Always try to find origin documents and update SQL
|
||||||
|
# This ensures document-level tracking works even if graph updates fail
|
||||||
|
try:
|
||||||
doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids)
|
doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids)
|
||||||
if doc_ids:
|
if doc_ids:
|
||||||
await _update_sql_records(doc_ids, timestamp_dt)
|
await _update_sql_records(doc_ids, timestamp_dt)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to update timestamps: {e}")
|
logger.error(f"Failed to update SQL timestamps: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms):
|
async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms):
|
||||||
|
|
@ -59,37 +65,42 @@ async def _update_nodes_via_projection(graph_engine, node_ids, timestamp_ms):
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update each node's last_accessed_at property
|
# Update each node's last_accessed_at property
|
||||||
|
provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower()
|
||||||
|
|
||||||
for node_id in node_ids:
|
for node_id in node_ids:
|
||||||
node = memory_fragment.get_node(node_id)
|
node = memory_fragment.get_node(node_id)
|
||||||
if node:
|
if node:
|
||||||
# Update the node in the database
|
try:
|
||||||
provider = os.getenv("GRAPH_DATABASE_PROVIDER", "kuzu").lower()
|
# Update the node in the database
|
||||||
|
if provider == "kuzu":
|
||||||
if provider == "kuzu":
|
# Kuzu stores properties as JSON
|
||||||
# Kuzu stores properties as JSON
|
result = await graph_engine.query(
|
||||||
result = await graph_engine.query(
|
"MATCH (n:Node {id: $id}) RETURN n.properties",
|
||||||
"MATCH (n:Node {id: $id}) RETURN n.properties",
|
{"id": node_id}
|
||||||
{"id": node_id}
|
|
||||||
)
|
|
||||||
|
|
||||||
if result and result[0]:
|
|
||||||
props = json.loads(result[0][0]) if result[0][0] else {}
|
|
||||||
props["last_accessed_at"] = timestamp_ms
|
|
||||||
|
|
||||||
await graph_engine.query(
|
|
||||||
"MATCH (n:Node {id: $id}) SET n.properties = $props",
|
|
||||||
{"id": node_id, "props": json.dumps(props)}
|
|
||||||
)
|
)
|
||||||
elif provider == "neo4j":
|
|
||||||
await graph_engine.query(
|
if result and result[0]:
|
||||||
"MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp",
|
props = json.loads(result[0][0]) if result[0][0] else {}
|
||||||
{"id": node_id, "timestamp": timestamp_ms}
|
props["last_accessed_at"] = timestamp_ms
|
||||||
)
|
|
||||||
elif provider == "neptune":
|
await graph_engine.query(
|
||||||
await graph_engine.query(
|
"MATCH (n:Node {id: $id}) SET n.properties = $props",
|
||||||
"MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp",
|
{"id": node_id, "props": json.dumps(props)}
|
||||||
{"id": node_id, "timestamp": timestamp_ms}
|
)
|
||||||
)
|
elif provider == "neo4j":
|
||||||
|
await graph_engine.query(
|
||||||
|
"MATCH (n:__Node__ {id: $id}) SET n.last_accessed_at = $timestamp",
|
||||||
|
{"id": node_id, "timestamp": timestamp_ms}
|
||||||
|
)
|
||||||
|
elif provider == "neptune":
|
||||||
|
await graph_engine.query(
|
||||||
|
"MATCH (n:Node {id: $id}) SET n.last_accessed_at = $timestamp",
|
||||||
|
{"id": node_id, "timestamp": timestamp_ms}
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
# Log but continue with other nodes
|
||||||
|
logger.debug(f"Failed to update node {node_id}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
async def _find_origin_documents_via_projection(graph_engine, node_ids):
|
async def _find_origin_documents_via_projection(graph_engine, node_ids):
|
||||||
"""Find origin documents using graph projection instead of DB queries"""
|
"""Find origin documents using graph projection instead of DB queries"""
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
"""
|
"""
|
||||||
Task for automatically deleting unused data from the memify pipeline.
|
Task for automatically deleting unused data from the memify pipeline.
|
||||||
|
|
||||||
This task identifies and removes data (chunks, entities, summaries)) that hasn't
|
This task identifies and removes entire documents that haven't
|
||||||
been accessed by retrievers for a specified period, helping maintain system
|
been accessed by retrievers for a specified period, helping maintain system
|
||||||
efficiency and storage optimization.
|
efficiency and storage optimization through whole-document removal.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
@ -28,22 +28,26 @@ async def cleanup_unused_data(
|
||||||
minutes_threshold: Optional[int],
|
minutes_threshold: Optional[int],
|
||||||
dry_run: bool = True,
|
dry_run: bool = True,
|
||||||
user_id: Optional[UUID] = None,
|
user_id: Optional[UUID] = None,
|
||||||
text_doc: bool = False
|
text_doc: bool = True, # Changed default to True for document-level cleanup
|
||||||
|
node_level: bool = False # New parameter for explicit node-level cleanup
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Identify and remove unused data from the memify pipeline.
|
Identify and remove unused data from the memify pipeline.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
minutes_threshold : int
|
minutes_threshold : int
|
||||||
days since last access to consider data unused
|
Minutes since last access to consider data unused
|
||||||
dry_run : bool
|
dry_run : bool
|
||||||
If True, only report what would be delete without actually deleting (default: True)
|
If True, only report what would be deleted without actually deleting (default: True)
|
||||||
user_id : UUID, optional
|
user_id : UUID, optional
|
||||||
Limit cleanup to specific user's data (default: None)
|
Limit cleanup to specific user's data (default: None)
|
||||||
text_doc : bool
|
text_doc : bool
|
||||||
If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete()
|
If True (default), use SQL-based filtering to find unused TextDocuments and call cognee.delete()
|
||||||
for proper whole-document deletion (default: False)
|
for proper whole-document deletion
|
||||||
|
node_level : bool
|
||||||
|
If True, perform chaotic node-level deletion of unused chunks, entities, and summaries
|
||||||
|
(default: False - deprecated in favor of document-level cleanup)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -91,17 +95,19 @@ async def cleanup_unused_data(
|
||||||
minutes_threshold=minutes_threshold,
|
minutes_threshold=minutes_threshold,
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
user_id=str(user_id) if user_id else None,
|
user_id=str(user_id) if user_id else None,
|
||||||
text_doc=text_doc
|
text_doc=text_doc,
|
||||||
|
node_level=node_level
|
||||||
)
|
)
|
||||||
|
|
||||||
# Calculate cutoff timestamp
|
# Calculate cutoff timestamp
|
||||||
cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold)
|
cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold)
|
||||||
|
|
||||||
if text_doc:
|
if node_level:
|
||||||
# SQL-based approach: Find unused TextDocuments and use cognee.delete()
|
# Deprecated: Node-level approach (chaotic)
|
||||||
return await _cleanup_via_sql(cutoff_date, dry_run, user_id)
|
logger.warning(
|
||||||
else:
|
"Node-level cleanup is deprecated and may lead to fragmented knowledge graphs. "
|
||||||
# Graph-based approach: Find unused nodes using projection (database-agnostic)
|
"Consider using document-level cleanup (default) instead."
|
||||||
|
)
|
||||||
cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000)
|
cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000)
|
||||||
logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)")
|
logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)")
|
||||||
|
|
||||||
|
|
@ -147,6 +153,9 @@ async def cleanup_unused_data(
|
||||||
},
|
},
|
||||||
"cleanup_date": datetime.now(timezone.utc).isoformat()
|
"cleanup_date": datetime.now(timezone.utc).isoformat()
|
||||||
}
|
}
|
||||||
|
else:
|
||||||
|
# Default: Document-level approach (recommended)
|
||||||
|
return await _cleanup_via_sql(cutoff_date, dry_run, user_id)
|
||||||
|
|
||||||
|
|
||||||
async def _cleanup_via_sql(
|
async def _cleanup_via_sql(
|
||||||
|
|
@ -243,6 +252,7 @@ async def _cleanup_via_sql(
|
||||||
async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]:
|
async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[str, list]:
|
||||||
"""
|
"""
|
||||||
Find unused nodes using graph projection - database-agnostic approach.
|
Find unused nodes using graph projection - database-agnostic approach.
|
||||||
|
NOTE: This function is deprecated as it leads to fragmented knowledge graphs.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -291,6 +301,7 @@ async def _find_unused_nodes_via_projection(cutoff_timestamp_ms: int) -> Dict[st
|
||||||
async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]:
|
async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]:
|
||||||
"""
|
"""
|
||||||
Delete unused nodes from graph and vector databases.
|
Delete unused nodes from graph and vector databases.
|
||||||
|
NOTE: This function is deprecated as it leads to fragmented knowledge graphs.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -325,7 +336,7 @@ async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]:
|
||||||
if not node_ids:
|
if not node_ids:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Count edges connected to these nodes
|
# Count edges from the in-memory graph
|
||||||
for node_id in node_ids:
|
for node_id in node_ids:
|
||||||
node = memory_fragment.get_node(node_id)
|
node = memory_fragment.get_node(node_id)
|
||||||
if node:
|
if node:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue