#!/usr/bin/env python3
"""
LLM Query Cache Cleanup Tool for LightRAG
This tool cleans up LLM query caches (mix:*, hybrid:*, local:*, global:*)
from KV storage implementations while preserving workspace isolation.
Usage:
python -m lightrag.tools.clean_llm_query_cache
# or
python lightrag/tools/clean_llm_query_cache.py
Supported KV Storage Types:
- JsonKVStorage
- RedisKVStorage
- PGKVStorage
- MongoKVStorage
"""
import asyncio
import contextlib
import os
import sys
import time
from dataclasses import dataclass, field
from typing import Any
from dotenv import load_dotenv
# Add project root to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from lightrag.kg import STORAGE_ENV_REQUIREMENTS
from lightrag.kg.shared_storage import set_all_update_flags
from lightrag.namespace import NameSpace
from lightrag.utils import setup_logger
# Load environment variables
load_dotenv(dotenv_path='.env', override=False)
# Setup logger
setup_logger('lightrag', level='INFO')
# Storage type configurations
STORAGE_TYPES = {
'1': 'JsonKVStorage',
'2': 'RedisKVStorage',
'3': 'PGKVStorage',
'4': 'MongoKVStorage',
}
# Workspace environment variable mapping
WORKSPACE_ENV_MAP = {
'PGKVStorage': 'POSTGRES_WORKSPACE',
'MongoKVStorage': 'MONGODB_WORKSPACE',
'RedisKVStorage': 'REDIS_WORKSPACE',
}
# Query cache modes
QUERY_MODES = ['mix', 'hybrid', 'local', 'global']
# Query cache types
CACHE_TYPES = ['query', 'keywords']
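# Cache keys are prefixed '<mode>:<cache_type>:' (e.g. 'mix:query:<id>');
# all of the counting and deletion below matches on this prefix.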
# Default batch size for deletion
DEFAULT_BATCH_SIZE = 1000
# ANSI color codes for terminal output
BOLD_CYAN = '\033[1;36m'
BOLD_RED = '\033[1;31m'
BOLD_GREEN = '\033[1;32m'
RESET = '\033[0m'
@dataclass
class CleanupStats:
"""Cleanup statistics and error tracking"""
# Count by mode and cache_type before cleanup
counts_before: dict[str, dict[str, int]] = field(default_factory=dict)
# Deletion statistics
total_to_delete: int = 0
total_batches: int = 0
successful_batches: int = 0
failed_batches: int = 0
successfully_deleted: int = 0
failed_to_delete: int = 0
# Count by mode and cache_type after cleanup
counts_after: dict[str, dict[str, int]] = field(default_factory=dict)
# Error tracking
errors: list[dict[str, Any]] = field(default_factory=list)
def add_error(self, batch_idx: int, error: Exception, batch_size: int):
"""Record batch error"""
self.errors.append(
{
'batch': batch_idx,
'error_type': type(error).__name__,
'error_msg': str(error),
'records_lost': batch_size,
'timestamp': time.time(),
}
)
self.failed_batches += 1
self.failed_to_delete += batch_size
def initialize_counts(self):
"""Initialize count dictionaries"""
for mode in QUERY_MODES:
self.counts_before[mode] = {'query': 0, 'keywords': 0}
self.counts_after[mode] = {'query': 0, 'keywords': 0}
class CleanupTool:
"""LLM Query Cache Cleanup Tool"""
def __init__(self):
self.storage = None
self.workspace = ''
self.batch_size = DEFAULT_BATCH_SIZE
def get_workspace_for_storage(self, storage_name: str) -> str:
"""Get workspace for a specific storage type
Priority: Storage-specific env var > WORKSPACE env var > empty string
Args:
storage_name: Storage implementation name
Returns:
Workspace name
"""
# Check storage-specific workspace
if storage_name in WORKSPACE_ENV_MAP:
specific_workspace = os.getenv(WORKSPACE_ENV_MAP[storage_name])
if specific_workspace:
return specific_workspace
# Check generic WORKSPACE
workspace = os.getenv('WORKSPACE', '')
return workspace
def check_config_ini_for_storage(self, storage_name: str) -> bool:
"""Check if config.ini has configuration for the storage type
Args:
storage_name: Storage implementation name
Returns:
True if config.ini has the necessary configuration
"""
try:
import configparser
config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')
if storage_name == 'RedisKVStorage':
return config.has_option('redis', 'uri')
elif storage_name == 'PGKVStorage':
return (
config.has_option('postgres', 'user')
and config.has_option('postgres', 'password')
and config.has_option('postgres', 'database')
)
elif storage_name == 'MongoKVStorage':
return config.has_option('mongodb', 'uri') and config.has_option('mongodb', 'database')
return False
except Exception:
return False
def check_env_vars(self, storage_name: str) -> bool:
"""Check environment variables, show warnings if missing but don't fail
Args:
storage_name: Storage implementation name
Returns:
Always returns True (warnings only, no hard failure)
"""
required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, [])
if not required_vars:
print('✓ No environment variables required')
return True
missing_vars = [var for var in required_vars if var not in os.environ]
if missing_vars:
print(f'⚠️ Warning: Missing environment variables: {", ".join(missing_vars)}')
# Check if config.ini has configuration
has_config = self.check_config_ini_for_storage(storage_name)
if has_config:
print(' ✓ Found configuration in config.ini')
else:
print(f' Will attempt to use defaults for {storage_name}')
return True
print('✓ All required environment variables are set')
return True
def get_storage_class(self, storage_name: str):
"""Dynamically import and return storage class
Args:
storage_name: Storage implementation name
Returns:
Storage class
"""
if storage_name == 'JsonKVStorage':
from lightrag.kg.json_kv_impl import JsonKVStorage
return JsonKVStorage
elif storage_name == 'RedisKVStorage':
from lightrag.kg.redis_impl import RedisKVStorage
return RedisKVStorage
elif storage_name == 'PGKVStorage':
from lightrag.kg.postgres_impl import PGKVStorage
return PGKVStorage
elif storage_name == 'MongoKVStorage':
from lightrag.kg.mongo_impl import MongoKVStorage
return MongoKVStorage
else:
raise ValueError(f'Unsupported storage type: {storage_name}')
async def initialize_storage(self, storage_name: str, workspace: str):
"""Initialize storage instance with fallback to config.ini and defaults
Args:
storage_name: Storage implementation name
workspace: Workspace name
Returns:
Initialized storage instance
Raises:
Exception: If initialization fails
"""
storage_class = self.get_storage_class(storage_name)
# Create global config
global_config = {
'working_dir': os.getenv('WORKING_DIR', './rag_storage'),
'embedding_batch_num': 10,
}
# Initialize storage
storage = storage_class(
namespace=NameSpace.KV_STORE_LLM_RESPONSE_CACHE,
workspace=workspace,
global_config=global_config,
embedding_func=None,
)
# Initialize the storage (may raise exception if connection fails)
await storage.initialize()
return storage
async def count_query_caches_json(self, storage) -> dict[str, dict[str, int]]:
"""Count query caches in JsonKVStorage by mode and cache_type
Args:
storage: JsonKVStorage instance
Returns:
Dictionary with counts for each mode and cache_type
"""
counts = {mode: {'query': 0, 'keywords': 0} for mode in QUERY_MODES}
async with storage._storage_lock:
for key in storage._data:
for mode in QUERY_MODES:
if key.startswith(f'{mode}:query:'):
counts[mode]['query'] += 1
elif key.startswith(f'{mode}:keywords:'):
counts[mode]['keywords'] += 1
return counts
async def count_query_caches_redis(self, storage) -> dict[str, dict[str, int]]:
"""Count query caches in RedisKVStorage by mode and cache_type
Args:
storage: RedisKVStorage instance
Returns:
Dictionary with counts for each mode and cache_type
"""
counts = {mode: {'query': 0, 'keywords': 0} for mode in QUERY_MODES}
print('Scanning Redis keys...', end='', flush=True)
async with storage._get_redis_connection() as redis:
for mode in QUERY_MODES:
for cache_type in CACHE_TYPES:
pattern = f'{mode}:{cache_type}:*'
prefixed_pattern = f'{storage.final_namespace}:{pattern}'
cursor = 0
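# SCAN iterates incrementally: each call returns the next cursor plus a
# batch of matching keys, and a returned cursor of 0 means the scan is done.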
while True:
cursor, keys = await redis.scan(cursor, match=prefixed_pattern, count=DEFAULT_BATCH_SIZE)
counts[mode][cache_type] += len(keys)
if cursor == 0:
break
print() # New line after progress
return counts
async def count_query_caches_pg(self, storage) -> dict[str, dict[str, int]]:
"""Count query caches in PostgreSQL by mode and cache_type
Args:
storage: PGKVStorage instance
Returns:
Dictionary with counts for each mode and cache_type
"""
from lightrag.kg.postgres_impl import namespace_to_table_name
counts = {mode: {'query': 0, 'keywords': 0} for mode in QUERY_MODES}
table_name = namespace_to_table_name(storage.namespace)
print('Counting PostgreSQL records...', end='', flush=True)
start_time = time.time()
for mode in QUERY_MODES:
for cache_type in CACHE_TYPES:
query = f"""
SELECT COUNT(*) as count
FROM {table_name}
WHERE workspace = $1
AND id LIKE $2
"""
pattern = f'{mode}:{cache_type}:%'
result = await storage.db.query(query, [storage.workspace, pattern])
counts[mode][cache_type] = result['count'] if result else 0
elapsed = time.time() - start_time
if elapsed > 1:
print(f' (took {elapsed:.1f}s)', end='')
print() # New line
return counts
async def count_query_caches_mongo(self, storage) -> dict[str, dict[str, int]]:
"""Count query caches in MongoDB by mode and cache_type
Args:
storage: MongoKVStorage instance
Returns:
Dictionary with counts for each mode and cache_type
"""
counts = {mode: {'query': 0, 'keywords': 0} for mode in QUERY_MODES}
print('Counting MongoDB documents...', end='', flush=True)
start_time = time.time()
for mode in QUERY_MODES:
for cache_type in CACHE_TYPES:
pattern = f'^{mode}:{cache_type}:'
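# The leading '^' anchors the regex to the start of _id, which lets MongoDB
# use the _id index as a prefix scan instead of a full collection scan.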
query = {'_id': {'$regex': pattern}}
count = await storage._data.count_documents(query)
counts[mode][cache_type] = count
elapsed = time.time() - start_time
if elapsed > 1:
print(f' (took {elapsed:.1f}s)', end='')
print() # New line
return counts
async def count_query_caches(self, storage, storage_name: str) -> dict[str, dict[str, int]]:
"""Count query caches from any storage type efficiently
Args:
storage: Storage instance
storage_name: Storage type name
Returns:
Dictionary with counts for each mode and cache_type
"""
if storage_name == 'JsonKVStorage':
return await self.count_query_caches_json(storage)
elif storage_name == 'RedisKVStorage':
return await self.count_query_caches_redis(storage)
elif storage_name == 'PGKVStorage':
return await self.count_query_caches_pg(storage)
elif storage_name == 'MongoKVStorage':
return await self.count_query_caches_mongo(storage)
else:
raise ValueError(f'Unsupported storage type: {storage_name}')
async def delete_query_caches_json(self, storage, cleanup_type: str, stats: CleanupStats):
"""Delete query caches from JsonKVStorage
Args:
storage: JsonKVStorage instance
cleanup_type: 'all', 'query', or 'keywords'
stats: CleanupStats object to track progress
"""
# Collect keys to delete
async with storage._storage_lock:
keys_to_delete = []
for key in storage._data:
should_delete = False
for mode in QUERY_MODES:
if cleanup_type == 'all':
if key.startswith(f'{mode}:query:') or key.startswith(f'{mode}:keywords:'):
should_delete = True
elif cleanup_type == 'query':
if key.startswith(f'{mode}:query:'):
should_delete = True
elif cleanup_type == 'keywords' and key.startswith(f'{mode}:keywords:'):
should_delete = True
if should_delete:
keys_to_delete.append(key)
# Delete in batches
total_keys = len(keys_to_delete)
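# Ceiling division: (n + batch_size - 1) // batch_size yields the batch count.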
stats.total_batches = (total_keys + self.batch_size - 1) // self.batch_size
print('\n=== Starting Cleanup ===')
print(f'💡 Processing {self.batch_size:,} records at a time from JsonKVStorage\n')
for batch_idx in range(stats.total_batches):
start_idx = batch_idx * self.batch_size
end_idx = min((batch_idx + 1) * self.batch_size, total_keys)
batch_keys = keys_to_delete[start_idx:end_idx]
try:
async with storage._storage_lock:
for key in batch_keys:
del storage._data[key]
# CRITICAL: Set update flag so changes persist to disk
# Without this, deletions remain in-memory only and are lost on exit
await set_all_update_flags(storage.namespace, workspace=storage.workspace)
# Success
stats.successful_batches += 1
stats.successfully_deleted += len(batch_keys)
# Calculate progress
progress = (stats.successfully_deleted / total_keys) * 100
bar_length = 20
filled_length = int(bar_length * stats.successfully_deleted // total_keys)
bar = '█' * filled_length + '░' * (bar_length - filled_length)
print(
f'Batch {batch_idx + 1}/{stats.total_batches}: {bar} '
f'{stats.successfully_deleted:,}/{total_keys:,} ({progress:.1f}%) ✓'
)
except Exception as e:
stats.add_error(batch_idx + 1, e, len(batch_keys))
print(f'Batch {batch_idx + 1}/{stats.total_batches}: ✗ FAILED - {type(e).__name__}: {e!s}')
async def delete_query_caches_redis(self, storage, cleanup_type: str, stats: CleanupStats):
"""Delete query caches from RedisKVStorage
Args:
storage: RedisKVStorage instance
cleanup_type: 'all', 'query', or 'keywords'
stats: CleanupStats object to track progress
"""
# Build patterns to delete
patterns = []
for mode in QUERY_MODES:
if cleanup_type == 'all':
patterns.append(f'{mode}:query:*')
patterns.append(f'{mode}:keywords:*')
elif cleanup_type == 'query':
patterns.append(f'{mode}:query:*')
elif cleanup_type == 'keywords':
patterns.append(f'{mode}:keywords:*')
print('\n=== Starting Cleanup ===')
print(f'💡 Processing Redis keys in batches of {self.batch_size:,}\n')
batch_idx = 0
total_deleted = 0
async with storage._get_redis_connection() as redis:
for pattern in patterns:
prefixed_pattern = f'{storage.final_namespace}:{pattern}'
cursor = 0
while True:
cursor, keys = await redis.scan(cursor, match=prefixed_pattern, count=self.batch_size)
if keys:
batch_idx += 1
stats.total_batches += 1
try:
# Delete batch using pipeline
pipe = redis.pipeline()
for key in keys:
pipe.delete(key)
await pipe.execute()
# Success
stats.successful_batches += 1
stats.successfully_deleted += len(keys)
total_deleted += len(keys)
# Progress
print(f'Batch {batch_idx}: Deleted {len(keys):,} keys (Total: {total_deleted:,}) ✓')
except Exception as e:
stats.add_error(batch_idx, e, len(keys))
print(f'Batch {batch_idx}: ✗ FAILED - {type(e).__name__}: {e!s}')
if cursor == 0:
break
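# Yield to the event loop between SCAN iterations so other tasks
# are not starved during large cleanups.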
await asyncio.sleep(0)
async def delete_query_caches_pg(self, storage, cleanup_type: str, stats: CleanupStats):
"""Delete query caches from PostgreSQL
Args:
storage: PGKVStorage instance
cleanup_type: 'all', 'query', or 'keywords'
stats: CleanupStats object to track progress
"""
from lightrag.kg.postgres_impl import namespace_to_table_name
table_name = namespace_to_table_name(storage.namespace)
# Build WHERE conditions
conditions = []
for mode in QUERY_MODES:
if cleanup_type == 'all':
conditions.append(f"id LIKE '{mode}:query:%'")
conditions.append(f"id LIKE '{mode}:keywords:%'")
elif cleanup_type == 'query':
conditions.append(f"id LIKE '{mode}:query:%'")
elif cleanup_type == 'keywords':
conditions.append(f"id LIKE '{mode}:keywords:%'")
where_clause = ' OR '.join(conditions)
print('\n=== Starting Cleanup ===')
print('💡 Executing PostgreSQL DELETE query\n')
try:
query = f"""
DELETE FROM {table_name}
WHERE workspace = $1
AND ({where_clause})
"""
start_time = time.time()
# execute() expects its query parameters as a dict
await storage.db.execute(query, {'workspace': storage.workspace})
elapsed = time.time() - start_time
# execute() does not surface the DELETE row count, so assume every targeted record was removed
stats.total_batches = 1
stats.successful_batches = 1
stats.successfully_deleted = stats.total_to_delete
print(f'✓ Deleted {stats.successfully_deleted:,} records in {elapsed:.2f}s')
except Exception as e:
stats.add_error(1, e, stats.total_to_delete)
print(f'✗ DELETE failed: {type(e).__name__}: {e!s}')
async def delete_query_caches_mongo(self, storage, cleanup_type: str, stats: CleanupStats):
"""Delete query caches from MongoDB
Args:
storage: MongoKVStorage instance
cleanup_type: 'all', 'query', or 'keywords'
stats: CleanupStats object to track progress
"""
# Build regex patterns
patterns = []
for mode in QUERY_MODES:
if cleanup_type == 'all':
patterns.append(f'^{mode}:query:')
patterns.append(f'^{mode}:keywords:')
elif cleanup_type == 'query':
patterns.append(f'^{mode}:query:')
elif cleanup_type == 'keywords':
patterns.append(f'^{mode}:keywords:')
print('\n=== Starting Cleanup ===')
print('💡 Executing MongoDB deleteMany operations\n')
total_deleted = 0
for idx, pattern in enumerate(patterns, 1):
try:
query = {'_id': {'$regex': pattern}}
result = await storage._data.delete_many(query)
deleted_count = result.deleted_count
stats.total_batches += 1
stats.successful_batches += 1
stats.successfully_deleted += deleted_count
total_deleted += deleted_count
print(f'Pattern {idx}/{len(patterns)}: Deleted {deleted_count:,} records ✓')
except Exception as e:
stats.add_error(idx, e, 0)
print(f'Pattern {idx}/{len(patterns)}: ✗ FAILED - {type(e).__name__}: {e!s}')
print(f'\nTotal deleted: {total_deleted:,} records')
async def delete_query_caches(self, storage, storage_name: str, cleanup_type: str, stats: CleanupStats):
"""Delete query caches from any storage type
Args:
storage: Storage instance
storage_name: Storage type name
cleanup_type: 'all', 'query', or 'keywords'
stats: CleanupStats object to track progress
"""
if storage_name == 'JsonKVStorage':
await self.delete_query_caches_json(storage, cleanup_type, stats)
elif storage_name == 'RedisKVStorage':
await self.delete_query_caches_redis(storage, cleanup_type, stats)
elif storage_name == 'PGKVStorage':
await self.delete_query_caches_pg(storage, cleanup_type, stats)
elif storage_name == 'MongoKVStorage':
await self.delete_query_caches_mongo(storage, cleanup_type, stats)
else:
raise ValueError(f'Unsupported storage type: {storage_name}')
def print_header(self):
"""Print tool header"""
print('\n' + '=' * 60)
print('LLM Query Cache Cleanup Tool - LightRAG')
print('=' * 60)
def print_storage_types(self):
"""Print available storage types"""
print('\nSupported KV Storage Types:')
for key, value in STORAGE_TYPES.items():
print(f'[{key}] {value}')
def format_workspace(self, workspace: str) -> str:
"""Format workspace name with highlighting
Args:
workspace: Workspace name (may be empty)
Returns:
Formatted workspace string with ANSI color codes
"""
if workspace:
return f'{BOLD_CYAN}{workspace}{RESET}'
else:
return f'{BOLD_CYAN}(default){RESET}'
def print_cache_statistics(self, counts: dict[str, dict[str, int]], title: str):
"""Print cache statistics in a formatted table
Args:
counts: Dictionary with counts for each mode and cache_type
title: Title for the statistics display
"""
print(f'\n{title}')
print('┌' + '─' * 12 + '┬' + '─' * 12 + '┬' + '─' * 12 + '┬' + '─' * 12 + '┐')
print(f'│ {"Mode":<10} │ {"Query":>10} │ {"Keywords":>10} │ {"Total":>10} │')
print('├' + '─' * 12 + '┼' + '─' * 12 + '┼' + '─' * 12 + '┼' + '─' * 12 + '┤')
total_query = 0
total_keywords = 0
for mode in QUERY_MODES:
query_count = counts[mode]['query']
keywords_count = counts[mode]['keywords']
mode_total = query_count + keywords_count
total_query += query_count
total_keywords += keywords_count
print(f'│ {mode:<10} │ {query_count:>10,} │ {keywords_count:>10,} │ {mode_total:>10,} │')
print('├' + '─' * 12 + '┼' + '─' * 12 + '┼' + '─' * 12 + '┼' + '─' * 12 + '┤')
grand_total = total_query + total_keywords
print(f'│ {"Total":<10} │ {total_query:>10,} │ {total_keywords:>10,} │ {grand_total:>10,} │')
print('└' + '─' * 12 + '┴' + '─' * 12 + '┴' + '─' * 12 + '┴' + '─' * 12 + '┘')
def calculate_total_to_delete(self, counts: dict[str, dict[str, int]], cleanup_type: str) -> int:
"""Calculate total number of records to delete
Args:
counts: Dictionary with counts for each mode and cache_type
cleanup_type: 'all', 'query', or 'keywords'
Returns:
Total number of records to delete
"""
total = 0
for mode in QUERY_MODES:
if cleanup_type == 'all':
total += counts[mode]['query'] + counts[mode]['keywords']
elif cleanup_type == 'query':
total += counts[mode]['query']
elif cleanup_type == 'keywords':
total += counts[mode]['keywords']
return total
def print_cleanup_report(self, stats: CleanupStats):
"""Print comprehensive cleanup report
Args:
stats: CleanupStats object with cleanup results
"""
print('\n' + '=' * 60)
print('Cleanup Complete - Final Report')
print('=' * 60)
# Overall statistics
print('\n📊 Statistics:')
print(f' Total records to delete: {stats.total_to_delete:,}')
print(f' Total batches: {stats.total_batches:,}')
print(f' Successful batches: {stats.successful_batches:,}')
print(f' Failed batches: {stats.failed_batches:,}')
print(f' Successfully deleted: {stats.successfully_deleted:,}')
print(f' Failed to delete: {stats.failed_to_delete:,}')
# Success rate
success_rate = (stats.successfully_deleted / stats.total_to_delete * 100) if stats.total_to_delete > 0 else 0
print(f' Success rate: {success_rate:.2f}%')
# Before/After comparison
print('\n📈 Before/After Comparison:')
total_before = sum(counts['query'] + counts['keywords'] for counts in stats.counts_before.values())
total_after = sum(counts['query'] + counts['keywords'] for counts in stats.counts_after.values())
print(f' Total caches before: {total_before:,}')
print(f' Total caches after: {total_after:,}')
print(f' Net reduction: {total_before - total_after:,}')
# Error details
if stats.errors:
print(f'\n⚠️ Errors encountered: {len(stats.errors)}')
print('\nError Details:')
print('-' * 60)
# Group errors by type
error_types = {}
for error in stats.errors:
err_type = error['error_type']
error_types[err_type] = error_types.get(err_type, 0) + 1
print('\nError Summary:')
for err_type, count in sorted(error_types.items(), key=lambda x: -x[1]):
print(f' - {err_type}: {count} occurrence(s)')
print('\nFirst 5 errors:')
for i, error in enumerate(stats.errors[:5], 1):
print(f'\n {i}. Batch {error["batch"]}')
print(f' Type: {error["error_type"]}')
print(f' Message: {error["error_msg"]}')
print(f' Records lost: {error["records_lost"]:,}')
if len(stats.errors) > 5:
print(f'\n ... and {len(stats.errors) - 5} more errors')
print('\n' + '=' * 60)
print(f'{BOLD_RED}⚠️ WARNING: Cleanup completed with errors!{RESET}')
print(' Please review the error details above.')
print('=' * 60)
else:
print('\n' + '=' * 60)
print(f'{BOLD_GREEN}✓ SUCCESS: All records cleaned up successfully!{RESET}')
print('=' * 60)
async def setup_storage(self) -> tuple:
"""Setup and initialize storage
Returns:
Tuple of (storage_instance, storage_name, workspace)
Returns (None, None, None) if user chooses to exit
"""
print('\n=== Storage Setup ===')
self.print_storage_types()
# Custom input handling with exit support
while True:
choice = input('\nSelect storage type (1-4) (Press Enter to exit): ').strip()
# Check for exit
if choice == '' or choice == '0':
print('\n✓ Cleanup cancelled by user')
return None, None, None
# Check if choice is valid
if choice in STORAGE_TYPES:
break
print(f'✗ Invalid choice. Please enter one of: {", ".join(STORAGE_TYPES.keys())}')
storage_name = STORAGE_TYPES[choice]
# Special warning for JsonKVStorage about concurrent access
if storage_name == 'JsonKVStorage':
print('\n' + '=' * 60)
print(f'{BOLD_RED}⚠️ IMPORTANT WARNING - JsonKVStorage Concurrency{RESET}')
print('=' * 60)
print('\nJsonKVStorage is an in-memory database that does NOT support')
print('concurrent access to the same file by multiple programs.')
print('\nBefore proceeding, please ensure that:')
print(' • LightRAG Server is completely shut down')
print(' • No other programs are accessing the storage files')
print('\n' + '=' * 60)
confirm = input('\nHas LightRAG Server been shut down? (yes/no): ').strip().lower()
if confirm != 'yes':
print('\n✓ Operation cancelled - Please shut down LightRAG Server first')
return None, None, None
print('✓ Proceeding with JsonKVStorage cleanup...')
# Check configuration (warnings only, doesn't block)
print('\nChecking configuration...')
self.check_env_vars(storage_name)
# Get workspace
workspace = self.get_workspace_for_storage(storage_name)
# Initialize storage (real validation point)
print('\nInitializing storage...')
try:
storage = await self.initialize_storage(storage_name, workspace)
print(f'- Storage Type: {storage_name}')
print(f'- Workspace: {workspace if workspace else "(default)"}')
print('- Connection Status: ✓ Success')
except Exception as e:
print(f'✗ Initialization failed: {e}')
print(f'\nFor {storage_name}, you can configure using:')
print(' 1. Environment variables (highest priority)')
# Show specific environment variable requirements
if storage_name in STORAGE_ENV_REQUIREMENTS:
for var in STORAGE_ENV_REQUIREMENTS[storage_name]:
print(f' - {var}')
print(' 2. config.ini file (medium priority)')
if storage_name == 'RedisKVStorage':
print(' [redis]')
print(' uri = redis://localhost:6379')
elif storage_name == 'PGKVStorage':
print(' [postgres]')
print(' host = localhost')
print(' port = 5432')
print(' user = postgres')
print(' password = yourpassword')
print(' database = lightrag')
elif storage_name == 'MongoKVStorage':
print(' [mongodb]')
print(' uri = mongodb://root:root@localhost:27017/')
print(' database = LightRAG')
return None, None, None
return storage, storage_name, workspace
async def run(self):
"""Run the cleanup tool"""
try:
# Initialize shared storage (REQUIRED for storage classes to work)
from lightrag.kg.shared_storage import initialize_share_data
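# workers=1: this tool runs as a single standalone process.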
initialize_share_data(workers=1)
# Print header
self.print_header()
# Setup storage
self.storage, storage_name, self.workspace = await self.setup_storage()
# Check if user cancelled
if self.storage is None:
return
# Count query caches
print('\nCounting query cache records...')
try:
counts = await self.count_query_caches(self.storage, storage_name)
except Exception as e:
print(f'✗ Counting failed: {e}')
await self.storage.finalize()
return
# Initialize stats
stats = CleanupStats()
stats.initialize_counts()
stats.counts_before = counts
# Print statistics
self.print_cache_statistics(counts, '📊 Query Cache Statistics (Before Cleanup):')
# Calculate total
total_caches = sum(counts[mode]['query'] + counts[mode]['keywords'] for mode in QUERY_MODES)
if total_caches == 0:
print('\n⚠️ No query caches found in storage')
await self.storage.finalize()
return
# Select cleanup type
print('\n=== Cleanup Options ===')
print('[1] Delete all query caches (both query and keywords)')
print('[2] Delete query caches only (keep keywords)')
print('[3] Delete keywords caches only (keep query)')
print('[0] Cancel')
while True:
choice = input('\nSelect cleanup option (0-3): ').strip()
if choice == '0' or choice == '':
print('\n✓ Cleanup cancelled')
await self.storage.finalize()
return
elif choice == '1':
cleanup_type = 'all'
elif choice == '2':
cleanup_type = 'query'
elif choice == '3':
cleanup_type = 'keywords'
else:
print('✗ Invalid choice. Please enter 0, 1, 2, or 3')
continue
# Calculate total to delete for the selected type
stats.total_to_delete = self.calculate_total_to_delete(counts, cleanup_type)
# Check if there are any records to delete
if stats.total_to_delete == 0:
if cleanup_type == 'all':
print(f'\n{BOLD_RED}⚠️ No query caches found to delete!{RESET}')
elif cleanup_type == 'query':
print(f'\n{BOLD_RED}⚠️ No query caches found to delete! (Only keywords exist){RESET}')
elif cleanup_type == 'keywords':
print(f'\n{BOLD_RED}⚠️ No keywords caches found to delete! (Only query caches exist){RESET}')
print(' Please select a different cleanup option.\n')
continue
# Valid selection with records to delete
break
# Confirm deletion
print('\n' + '=' * 60)
print('Cleanup Confirmation')
print('=' * 60)
print(f'Storage: {BOLD_CYAN}{storage_name}{RESET} (workspace: {self.format_workspace(self.workspace)})')
print(f'Cleanup Type: {BOLD_CYAN}{cleanup_type}{RESET}')
print(f'Records to Delete: {BOLD_RED}{stats.total_to_delete:,}{RESET} / {total_caches:,}')
if cleanup_type == 'all':
print(f'\n{BOLD_RED}⚠️ WARNING: This will delete ALL query caches across all modes!{RESET}')
elif cleanup_type == 'query':
print('\n⚠️ This will delete query caches only (keywords will be kept)')
elif cleanup_type == 'keywords':
print('\n⚠️ This will delete keywords caches only (query will be kept)')
confirm = input('\nContinue with deletion? (y/n): ').strip().lower()
if confirm != 'y':
print('\n✓ Cleanup cancelled')
await self.storage.finalize()
return
# Perform deletion
await self.delete_query_caches(self.storage, storage_name, cleanup_type, stats)
# Persist changes
print('\nPersisting changes to storage...')
try:
await self.storage.index_done_callback()
print('✓ Changes persisted successfully')
except Exception as e:
print(f'✗ Persist failed: {e}')
stats.add_error(0, e, 0)
# Count again to verify
print('\nVerifying cleanup results...')
try:
stats.counts_after = await self.count_query_caches(self.storage, storage_name)
except Exception as e:
print(f'⚠️ Verification failed: {e}')
# Use zero counts if verification fails
stats.counts_after = {mode: {'query': 0, 'keywords': 0} for mode in QUERY_MODES}
# Print final report
self.print_cleanup_report(stats)
# Print after statistics
self.print_cache_statistics(stats.counts_after, '\n📊 Query Cache Statistics (After Cleanup):')
# Cleanup
await self.storage.finalize()
except KeyboardInterrupt:
print('\n\n✗ Cleanup interrupted by user')
except Exception as e:
print(f'\n✗ Cleanup failed: {e}')
import traceback
traceback.print_exc()
finally:
# Ensure cleanup
if self.storage:
with contextlib.suppress(Exception):
await self.storage.finalize()
# Finalize shared storage
try:
from lightrag.kg.shared_storage import finalize_share_data
finalize_share_data()
except Exception:
pass
async def async_main():
"""Async main entry point"""
tool = CleanupTool()
await tool.run()
def main():
"""Synchronous entry point for CLI command"""
asyncio.run(async_main())
if __name__ == '__main__':
main()