LightRAG/lightrag/tools/migrate_llm_cache.py
clssck 59e89772de refactor: consolidate to PostgreSQL-only backend and modernize stack
Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs
Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes
Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples
Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios
Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
2025-12-12 16:28:49 +01:00

#!/usr/bin/env python3
"""
LLM Cache Migration Tool for LightRAG (PostgreSQL only)
This tool migrates the LLM response cache (default:extract:* and default:summary:*)
between PostgreSQL-backed KV storage instances while preserving workspace isolation.
Usage:
python -m lightrag.tools.migrate_llm_cache
# or
python lightrag/tools/migrate_llm_cache.py
Supported KV Storage Types:
- PGKVStorage
"""
import asyncio
import contextlib
import os
import sys
import time
from dataclasses import dataclass, field
from typing import Any, cast
from dotenv import load_dotenv
# Add project root to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from lightrag.kg import STORAGE_ENV_REQUIREMENTS
from lightrag.namespace import NameSpace
from lightrag.utils import setup_logger
# Load environment variables
# Use the .env file in the current folder,
# which allows a different .env file per LightRAG instance.
# OS environment variables take precedence over the .env file.
load_dotenv(dotenv_path='.env', override=False)
# Setup logger
setup_logger('lightrag', level='INFO')
# Storage type configurations (PostgreSQL only)
STORAGE_TYPES = {
'1': 'PGKVStorage',
}
# Workspace environment variable mapping
WORKSPACE_ENV_MAP = {
'PGKVStorage': 'POSTGRES_WORKSPACE',
}
# Default batch size for migration
DEFAULT_BATCH_SIZE = 1000
# Default count batch size for efficient counting
DEFAULT_COUNT_BATCH_SIZE = 1000
# ANSI color codes for terminal output
BOLD_CYAN = '\033[1;36m'
RESET = '\033[0m'
@dataclass
class MigrationStats:
"""Migration statistics and error tracking"""
total_source_records: int = 0
total_batches: int = 0
successful_batches: int = 0
failed_batches: int = 0
successful_records: int = 0
failed_records: int = 0
errors: list[dict[str, Any]] = field(default_factory=list)
def add_error(self, batch_idx: int, error: Exception, batch_size: int):
"""Record batch error"""
self.errors.append(
{
'batch': batch_idx,
'error_type': type(error).__name__,
'error_msg': str(error),
'records_lost': batch_size,
'timestamp': time.time(),
}
)
self.failed_batches += 1
self.failed_records += batch_size
class MigrationTool:
"""LLM Cache Migration Tool"""
def __init__(self):
self.source_storage = None
self.target_storage = None
self.source_workspace = ''
self.target_workspace = ''
self.batch_size = DEFAULT_BATCH_SIZE
def get_workspace_for_storage(self, storage_name: str) -> str:
"""Get workspace for a specific storage type
Priority: Storage-specific env var > WORKSPACE env var > empty string
Args:
storage_name: Storage implementation name
Returns:
Workspace name
"""
# Check storage-specific workspace
if storage_name in WORKSPACE_ENV_MAP:
specific_workspace = os.getenv(WORKSPACE_ENV_MAP[storage_name])
if specific_workspace:
return specific_workspace
# Check generic WORKSPACE
workspace = os.getenv('WORKSPACE', '')
return workspace
def check_config_ini_for_storage(self, storage_name: str) -> bool:
"""Check if config.ini has configuration for the storage type
Args:
storage_name: Storage implementation name
Returns:
True if config.ini has the necessary configuration
"""
try:
import configparser
config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')
if storage_name == 'PGKVStorage':
return (
config.has_option('postgres', 'user')
and config.has_option('postgres', 'password')
and config.has_option('postgres', 'database')
)
return False
except Exception:
return False
def check_env_vars(self, storage_name: str) -> bool:
"""Check environment variables, show warnings if missing but don't fail
Args:
storage_name: Storage implementation name
Returns:
Always returns True (warnings only, no hard failure)
"""
required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, [])
if not required_vars:
print('✓ No environment variables required')
return True
missing_vars = [var for var in required_vars if var not in os.environ]
if missing_vars:
print(f'⚠️ Warning: Missing environment variables: {", ".join(missing_vars)}')
# Check if config.ini has configuration
has_config = self.check_config_ini_for_storage(storage_name)
if has_config:
print(' ✓ Found configuration in config.ini')
else:
print(f' Will attempt to use defaults for {storage_name}')
return True
print('✓ All required environment variables are set')
return True
def count_available_storage_types(self) -> int:
"""Count available storage types (with env vars or config.ini)
Returns:
Number of available storage types
"""
available_count = 0
for storage_name in STORAGE_TYPES.values():
required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, [])
# Check if has environment variables
has_env = all(var in os.environ for var in required_vars)
if has_env:
available_count += 1
else:
# Check if has config.ini configuration
has_config = self.check_config_ini_for_storage(storage_name)
if has_config:
available_count += 1
return available_count
def get_storage_class(self, storage_name: str):
"""Dynamically import and return storage class
Args:
storage_name: Storage implementation name
Returns:
Storage class
"""
if storage_name == 'PGKVStorage':
from lightrag.kg.postgres_impl import PGKVStorage
return PGKVStorage
else:
raise ValueError(f'Unsupported storage type: {storage_name}')
async def initialize_storage(self, storage_name: str, workspace: str):
"""Initialize storage instance with fallback to config.ini and defaults
Args:
storage_name: Storage implementation name
workspace: Workspace name
Returns:
Initialized storage instance
Raises:
Exception: If initialization fails
"""
storage_class = self.get_storage_class(storage_name)
# Create global config
global_config = {
'working_dir': os.getenv('WORKING_DIR', './rag_storage'),
'embedding_batch_num': 10,
}
# Initialize storage
storage = storage_class(
namespace=NameSpace.KV_STORE_LLM_RESPONSE_CACHE,
workspace=workspace,
global_config=global_config,
embedding_func=cast(Any, None),
)
# Initialize the storage (may raise exception if connection fails)
await storage.initialize()
return storage
async def get_default_caches_pg(self, storage, batch_size: int = 1000) -> dict[str, Any]:
"""Get default caches from PGKVStorage with pagination
Args:
storage: PGKVStorage instance
batch_size: Number of records to fetch per batch
Returns:
Dictionary of cache entries with default:extract:* or default:summary:* keys
"""
from lightrag.kg.postgres_impl import namespace_to_table_name
cache_data = {}
table_name = namespace_to_table_name(storage.namespace)
offset = 0
while True:
# Use LIMIT and OFFSET for pagination
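# The table name comes from namespace_to_table_name(), not from user input, so the
# f-string interpolation is limited to a known table; workspace, LIMIT and OFFSET
# are passed as bound parameters ($1-$3).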
query = f"""
SELECT id as key, original_prompt, return_value, chunk_id, cache_type, queryparam,
EXTRACT(EPOCH FROM create_time)::BIGINT as create_time,
EXTRACT(EPOCH FROM update_time)::BIGINT as update_time
FROM {table_name}
WHERE workspace = $1
AND (id LIKE 'default:extract:%' OR id LIKE 'default:summary:%')
ORDER BY id
LIMIT $2 OFFSET $3
"""
results = await storage.db.query(query, [storage.workspace, batch_size, offset], multirows=True)
if not results:
break
for row in results:
# Map PostgreSQL fields to cache format
cache_entry = {
'return': row.get('return_value', ''),
'cache_type': row.get('cache_type'),
'original_prompt': row.get('original_prompt', ''),
'chunk_id': row.get('chunk_id'),
'queryparam': row.get('queryparam'),
'create_time': row.get('create_time', 0),
'update_time': row.get('update_time', 0),
}
cache_data[row['key']] = cache_entry
# If we got fewer results than batch_size, we're done
if len(results) < batch_size:
break
offset += batch_size
# Yield control periodically
await asyncio.sleep(0)
return cache_data
async def get_default_caches(self, storage, storage_name: str) -> dict[str, Any]:
"""Get default caches from PostgreSQL storage"""
if storage_name == 'PGKVStorage':
return await self.get_default_caches_pg(storage)
raise ValueError(f'Unsupported storage type: {storage_name}')
async def count_default_caches_pg(self, storage) -> int:
"""Count default caches in PostgreSQL using COUNT(*) with progress indicator
Args:
storage: PGKVStorage instance
Returns:
Total count of cache records
"""
from lightrag.kg.postgres_impl import namespace_to_table_name
table_name = namespace_to_table_name(storage.namespace)
query = f"""
SELECT COUNT(*) as count
FROM {table_name}
WHERE workspace = $1
AND (id LIKE 'default:extract:%' OR id LIKE 'default:summary:%')
"""
print('Counting PostgreSQL records...', end='', flush=True)
start_time = time.time()
result = await storage.db.query(query, [storage.workspace])
elapsed = time.time() - start_time
if elapsed > 1:
print(f' (took {elapsed:.1f}s)', end='')
print() # New line
return result['count'] if result else 0
async def count_default_caches(self, storage, storage_name: str) -> int:
"""Count default caches from PostgreSQL efficiently"""
if storage_name == 'PGKVStorage':
return await self.count_default_caches_pg(storage)
raise ValueError(f'Unsupported storage type: {storage_name}')
async def stream_default_caches_pg(self, storage, batch_size: int):
"""Stream default caches from PostgreSQL - yields batches
Args:
storage: PGKVStorage instance
batch_size: Size of each batch to yield
Yields:
Dictionary batches of cache entries
"""
from lightrag.kg.postgres_impl import namespace_to_table_name
table_name = namespace_to_table_name(storage.namespace)
offset = 0
while True:
query = f"""
SELECT id as key, original_prompt, return_value, chunk_id, cache_type, queryparam,
EXTRACT(EPOCH FROM create_time)::BIGINT as create_time,
EXTRACT(EPOCH FROM update_time)::BIGINT as update_time
FROM {table_name}
WHERE workspace = $1
AND (id LIKE 'default:extract:%' OR id LIKE 'default:summary:%')
ORDER BY id
LIMIT $2 OFFSET $3
"""
results = await storage.db.query(query, [storage.workspace, batch_size, offset], multirows=True)
if not results:
break
batch = {}
for row in results:
cache_entry = {
'return': row.get('return_value', ''),
'cache_type': row.get('cache_type'),
'original_prompt': row.get('original_prompt', ''),
'chunk_id': row.get('chunk_id'),
'queryparam': row.get('queryparam'),
'create_time': row.get('create_time', 0),
'update_time': row.get('update_time', 0),
}
batch[row['key']] = cache_entry
if batch:
yield batch
if len(results) < batch_size:
break
offset += batch_size
await asyncio.sleep(0)
async def stream_default_caches(self, storage, storage_name: str, batch_size: int | None = None):
"""Stream default caches from PostgreSQL - unified interface"""
if batch_size is None:
batch_size = self.batch_size
if storage_name == 'PGKVStorage':
async for batch in self.stream_default_caches_pg(storage, batch_size):
yield batch
else:
raise ValueError(f'Unsupported storage type: {storage_name}')
async def count_cache_types(self, cache_data: dict[str, Any]) -> dict[str, int]:
"""Count cache entries by type
Args:
cache_data: Dictionary of cache entries
Returns:
Dictionary with counts for each cache type
"""
counts = {
'extract': 0,
'summary': 0,
}
for key in cache_data:
if key.startswith('default:extract:'):
counts['extract'] += 1
elif key.startswith('default:summary:'):
counts['summary'] += 1
return counts
def print_header(self):
"""Print tool header"""
print('\n' + '=' * 50)
print('LLM Cache Migration Tool - LightRAG')
print('=' * 50)
def print_storage_types(self):
"""Print available storage types"""
print('\nSupported KV Storage Types:')
for key, value in STORAGE_TYPES.items():
print(f'[{key}] {value}')
def format_workspace(self, workspace: str) -> str:
"""Format workspace name with highlighting
Args:
workspace: Workspace name (may be empty)
Returns:
Formatted workspace string with ANSI color codes
"""
if workspace:
return f'{BOLD_CYAN}{workspace}{RESET}'
else:
return f'{BOLD_CYAN}(default){RESET}'
def format_storage_name(self, storage_name: str) -> str:
"""Format storage type name with highlighting
Args:
storage_name: Storage type name
Returns:
Formatted storage name string with ANSI color codes
"""
return f'{BOLD_CYAN}{storage_name}{RESET}'
async def setup_storage(
self,
storage_type: str,
use_streaming: bool = False,
exclude_storage_name: str | None = None,
) -> tuple:
"""Setup and initialize storage with config.ini fallback support
Args:
storage_type: Type label (source/target)
use_streaming: If True, only count records without loading. If False, load all data (legacy mode)
exclude_storage_name: Storage type to exclude from selection (e.g., to prevent selecting same as source)
Returns:
Tuple of (storage_instance, storage_name, workspace, total_count)
Returns (None, None, None, 0) if user chooses to exit
"""
print(f'\n=== {storage_type} Storage Setup ===')
# Filter and remap available storage types if exclusion is specified
if exclude_storage_name:
# Get available storage types (excluding source)
available_list = [(k, v) for k, v in STORAGE_TYPES.items() if v != exclude_storage_name]
# Remap to sequential numbering (1, 2, 3...)
remapped_types = {str(i + 1): name for i, (_, name) in enumerate(available_list)}
# Print available types with new sequential numbers
print(f'\nAvailable Storage Types for Target (source: {exclude_storage_name} excluded):')
for key, value in remapped_types.items():
print(f'[{key}] {value}')
available_types = remapped_types
else:
# For source storage, use original numbering
available_types = STORAGE_TYPES.copy()
self.print_storage_types()
# Generate dynamic prompt based on number of options
num_options = len(available_types)
prompt_range = '1' if num_options == 1 else f'1-{num_options}'
# Custom input handling with exit support
while True:
choice = input(f'\nSelect {storage_type} storage type ({prompt_range}) (Press Enter to exit): ').strip()
# Check for exit
if choice == '' or choice == '0':
print('\n✓ Migration cancelled by user')
return None, None, None, 0
# Check if choice is valid
if choice in available_types:
break
print(f'✗ Invalid choice. Please enter one of: {", ".join(available_types.keys())}')
storage_name = available_types[choice]
# Check configuration (warnings only, doesn't block)
print('\nChecking configuration...')
self.check_env_vars(storage_name)
# Get workspace
workspace = self.get_workspace_for_storage(storage_name)
# Initialize storage (real validation point)
print(f'\nInitializing {storage_type} storage...')
try:
storage = await self.initialize_storage(storage_name, workspace)
print(f'- Storage Type: {storage_name}')
print(f'- Workspace: {workspace if workspace else "(default)"}\n')
print('- Connection Status: ✓ Success')
# Show configuration source for transparency
if storage_name == 'PGKVStorage':
config_source = (
'environment variables'
if all(var in os.environ for var in STORAGE_ENV_REQUIREMENTS[storage_name])
else 'config.ini or defaults'
)
print(f'- Configuration Source: {config_source}')
except Exception as e:
print(f'✗ Initialization failed: {e}')
print(f'\nFor {storage_name}, you can configure using:')
print(' 1. Environment variables (highest priority)')
# Show specific environment variable requirements
if storage_name in STORAGE_ENV_REQUIREMENTS:
for var in STORAGE_ENV_REQUIREMENTS[storage_name]:
print(f' - {var}')
print(' 2. config.ini file (medium priority)')
if storage_name == 'PGKVStorage':
print(' [postgres]')
print(' host = localhost')
print(' port = 5432')
print(' user = postgres')
print(' password = yourpassword')
print(' database = lightrag')
return None, None, None, 0
# Count cache records efficiently
print(f'\n{"Counting" if use_streaming else "Loading"} cache records...')
try:
if use_streaming:
# Use efficient counting without loading data
total_count = await self.count_default_caches(storage, storage_name)
print(f'- Total: {total_count:,} records')
else:
# Legacy mode: load all data
cache_data = await self.get_default_caches(storage, storage_name)
counts = await self.count_cache_types(cache_data)
total_count = len(cache_data)
print(f'- default:extract: {counts["extract"]:,} records')
print(f'- default:summary: {counts["summary"]:,} records')
print(f'- Total: {total_count:,} records')
except Exception as e:
print(f'{"Counting" if use_streaming else "Loading"} failed: {e}')
return None, None, None, 0
return storage, storage_name, workspace, total_count
async def migrate_caches(
self,
source_data: dict[str, Any],
target_storage,
target_storage_name: str,
) -> MigrationStats:
"""Migrate caches in batches with error tracking (Legacy mode - loads all data)
Args:
source_data: Source cache data
target_storage: Target storage instance
target_storage_name: Target storage type name
Returns:
MigrationStats object with migration results and errors
"""
stats = MigrationStats()
stats.total_source_records = len(source_data)
if stats.total_source_records == 0:
print('\nNo records to migrate')
return stats
# Convert to list for batching
items = list(source_data.items())
stats.total_batches = (stats.total_source_records + self.batch_size - 1) // self.batch_size
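# Ceiling division: e.g. 2,500 records with a batch size of 1,000 -> 3 batches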
print('\n=== Starting Migration ===')
for batch_idx in range(stats.total_batches):
start_idx = batch_idx * self.batch_size
end_idx = min((batch_idx + 1) * self.batch_size, stats.total_source_records)
batch_items = items[start_idx:end_idx]
batch_data = dict(batch_items)
# Determine current cache type for display
current_key = batch_items[0][0]
cache_type = 'extract' if 'extract' in current_key else 'summary'
try:
# Attempt to write batch
await target_storage.upsert(batch_data)
# Success - update stats
stats.successful_batches += 1
stats.successful_records += len(batch_data)
# Calculate progress
progress = (end_idx / stats.total_source_records) * 100
bar_length = 20
filled_length = int(bar_length * end_idx // stats.total_source_records)
bar = '█' * filled_length + '░' * (bar_length - filled_length)
print(
f'Batch {batch_idx + 1}/{stats.total_batches}: {bar} '
f'{end_idx:,}/{stats.total_source_records:,} ({progress:.0f}%) - '
f'default:{cache_type}'
)
except Exception as e:
# Error - record and continue
stats.add_error(batch_idx + 1, e, len(batch_data))
print(f'Batch {batch_idx + 1}/{stats.total_batches}: ✗ FAILED - {type(e).__name__}: {e!s}')
# Final persist
print('\nPersisting data to disk...')
try:
await target_storage.index_done_callback()
print('✓ Data persisted successfully')
except Exception as e:
print(f'✗ Persist failed: {e}')
stats.add_error(0, e, 0) # batch 0 = persist error
return stats
async def migrate_caches_streaming(
self,
source_storage,
source_storage_name: str,
target_storage,
target_storage_name: str,
total_records: int,
) -> MigrationStats:
"""Migrate caches using streaming approach - minimal memory footprint
Args:
source_storage: Source storage instance
source_storage_name: Source storage type name
target_storage: Target storage instance
target_storage_name: Target storage type name
total_records: Total number of records to migrate
Returns:
MigrationStats object with migration results and errors
"""
stats = MigrationStats()
stats.total_source_records = total_records
if stats.total_source_records == 0:
print('\nNo records to migrate')
return stats
# Calculate total batches
stats.total_batches = (total_records + self.batch_size - 1) // self.batch_size
print('\n=== Starting Streaming Migration ===')
print(f'💡 Memory-optimized mode: Processing {self.batch_size:,} records at a time\n')
batch_idx = 0
# Stream batches from source and write to target immediately
async for batch in self.stream_default_caches(source_storage, source_storage_name):
batch_idx += 1
# Determine current cache type for display
if batch:
first_key = next(iter(batch.keys()))
cache_type = 'extract' if 'extract' in first_key else 'summary'
else:
cache_type = 'unknown'
try:
# Write batch to target storage
await target_storage.upsert(batch)
# Success - update stats
stats.successful_batches += 1
stats.successful_records += len(batch)
# Calculate progress with known total
progress = (stats.successful_records / total_records) * 100
bar_length = 20
filled_length = int(bar_length * stats.successful_records // total_records)
bar = '█' * filled_length + '░' * (bar_length - filled_length)
print(
f'Batch {batch_idx}/{stats.total_batches}: {bar} '
f'{stats.successful_records:,}/{total_records:,} ({progress:.1f}%) - '
f'default:{cache_type}'
)
except Exception as e:
# Error - record and continue
stats.add_error(batch_idx, e, len(batch))
print(f'Batch {batch_idx}/{stats.total_batches}: ✗ FAILED - {type(e).__name__}: {e!s}')
# Final persist
print('\nPersisting data to disk...')
try:
await target_storage.index_done_callback()
print('✓ Data persisted successfully')
except Exception as e:
print(f'✗ Persist failed: {e}')
stats.add_error(0, e, 0) # batch 0 = persist error
return stats
def print_migration_report(self, stats: MigrationStats):
"""Print comprehensive migration report
Args:
stats: MigrationStats object with migration results
"""
print('\n' + '=' * 60)
print('Migration Complete - Final Report')
print('=' * 60)
# Overall statistics
print('\n📊 Statistics:')
print(f' Total source records: {stats.total_source_records:,}')
print(f' Total batches: {stats.total_batches:,}')
print(f' Successful batches: {stats.successful_batches:,}')
print(f' Failed batches: {stats.failed_batches:,}')
print(f' Successfully migrated: {stats.successful_records:,}')
print(f' Failed to migrate: {stats.failed_records:,}')
# Success rate
success_rate = (
(stats.successful_records / stats.total_source_records * 100) if stats.total_source_records > 0 else 0
)
print(f' Success rate: {success_rate:.2f}%')
# Error details
if stats.errors:
print(f'\n⚠️ Errors encountered: {len(stats.errors)}')
print('\nError Details:')
print('-' * 60)
# Group errors by type
error_types = {}
for error in stats.errors:
err_type = error['error_type']
error_types[err_type] = error_types.get(err_type, 0) + 1
print('\nError Summary:')
for err_type, count in sorted(error_types.items(), key=lambda x: -x[1]):
print(f' - {err_type}: {count} occurrence(s)')
print('\nFirst 5 errors:')
for i, error in enumerate(stats.errors[:5], 1):
print(f'\n {i}. Batch {error["batch"]}')
print(f' Type: {error["error_type"]}')
print(f' Message: {error["error_msg"]}')
print(f' Records lost: {error["records_lost"]:,}')
if len(stats.errors) > 5:
print(f'\n ... and {len(stats.errors) - 5} more errors')
print('\n' + '=' * 60)
print('⚠️ WARNING: Migration completed with errors!')
print(' Please review the error details above.')
print('=' * 60)
else:
print('\n' + '=' * 60)
print('✓ SUCCESS: All records migrated successfully!')
print('=' * 60)
async def run(self):
"""Run the migration tool with streaming approach and early validation"""
try:
# Initialize shared storage (REQUIRED for storage classes to work)
from lightrag.kg.shared_storage import initialize_share_data
initialize_share_data(workers=1)
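# workers=1: this migration tool runs as a single standalone process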
# Print header
self.print_header()
# Setup source storage with streaming (only count, don't load all data)
(
self.source_storage,
source_storage_name,
self.source_workspace,
source_count,
) = await self.setup_storage('Source', use_streaming=True)
# Check if user cancelled (setup_storage returns None for all fields)
if self.source_storage is None:
return
# Check if there are at least 2 storage types available
available_count = self.count_available_storage_types()
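# STORAGE_TYPES currently contains only PGKVStorage, so available_count is at most 1 here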
if available_count <= 1:
print('\n' + '=' * 60)
print('⚠️ Warning: Migration Not Possible')
print('=' * 60)
print(f'Only {available_count} storage type(s) available.')
print('Migration requires at least 2 different storage types.')
print('\nTo enable migration, configure additional storage:')
print(' 1. Set environment variables, OR')
print(' 2. Update config.ini file')
print('\nSupported storage types:')
for name in STORAGE_TYPES.values():
if name != source_storage_name:
print(f' - {name}')
if name in STORAGE_ENV_REQUIREMENTS:
for var in STORAGE_ENV_REQUIREMENTS[name]:
print(f' Required: {var}')
print('=' * 60)
# Cleanup
await self.source_storage.finalize()
return
if source_count == 0:
print('\n⚠️ Source storage has no cache records to migrate')
# Cleanup
await self.source_storage.finalize()
return
# Setup target storage with streaming (only count, don't load all data)
# Exclude source storage type from target selection
(
self.target_storage,
target_storage_name,
self.target_workspace,
target_count,
) = await self.setup_storage('Target', use_streaming=True, exclude_storage_name=source_storage_name)
if not self.target_storage:
print('\n✗ Target storage setup failed')
# Cleanup source
await self.source_storage.finalize()
return
# Show migration summary
print('\n' + '=' * 50)
print('Migration Confirmation')
print('=' * 50)
print(
f'Source: {self.format_storage_name(source_storage_name)} (workspace: {self.format_workspace(self.source_workspace)}) - {source_count:,} records'
)
print(
f'Target: {self.format_storage_name(target_storage_name)} (workspace: {self.format_workspace(self.target_workspace)}) - {target_count:,} records'
)
print(f'Batch Size: {self.batch_size:,} records/batch')
print('Memory Mode: Streaming (memory-optimized)')
if target_count > 0:
print(f'\n⚠️ Warning: Target storage already has {target_count:,} records')
print('Migration will overwrite records with the same keys')
# Confirm migration
confirm = input('\nContinue? (y/n): ').strip().lower()
if confirm != 'y':
print('\n✗ Migration cancelled')
# Cleanup
await self.source_storage.finalize()
await self.target_storage.finalize()
return
# Perform streaming migration with error tracking
stats = await self.migrate_caches_streaming(
self.source_storage,
source_storage_name,
self.target_storage,
target_storage_name,
source_count,
)
# Print comprehensive migration report
self.print_migration_report(stats)
# Cleanup
await self.source_storage.finalize()
await self.target_storage.finalize()
except KeyboardInterrupt:
print('\n\n✗ Migration interrupted by user')
except Exception as e:
print(f'\n✗ Migration failed: {e}')
import traceback
traceback.print_exc()
finally:
# Ensure cleanup
if self.source_storage:
with contextlib.suppress(Exception):
await self.source_storage.finalize()
if self.target_storage:
with contextlib.suppress(Exception):
await self.target_storage.finalize()
# Finalize shared storage
try:
from lightrag.kg.shared_storage import finalize_share_data
finalize_share_data()
except Exception:
pass
async def main():
"""Main entry point"""
tool = MigrationTool()
await tool.run()
if __name__ == '__main__':
asyncio.run(main())