Enhance LLM cache migration tool with streaming and improved UX

- Add streaming migration for memory efficiency - Implement graceful exit with Enter/0 - Add progress indicators for counting - Optimize batch processing by storage type - Update docs with new progress displays
2025-11-08 19:38:00 +08:00 · 2025-11-08 19:38:00 +08:00 · 6b9f13c792
commit 6b9f13c792
parent d0d31e9262
2 changed files with 538 additions and 46 deletions
--- a/lightrag/tools/README_MIGRATE_LLM_CACHE.md
+++ b/lightrag/tools/README_MIGRATE_LLM_CACHE.md
@ -95,9 +95,11 @@ Supported KV Storage Types:
 [3] PGKVStorage
 [4] MongoKVStorage

-Select Source storage type (1-4): 1
+Select Source storage type (1-4) (Press Enter or 0 to exit): 1
 ```

+**Note**: You can press Enter or type `0` at the source storage selection to exit gracefully.
+
 #### 2. Source Storage Validation
 The tool will:
 - Check required environment variables
@ -115,22 +117,33 @@ Initializing Source storage...
 - Connection Status: ✓ Success

 Counting cache records...
- default:extract: 8,500 records
- default:summary: 234 records
 - Total: 8,734 records
 ```

+**Progress Display by Storage Type:**
+- **JsonKVStorage**: Fast in-memory counting, no progress display needed
+- **RedisKVStorage**: Real-time scanning progress
+  ```
+  Scanning Redis keys... found 8,734 records
+  ```
+- **PostgreSQL**: Shows timing if operation takes >1 second
+  ```
+  Counting PostgreSQL records... (took 2.3s)
+  ```
+- **MongoDB**: Shows timing if operation takes >1 second
+  ```
+  Counting MongoDB documents... (took 1.8s)
+  ```
+
 #### 3. Select Target Storage Type

 Repeat steps 1-2 to select and validate the target storage.

 #### 4. Confirm Migration

-Review the migration summary and confirm:
-
 ```
-Migration Confirmation
 ==================================================
+Migration Confirmation
 Source: JsonKVStorage (workspace: space1) - 8,734 records
 Target: MongoKVStorage (workspace: space1) - 0 records
 Batch Size: 1,000 records/batch
--- a/lightrag/tools/migrate_llm_cache.py
+++ b/lightrag/tools/migrate_llm_cache.py
@ -26,7 +26,9 @@ from dataclasses import dataclass, field
 from dotenv import load_dotenv

 # Add project root to path for imports
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+sys.path.insert(
+    0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)

 from lightrag.kg import STORAGE_ENV_REQUIREMENTS
 from lightrag.namespace import NameSpace
@ -60,6 +62,10 @@ WORKSPACE_ENV_MAP = {
 DEFAULT_BATCH_SIZE = 1000


+# Default count batch size for efficient counting
+DEFAULT_COUNT_BATCH_SIZE = 1000
+
+
@dataclass
 class MigrationStats:
    """Migration statistics and error tracking"""
@ -418,6 +424,351 @@ class MigrationTool:
        else:
            raise ValueError(f"Unsupported storage type: {storage_name}")

+    async def count_default_caches_json(self, storage) -> int:
+        """Count default caches in JsonKVStorage - O(N) but very fast in-memory
+
+        Args:
+            storage: JsonKVStorage instance
+
+        Returns:
+            Total count of cache records
+        """
+        async with storage._storage_lock:
+            return sum(
+                1
+                for key in storage._data.keys()
+                if key.startswith("default:extract:")
+                or key.startswith("default:summary:")
+            )
+
+    async def count_default_caches_redis(self, storage) -> int:
+        """Count default caches in RedisKVStorage using SCAN with progress display
+
+        Args:
+            storage: RedisKVStorage instance
+
+        Returns:
+            Total count of cache records
+        """
+        count = 0
+        print("Scanning Redis keys...", end="", flush=True)
+
+        async with storage._get_redis_connection() as redis:
+            for pattern in ["default:extract:*", "default:summary:*"]:
+                prefixed_pattern = f"{storage.final_namespace}:{pattern}"
+                cursor = 0
+                while True:
+                    cursor, keys = await redis.scan(
+                        cursor, match=prefixed_pattern, count=DEFAULT_COUNT_BATCH_SIZE
+                    )
+                    count += len(keys)
+
+                    # Show progress
+                    print(
+                        f"\rScanning Redis keys... found {count:,} records",
+                        end="",
+                        flush=True,
+                    )
+
+                    if cursor == 0:
+                        break
+
+        print()  # New line after progress
+        return count
+
+    async def count_default_caches_pg(self, storage) -> int:
+        """Count default caches in PostgreSQL using COUNT(*) with progress indicator
+
+        Args:
+            storage: PGKVStorage instance
+
+        Returns:
+            Total count of cache records
+        """
+        from lightrag.kg.postgres_impl import namespace_to_table_name
+
+        table_name = namespace_to_table_name(storage.namespace)
+
+        query = f"""
+            SELECT COUNT(*) as count
+            FROM {table_name}
+            WHERE workspace = $1
+            AND (id LIKE 'default:extract:%' OR id LIKE 'default:summary:%')
+        """
+
+        print("Counting PostgreSQL records...", end="", flush=True)
+        start_time = time.time()
+
+        result = await storage.db.query(query, [storage.workspace])
+
+        elapsed = time.time() - start_time
+        if elapsed > 1:
+            print(f" (took {elapsed:.1f}s)", end="")
+        print()  # New line
+
+        return result["count"] if result else 0
+
+    async def count_default_caches_mongo(self, storage) -> int:
+        """Count default caches in MongoDB using count_documents with progress indicator
+
+        Args:
+            storage: MongoKVStorage instance
+
+        Returns:
+            Total count of cache records
+        """
+        query = {"_id": {"$regex": "^default:(extract|summary):"}}
+
+        print("Counting MongoDB documents...", end="", flush=True)
+        start_time = time.time()
+
+        count = await storage._data.count_documents(query)
+
+        elapsed = time.time() - start_time
+        if elapsed > 1:
+            print(f" (took {elapsed:.1f}s)", end="")
+        print()  # New line
+
+        return count
+
+    async def count_default_caches(self, storage, storage_name: str) -> int:
+        """Count default caches from any storage type efficiently
+
+        Args:
+            storage: Storage instance
+            storage_name: Storage type name
+
+        Returns:
+            Total count of cache records
+        """
+        if storage_name == "JsonKVStorage":
+            return await self.count_default_caches_json(storage)
+        elif storage_name == "RedisKVStorage":
+            return await self.count_default_caches_redis(storage)
+        elif storage_name == "PGKVStorage":
+            return await self.count_default_caches_pg(storage)
+        elif storage_name == "MongoKVStorage":
+            return await self.count_default_caches_mongo(storage)
+        else:
+            raise ValueError(f"Unsupported storage type: {storage_name}")
+
+    async def stream_default_caches_json(self, storage, batch_size: int):
+        """Stream default caches from JsonKVStorage - yields batches
+
+        Args:
+            storage: JsonKVStorage instance
+            batch_size: Size of each batch to yield
+
+        Yields:
+            Dictionary batches of cache entries
+        """
+        async with storage._storage_lock:
+            batch = {}
+            for key, value in storage._data.items():
+                if key.startswith("default:extract:") or key.startswith(
+                    "default:summary:"
+                ):
+                    batch[key] = value
+                    if len(batch) >= batch_size:
+                        yield batch
+                        batch = {}
+            # Yield remaining items
+            if batch:
+                yield batch
+
+    async def stream_default_caches_redis(self, storage, batch_size: int):
+        """Stream default caches from RedisKVStorage - yields batches
+
+        Args:
+            storage: RedisKVStorage instance
+            batch_size: Size of each batch to yield
+
+        Yields:
+            Dictionary batches of cache entries
+        """
+        import json
+
+        async with storage._get_redis_connection() as redis:
+            for pattern in ["default:extract:*", "default:summary:*"]:
+                prefixed_pattern = f"{storage.final_namespace}:{pattern}"
+                cursor = 0
+
+                while True:
+                    cursor, keys = await redis.scan(
+                        cursor, match=prefixed_pattern, count=batch_size
+                    )
+
+                    if keys:
+                        try:
+                            pipe = redis.pipeline()
+                            for key in keys:
+                                pipe.get(key)
+                            values = await pipe.execute()
+
+                            batch = {}
+                            for key, value in zip(keys, values):
+                                if value:
+                                    key_str = (
+                                        key.decode() if isinstance(key, bytes) else key
+                                    )
+                                    original_key = key_str.replace(
+                                        f"{storage.final_namespace}:", "", 1
+                                    )
+                                    batch[original_key] = json.loads(value)
+
+                            if batch:
+                                yield batch
+
+                        except Exception as e:
+                            print(f"⚠️  Pipeline execution failed for batch: {e}")
+                            # Fall back to individual gets
+                            batch = {}
+                            for key in keys:
+                                try:
+                                    value = await redis.get(key)
+                                    if value:
+                                        key_str = (
+                                            key.decode()
+                                            if isinstance(key, bytes)
+                                            else key
+                                        )
+                                        original_key = key_str.replace(
+                                            f"{storage.final_namespace}:", "", 1
+                                        )
+                                        batch[original_key] = json.loads(value)
+                                except Exception as individual_error:
+                                    print(
+                                        f"⚠️  Failed to get individual key {key}: {individual_error}"
+                                    )
+                                    continue
+
+                            if batch:
+                                yield batch
+
+                    if cursor == 0:
+                        break
+
+                    await asyncio.sleep(0)
+
+    async def stream_default_caches_pg(self, storage, batch_size: int):
+        """Stream default caches from PostgreSQL - yields batches
+
+        Args:
+            storage: PGKVStorage instance
+            batch_size: Size of each batch to yield
+
+        Yields:
+            Dictionary batches of cache entries
+        """
+        from lightrag.kg.postgres_impl import namespace_to_table_name
+
+        table_name = namespace_to_table_name(storage.namespace)
+        offset = 0
+
+        while True:
+            query = f"""
+                SELECT id as key, original_prompt, return_value, chunk_id, cache_type, queryparam,
+                       EXTRACT(EPOCH FROM create_time)::BIGINT as create_time,
+                       EXTRACT(EPOCH FROM update_time)::BIGINT as update_time
+                FROM {table_name}
+                WHERE workspace = $1
+                AND (id LIKE 'default:extract:%' OR id LIKE 'default:summary:%')
+                ORDER BY id
+                LIMIT $2 OFFSET $3
+            """
+
+            results = await storage.db.query(
+                query, [storage.workspace, batch_size, offset], multirows=True
+            )
+
+            if not results:
+                break
+
+            batch = {}
+            for row in results:
+                cache_entry = {
+                    "return": row.get("return_value", ""),
+                    "cache_type": row.get("cache_type"),
+                    "original_prompt": row.get("original_prompt", ""),
+                    "chunk_id": row.get("chunk_id"),
+                    "queryparam": row.get("queryparam"),
+                    "create_time": row.get("create_time", 0),
+                    "update_time": row.get("update_time", 0),
+                }
+                batch[row["key"]] = cache_entry
+
+            if batch:
+                yield batch
+
+            if len(results) < batch_size:
+                break
+
+            offset += batch_size
+            await asyncio.sleep(0)
+
+    async def stream_default_caches_mongo(self, storage, batch_size: int):
+        """Stream default caches from MongoDB - yields batches
+
+        Args:
+            storage: MongoKVStorage instance
+            batch_size: Size of each batch to yield
+
+        Yields:
+            Dictionary batches of cache entries
+        """
+        query = {"_id": {"$regex": "^default:(extract|summary):"}}
+        cursor = storage._data.find(query).batch_size(batch_size)
+
+        batch = {}
+        async for doc in cursor:
+            doc_copy = doc.copy()
+            key = doc_copy.pop("_id")
+
+            # Filter MongoDB/database-specific fields
+            for field_name in ["namespace", "workspace", "_id", "content"]:
+                doc_copy.pop(field_name, None)
+
+            batch[key] = doc_copy
+
+            if len(batch) >= batch_size:
+                yield batch
+                batch = {}
+
+        # Yield remaining items
+        if batch:
+            yield batch
+
+    async def stream_default_caches(
+        self, storage, storage_name: str, batch_size: int = None
+    ):
+        """Stream default caches from any storage type - unified interface
+
+        Args:
+            storage: Storage instance
+            storage_name: Storage type name
+            batch_size: Size of each batch to yield (defaults to self.batch_size)
+
+        Yields:
+            Dictionary batches of cache entries
+        """
+        if batch_size is None:
+            batch_size = self.batch_size
+
+        if storage_name == "JsonKVStorage":
+            async for batch in self.stream_default_caches_json(storage, batch_size):
+                yield batch
+        elif storage_name == "RedisKVStorage":
+            async for batch in self.stream_default_caches_redis(storage, batch_size):
+                yield batch
+        elif storage_name == "PGKVStorage":
+            async for batch in self.stream_default_caches_pg(storage, batch_size):
+                yield batch
+        elif storage_name == "MongoKVStorage":
+            async for batch in self.stream_default_caches_mongo(storage, batch_size):
+                yield batch
+        else:
+            raise ValueError(f"Unsupported storage type: {storage_name}")
+
    async def count_cache_types(self, cache_data: Dict[str, Any]) -> Dict[str, int]:
        """Count cache entries by type

@ -452,43 +803,65 @@ class MigrationTool:
        for key, value in STORAGE_TYPES.items():
            print(f"[{key}] {value}")

-    def get_user_choice(self, prompt: str, valid_choices: list) -> str:
+    def get_user_choice(
+        self, prompt: str, valid_choices: list, allow_exit: bool = False
+    ) -> str:
        """Get user choice with validation

        Args:
            prompt: Prompt message
            valid_choices: List of valid choices
+            allow_exit: If True, allow user to press Enter or input '0' to exit

        Returns:
-            User's choice
+            User's choice, or None if user chose to exit
        """
+        exit_hint = " (Press Enter or 0 to exit)" if allow_exit else ""
        while True:
-            choice = input(f"\n{prompt}: ").strip()
+            choice = input(f"\n{prompt}{exit_hint}: ").strip()
+
+            # Check for exit
+            if allow_exit and (choice == "" or choice == "0"):
+                return None
+
            if choice in valid_choices:
                return choice
            print(f"✗ Invalid choice, please enter one of: {', '.join(valid_choices)}")

-    async def setup_storage(self, storage_type: str) -> tuple:
+    async def setup_storage(
+        self, storage_type: str, use_streaming: bool = False
+    ) -> tuple:
        """Setup and initialize storage

        Args:
            storage_type: Type label (source/target)
+            use_streaming: If True, only count records without loading. If False, load all data (legacy mode)

        Returns:
-            Tuple of (storage_instance, storage_name, workspace, cache_data)
+            Tuple of (storage_instance, storage_name, workspace, total_count)
+            Returns (None, None, None, 0) if user chooses to exit
        """
        print(f"\n=== {storage_type} Storage Setup ===")

-        # Get storage type choice
+        # Get storage type choice - allow exit for source storage
+        allow_exit = storage_type == "Source"
        choice = self.get_user_choice(
-            f"Select {storage_type} storage type (1-4)", list(STORAGE_TYPES.keys())
+            f"Select {storage_type} storage type (1-4)",
+            list(STORAGE_TYPES.keys()),
+            allow_exit=allow_exit,
        )
+
+        # Handle exit
+        if choice is None:
+            print("\n✓ Migration cancelled by user")
+            return None, None, None, 0
+
        storage_name = STORAGE_TYPES[choice]

        # Check environment variables
        print("\nChecking environment variables...")
        if not self.check_env_vars(storage_name):
-            return None, None, None, None
+            return None, None, None, 0

        # Get workspace
        workspace = self.get_workspace_for_storage(storage_name)
@ -502,27 +875,34 @@ class MigrationTool:
            print("- Connection Status: ✓ Success")
        except Exception as e:
            print(f"✗ Initialization failed: {e}")
-            return None, None, None, None
+            return None, None, None, 0

-        # Get cache data
-        print("\nCounting cache records...")
+        # Count cache records efficiently
+        print(f"\n{'Counting' if use_streaming else 'Loading'} cache records...")
        try:
-            cache_data = await self.get_default_caches(storage, storage_name)
-            counts = await self.count_cache_types(cache_data)
+            if use_streaming:
+                # Use efficient counting without loading data
+                total_count = await self.count_default_caches(storage, storage_name)
+                print(f"- Total: {total_count:,} records")
+            else:
+                # Legacy mode: load all data
+                cache_data = await self.get_default_caches(storage, storage_name)
+                counts = await self.count_cache_types(cache_data)
+                total_count = len(cache_data)

-            print(f"- default:extract: {counts['extract']:,} records")
-            print(f"- default:summary: {counts['summary']:,} records")
-            print(f"- Total: {len(cache_data):,} records")
+                print(f"- default:extract: {counts['extract']:,} records")
+                print(f"- default:summary: {counts['summary']:,} records")
+                print(f"- Total: {total_count:,} records")
        except Exception as e:
-            print(f"✗ Counting failed: {e}")
-            return None, None, None, None
+            print(f"✗ {'Counting' if use_streaming else 'Loading'} failed: {e}")
+            return None, None, None, 0

-        return storage, storage_name, workspace, cache_data
+        return storage, storage_name, workspace, total_count

    async def migrate_caches(
        self, source_data: Dict[str, Any], target_storage, target_storage_name: str
    ) -> MigrationStats:
-        """Migrate caches in batches with error tracking
+        """Migrate caches in batches with error tracking (Legacy mode - loads all data)

        Args:
            source_data: Source cache data
@ -597,6 +977,98 @@ class MigrationTool:

        return stats

+    async def migrate_caches_streaming(
+        self,
+        source_storage,
+        source_storage_name: str,
+        target_storage,
+        target_storage_name: str,
+        total_records: int,
+    ) -> MigrationStats:
+        """Migrate caches using streaming approach - minimal memory footprint
+
+        Args:
+            source_storage: Source storage instance
+            source_storage_name: Source storage type name
+            target_storage: Target storage instance
+            target_storage_name: Target storage type name
+            total_records: Total number of records to migrate
+
+        Returns:
+            MigrationStats object with migration results and errors
+        """
+        stats = MigrationStats()
+        stats.total_source_records = total_records
+
+        if stats.total_source_records == 0:
+            print("\nNo records to migrate")
+            return stats
+
+        # Calculate total batches
+        stats.total_batches = (total_records + self.batch_size - 1) // self.batch_size
+
+        print("\n=== Starting Streaming Migration ===")
+        print(
+            f"💡 Memory-optimized mode: Processing {self.batch_size:,} records at a time\n"
+        )
+
+        batch_idx = 0
+
+        # Stream batches from source and write to target immediately
+        async for batch in self.stream_default_caches(
+            source_storage, source_storage_name
+        ):
+            batch_idx += 1
+
+            # Determine current cache type for display
+            if batch:
+                first_key = next(iter(batch.keys()))
+                cache_type = "extract" if "extract" in first_key else "summary"
+            else:
+                cache_type = "unknown"
+
+            try:
+                # Write batch to target storage
+                await target_storage.upsert(batch)
+
+                # Success - update stats
+                stats.successful_batches += 1
+                stats.successful_records += len(batch)
+
+                # Calculate progress with known total
+                progress = (stats.successful_records / total_records) * 100
+                bar_length = 20
+                filled_length = int(
+                    bar_length * stats.successful_records // total_records
+                )
+                bar = "█" * filled_length + "░" * (bar_length - filled_length)
+
+                print(
+                    f"Batch {batch_idx}/{stats.total_batches}: {bar} "
+                    f"{stats.successful_records:,}/{total_records:,} ({progress:.1f}%) - "
+                    f"default:{cache_type} ✓"
+                )
+
+            except Exception as e:
+                # Error - record and continue
+                stats.add_error(batch_idx, e, len(batch))
+
+                print(
+                    f"Batch {batch_idx}/{stats.total_batches}: ✗ FAILED - "
+                    f"{type(e).__name__}: {str(e)}"
+                )
+
+        # Final persist
+        print("\nPersisting data to disk...")
+        try:
+            await target_storage.index_done_callback()
+            print("✓ Data persisted successfully")
+        except Exception as e:
+            print(f"✗ Persist failed: {e}")
+            stats.add_error(0, e, 0)  # batch 0 = persist error
+
+        return stats
+
    def print_migration_report(self, stats: MigrationStats):
        """Print comprehensive migration report

@ -660,41 +1132,42 @@ class MigrationTool:
            print("=" * 60)

    async def run(self):
-        """Run the migration tool"""
+        """Run the migration tool with streaming approach"""
        try:
            # Initialize shared storage (REQUIRED for storage classes to work)
            from lightrag.kg.shared_storage import initialize_share_data
+
            initialize_share_data(workers=1)
-            
+
            # Print header
            self.print_header()
            self.print_storage_types()

-            # Setup source storage
+            # Setup source storage with streaming (only count, don't load all data)
            (
                self.source_storage,
                source_storage_name,
                self.source_workspace,
-                source_data,
-            ) = await self.setup_storage("Source")
+                source_count,
+            ) = await self.setup_storage("Source", use_streaming=True)

-            if not self.source_storage:
-                print("\n✗ Source storage setup failed")
+            # Check if user cancelled (setup_storage returns None for all fields)
+            if self.source_storage is None:
                return

-            if not source_data:
+            if source_count == 0:
                print("\n⚠ Source storage has no cache records to migrate")
                # Cleanup
                await self.source_storage.finalize()
                return

-            # Setup target storage
+            # Setup target storage with streaming (only count, don't load all data)
            (
                self.target_storage,
                target_storage_name,
                self.target_workspace,
-                target_data,
-            ) = await self.setup_storage("Target")
+                target_count,
+            ) = await self.setup_storage("Target", use_streaming=True)

            if not self.target_storage:
                print("\n✗ Target storage setup failed")
@ -707,16 +1180,17 @@ class MigrationTool:
            print("Migration Confirmation")
            print("=" * 50)
            print(
-                f"Source: {source_storage_name} (workspace: {self.source_workspace if self.source_workspace else '(default)'}) - {len(source_data):,} records"
+                f"Source: {source_storage_name} (workspace: {self.source_workspace if self.source_workspace else '(default)'}) - {source_count:,} records"
            )
            print(
-                f"Target: {target_storage_name} (workspace: {self.target_workspace if self.target_workspace else '(default)'}) - {len(target_data):,} records"
+                f"Target: {target_storage_name} (workspace: {self.target_workspace if self.target_workspace else '(default)'}) - {target_count:,} records"
            )
            print(f"Batch Size: {self.batch_size:,} records/batch")
+            print("Memory Mode: Streaming (memory-optimized)")

-            if target_data:
+            if target_count > 0:
                print(
-                    f"\n⚠️ Warning: Target storage already has {len(target_data):,} records"
+                    f"\n⚠️ Warning: Target storage already has {target_count:,} records"
                )
                print("Migration will overwrite records with the same keys")

@ -729,9 +1203,13 @@ class MigrationTool:
                await self.target_storage.finalize()
                return

-            # Perform migration with error tracking
-            stats = await self.migrate_caches(
-                source_data, self.target_storage, target_storage_name
+            # Perform streaming migration with error tracking
+            stats = await self.migrate_caches_streaming(
+                self.source_storage,
+                source_storage_name,
+                self.target_storage,
+                target_storage_name,
+                source_count,
            )

            # Print comprehensive migration report
@ -760,10 +1238,11 @@ class MigrationTool:
                    await self.target_storage.finalize()
                except Exception:
                    pass
-            
+
            # Finalize shared storage
            try:
                from lightrag.kg.shared_storage import finalize_share_data
+
                finalize_share_data()
            except Exception:
                pass