# LightRAG/scripts/migrate_workspace_to_tenant.py

#!/usr/bin/env python
"""
Workspace-to-Tenant Migration Script

Migrates existing single-tenant workspace-based deployments to multi-tenant architecture.
This script:
1. Scans existing workspace directories
2. Creates a default tenant for each workspace
3. Creates a default knowledge base within each tenant
4. Preserves all existing data structures for backward compatibility

Usage:
    python migrate_workspace_to_tenant.py --working-dir /path/to/rag_storage
    python migrate_workspace_to_tenant.py --working-dir /path/to/rag_storage --dry-run
    python migrate_workspace_to_tenant.py --working-dir /path/to/rag_storage --skip-backup
"""

import asyncio
import argparse
import os
import sys
import shutil
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional
from lightrag.services.tenant_service import TenantService
from lightrag.models.tenant import Tenant, TenantConfig
from lightrag.utils import logger


class WorkspaceToTenantMigrator:
    """
    Handles migration from workspace-based to multi-tenant architecture.

    For every workspace directory discovered under ``working_dir`` the
    migrator asks ``TenantService`` to create a tenant named after the
    workspace and a knowledge base called "default" inside that tenant.
    Progress messages are collected in ``migration_log`` and failures in
    ``error_log``; both end up in the final report.
    """

    # Suffixes of the ``kv_store_<name>.json`` files whose presence marks a
    # directory as a workspace.
    _KV_STORE_NAMES = ("full_docs", "text_chunks", "entities", "relations")

    def __init__(self, working_dir: str, dry_run: bool = False, backup: bool = True):
        """
        Initialize the migrator.

        Args:
            working_dir: Root directory containing workspace folders
            dry_run: If True, simulate migration without making changes
            backup: If True, create backup before migration
        """
        self.working_dir = Path(working_dir)
        self.dry_run = dry_run
        self.backup = backup
        self.tenant_service = TenantService()
        self.migration_log: List[str] = []
        self.error_log: List[str] = []

    def validate_working_dir(self) -> bool:
        """
        Validate that the working directory exists and is a directory.

        Returns:
            True if the path is usable. Otherwise False, with the reason
            appended to ``error_log``.
        """
        if not self.working_dir.exists():
            self.error_log.append(f"Working directory does not exist: {self.working_dir}")
            return False

        if not self.working_dir.is_dir():
            self.error_log.append(f"Path is not a directory: {self.working_dir}")
            return False

        return True

    def discover_workspaces(self) -> List[str]:
        """
        Discover existing workspace directories.

        A direct child directory of ``working_dir`` counts as a workspace when
        any of the following holds:
        - it contains one of kv_store_full_docs.json, kv_store_text_chunks.json,
          kv_store_entities.json or kv_store_relations.json
        - it contains doc_status_storage.json
        - its name starts with "workspace_"

        Directories whose name starts with "." or "__" are always skipped.

        Returns:
            Sorted list of workspace directory names
        """
        workspaces = []

        for item in self.working_dir.iterdir():
            if not item.is_dir():
                continue

            # Skip special directories (hidden folders, __pycache__, ...)
            if item.name.startswith(('.', '__')):
                continue

            # Generator form lets any() stop at the first marker file found.
            has_rag_files = any(
                (item / f"kv_store_{name}.json").exists()
                for name in self._KV_STORE_NAMES
            ) or (item / "doc_status_storage.json").exists()

            if has_rag_files or item.name.startswith("workspace_"):
                workspaces.append(item.name)

        return sorted(workspaces)

    def backup_working_dir(self) -> Optional[Path]:
        """
        Create a backup of the working directory.

        The backup is a timestamped sibling of the working directory. In
        dry-run mode the target path is computed and logged but nothing is
        copied.

        Returns:
            Path to backup directory, or None if backups are disabled or the
            backup failed (the failure is recorded in ``error_log``)
        """
        if not self.backup:
            return None

        try:
            # Resolve first: for a relative path such as "." both ``name`` and
            # ``parent`` are degenerate, which would place the backup *inside*
            # the directory being backed up.
            source = self.working_dir.resolve()
            stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            backup_dir = source.parent / f"{source.name}_backup_{stamp}"

            msg = f"Creating backup at {backup_dir}"
            logger.info(msg)
            self.migration_log.append(msg)

            if not self.dry_run:
                shutil.copytree(source, backup_dir)

            return backup_dir
        except Exception as e:
            msg = f"Failed to create backup: {e}"
            logger.error(msg)
            self.error_log.append(msg)
            return None

    async def migrate_workspace(self, workspace_name: str) -> bool:
        """
        Migrate a single workspace to multi-tenant structure.

        Creates a tenant named after the workspace ("default" when the name is
        empty) and a "default" knowledge base inside it. In dry-run mode only
        the intended action is logged.

        Note: if the knowledge base creation fails after the tenant was
        created, the tenant is not rolled back; the failure is only reported.

        Args:
            workspace_name: Name of the workspace to migrate

        Returns:
            True if migration successful, False otherwise
        """
        try:
            msg = f"\nMigrating workspace: {workspace_name}"
            logger.info(msg)
            self.migration_log.append(msg)

            # An empty workspace name maps to the "default" tenant.
            tenant_name = workspace_name or "default"

            if not self.dry_run:
                tenant = await self.tenant_service.create_tenant(
                    tenant_name=tenant_name,
                    config=None  # Use default config
                )

                msg = f"  ✓ Created tenant '{tenant_name}' with ID: {tenant.tenant_id}"
                logger.info(msg)
                self.migration_log.append(msg)

                # Create default knowledge base
                kb = await self.tenant_service.create_knowledge_base(
                    tenant_id=tenant.tenant_id,
                    kb_name="default",
                    description="Default knowledge base (migrated from workspace)"
                )

                msg = f"  ✓ Created default KB with ID: {kb.kb_id}"
                logger.info(msg)
                self.migration_log.append(msg)
            else:
                msg = f"  [DRY RUN] Would create tenant '{tenant_name}' with default KB"
                logger.info(msg)
                self.migration_log.append(msg)

            return True

        except Exception as e:
            # Best effort per workspace: record the failure and let the caller
            # continue with the remaining workspaces.
            msg = f"  ✗ Failed to migrate workspace '{workspace_name}': {e}"
            logger.error(msg)
            self.error_log.append(msg)
            return False

    async def migrate_all_workspaces(self, workspaces: List[str]) -> Dict[str, bool]:
        """
        Migrate all discovered workspaces, one after another.

        Args:
            workspaces: List of workspace names to migrate

        Returns:
            Dictionary mapping workspace name to migration status
        """
        results: Dict[str, bool] = {}

        for workspace in workspaces:
            results[workspace] = await self.migrate_workspace(workspace)

        return results

    def generate_report(self, workspaces: List[str], results: Dict[str, bool]) -> str:
        """
        Generate a migration report.

        The report consists of a summary header, every entry of
        ``migration_log`` and, if any, every entry of ``error_log``.

        Args:
            workspaces: List of workspaces processed
            results: Migration results

        Returns:
            Formatted report string
        """
        successful = sum(1 for v in results.values() if v)
        # Anything that was scheduled but did not succeed counts as failed.
        failed = len(workspaces) - successful

        header = f"""
╔══════════════════════════════════════════════════════════════╗
║           WORKSPACE-TO-TENANT MIGRATION REPORT              ║
╚══════════════════════════════════════════════════════════════╝

Working Directory: {self.working_dir}
Dry Run Mode: {self.dry_run}
Workspaces Processed: {len(workspaces)}
Successfully Migrated: {successful}
Failed: {failed}

Migration Log:
"""
        # Collect the pieces and join once instead of growing a string in loops.
        parts = [header]
        parts.extend(f"\n{line}" for line in self.migration_log)

        if self.error_log:
            parts.append("\n\nErrors Encountered:")
            parts.extend(f"\n{error}" for error in self.error_log)

        parts.append("\n")
        return "".join(parts)

    async def run(self) -> bool:
        """
        Execute the migration process.

        Steps: validate the working directory, discover workspaces, create a
        backup (skipped in dry-run mode), migrate every workspace, print the
        report and, outside dry-run mode, save it into the working directory.

        Returns:
            True if every workspace migrated successfully (or there was
            nothing to migrate), False otherwise
        """
        # Validate setup
        if not self.validate_working_dir():
            logger.error("Validation failed")
            # Surface the concrete reason(s); otherwise they stay hidden in
            # error_log because no report is produced on this path.
            for error in self.error_log:
                logger.error(error)
            return False

        # Discover workspaces
        workspaces = self.discover_workspaces()

        if not workspaces:
            msg = "No workspaces found to migrate"
            logger.warning(msg)
            self.migration_log.append(msg)
            return True

        msg = f"Discovered {len(workspaces)} workspace(s): {', '.join(workspaces)}"
        logger.info(msg)
        self.migration_log.append(msg)

        # Create backup if not dry-run
        if not self.dry_run:
            backup_path = self.backup_working_dir()
            if not backup_path and self.backup:
                # Deliberately non-fatal: the migration only adds tenants.
                logger.warning("Backup failed but continuing with migration")

        # Migrate workspaces
        results = await self.migrate_all_workspaces(workspaces)

        # Generate and display report
        report = self.generate_report(workspaces, results)
        print(report)

        # Save report to file
        report_path = self.working_dir / f"migration_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        try:
            if not self.dry_run:
                # The report contains box-drawing and check-mark characters, so
                # the encoding must be explicit; the platform default (e.g.
                # cp1252) cannot represent them.
                with open(report_path, 'w', encoding='utf-8') as f:
                    f.write(report)
                logger.info(f"Migration report saved to {report_path}")
        except Exception as e:
            # Saving the report is best effort; it was already printed above.
            logger.error(f"Failed to save migration report: {e}")

        # Return success if no failures
        return all(results.values())


def main():
    """
    Command-line entry point.

    Parses --working-dir, --dry-run and --skip-backup, runs the migrator and
    terminates the process with exit status 0 on success or 1 on failure,
    user interruption, or an unexpected error.
    """
    usage_examples = """
Examples:
  # Perform actual migration
  python migrate_workspace_to_tenant.py --working-dir /path/to/rag_storage

  # Preview what would be migrated without making changes
  python migrate_workspace_to_tenant.py --working-dir /path/to/rag_storage --dry-run

  # Migrate without creating backup
  python migrate_workspace_to_tenant.py --working-dir /path/to/rag_storage --skip-backup
        """

    cli = argparse.ArgumentParser(
        description="Migrate workspace-based deployment to multi-tenant architecture",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument(
        "--working-dir",
        required=True,
        help="Path to the working directory containing workspaces",
    )
    # Both remaining options are plain on/off switches.
    for flag, help_text in (
        ("--dry-run", "Simulate migration without making actual changes"),
        ("--skip-backup", "Skip creating a backup of the working directory"),
    ):
        cli.add_argument(flag, action="store_true", help=help_text)

    options = cli.parse_args()

    # The CLI exposes the negative form (--skip-backup); the migrator takes
    # the positive one.
    migrator = WorkspaceToTenantMigrator(
        working_dir=options.working_dir,
        dry_run=options.dry_run,
        backup=not options.skip_backup,
    )

    try:
        succeeded = asyncio.run(migrator.run())
    except KeyboardInterrupt:
        logger.warning("Migration interrupted by user")
        sys.exit(1)
    except Exception as exc:
        logger.error(f"Migration failed: {exc}", exc_info=True)
        sys.exit(1)

    sys.exit(0 if succeeded else 1)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()