Add comprehensive test suites for prompt evaluation:
- test_prompt_accuracy.py: 365 lines testing prompt extraction accuracy
- test_prompt_quality_deep.py: 672 lines for deep quality analysis
- Refactor prompt.py to consolidate optimized variants (removed prompt_optimized.py)
- Apply ruff formatting and type hints across 30 files
- Update pyrightconfig.json for static type checking
- Modernize reproduce scripts and examples with improved type annotations
- Sync uv.lock dependencies
"""
|
|
Download all necessary cache files for offline deployment.
|
|
|
|
This module provides a CLI command to download tiktoken model cache files
|
|
for offline environments where internet access is not available.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
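
# A minimal offline-usage sketch (illustrative, not part of this module's API;
# the cache path below is a placeholder). Once the cache is populated, a
# consumer only needs TIKTOKEN_CACHE_DIR pointed at it before loading an
# encoding:
#
#     import os
#     os.environ['TIKTOKEN_CACHE_DIR'] = '/srv/tiktoken_cache'
#     import tiktoken
#     enc = tiktoken.encoding_for_model('gpt-4o-mini')
#     enc.encode('hello')  # served from the local cache, no network needed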


def download_tiktoken_cache(
    cache_dir: str | None = None,
    models: list[str] | None = None,
) -> tuple[int, list[tuple[str, str]]]:
    """Download tiktoken models to local cache.

    Args:
        cache_dir: Directory to store the cache files. If None, uses default location.
        models: List of model names to download. If None, downloads common models.

    Returns:
        Tuple of (success_count, failed_models)
    """
    try:
        import tiktoken
    except ImportError:
        print('Error: tiktoken is not installed.')
        print('Install with: pip install tiktoken')
        sys.exit(1)

    # Set cache directory if provided
    if cache_dir:
        cache_dir = os.path.abspath(cache_dir)
        os.environ['TIKTOKEN_CACHE_DIR'] = cache_dir
        cache_path = Path(cache_dir)
        cache_path.mkdir(parents=True, exist_ok=True)
        print(f'Using cache directory: {cache_dir}')
    else:
        cache_dir = os.environ.get('TIKTOKEN_CACHE_DIR', str(Path.home() / '.tiktoken_cache'))
        # Without TIKTOKEN_CACHE_DIR set, tiktoken falls back to an internal
        # temporary directory, so point it at the default location explicitly.
        os.environ['TIKTOKEN_CACHE_DIR'] = cache_dir
        Path(cache_dir).mkdir(parents=True, exist_ok=True)
        print(f'Using default cache directory: {cache_dir}')

    # Common models used by LightRAG and OpenAI
    if models is None:
        models = [
            'gpt-4o-mini',  # Default model for LightRAG
            'gpt-4o',  # GPT-4 Omni
            'gpt-4',  # GPT-4
            'gpt-3.5-turbo',  # GPT-3.5 Turbo
            'text-embedding-ada-002',  # Legacy embedding model
            'text-embedding-3-small',  # Small embedding model
            'text-embedding-3-large',  # Large embedding model
        ]
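        # Note: several of these models share a vocabulary file (gpt-4,
        # gpt-3.5-turbo, and the text-embedding-* models map to cl100k_base;
        # gpt-4o and gpt-4o-mini map to o200k_base), so the default list
        # results in only two files actually being fetched.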

    print(f'\nDownloading {len(models)} tiktoken models...')
    print('=' * 70)

    success_count = 0
    failed_models: list[tuple[str, str]] = []

    for i, model in enumerate(models, 1):
        try:
            print(f'[{i}/{len(models)}] Downloading {model}...', end=' ', flush=True)
            encoding = tiktoken.encoding_for_model(model)
            # Loading the encoding fetches the vocabulary file into the cache;
            # encoding a test string verifies it is usable
            encoding.encode('test')
            print('✓ Done')
            success_count += 1
        except KeyError as e:
            print(f"✗ Failed: Unknown model '{model}'")
            failed_models.append((model, str(e)))
        except Exception as e:
            print(f'✗ Failed: {e}')
            failed_models.append((model, str(e)))

    print('=' * 70)
    print(f'\n✓ Successfully cached {success_count}/{len(models)} models')

    if failed_models:
        print(f'\n✗ Failed to download {len(failed_models)} models:')
        for model, error in failed_models:
            print(f'  - {model}: {error}')

    print(f'\nCache location: {cache_dir}')
    print('\nFor offline deployment:')
    print('  1. Copy directory to offline server:')
    print(f'     tar -czf tiktoken_cache.tar.gz {cache_dir}')
    print('     scp tiktoken_cache.tar.gz user@offline-server:/path/to/')
    print('')
    print('  2. On offline server, extract and set environment variable:')
    print('     tar -xzf tiktoken_cache.tar.gz')
    print('     export TIKTOKEN_CACHE_DIR=/path/to/tiktoken_cache')
    print('')
    print('  3. Or copy the cache contents to the default location:')
    print(f'     cp -r {cache_dir}/. ~/.tiktoken_cache/')

    return success_count, failed_models
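
# Programmatic-use sketch (hedged; the return shape is as documented above,
# and the directory is illustrative):
#
#     count, failed = download_tiktoken_cache('./offline_cache/tiktoken')
#     for model, error in failed:
#         print(f'could not cache {model}: {error}')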


def main():
    """Main entry point for the CLI command"""
    import argparse

    parser = argparse.ArgumentParser(
        prog='lightrag-download-cache',
        description='Download cache files for LightRAG offline deployment',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download to default location (~/.tiktoken_cache)
  lightrag-download-cache

  # Download to specific directory
  lightrag-download-cache --cache-dir ./offline_cache/tiktoken

  # Download specific models only
  lightrag-download-cache --models gpt-4o-mini gpt-4

For more information, visit: https://github.com/HKUDS/LightRAG
""",
    )

    parser.add_argument(
        '--cache-dir',
        help='Cache directory path (default: ~/.tiktoken_cache)',
        default=None,
    )
    parser.add_argument(
        '--models',
        nargs='+',
        help='Specific models to download (default: common models)',
        default=None,
    )
    parser.add_argument('--version', action='version', version='%(prog)s (LightRAG cache downloader)')

    args = parser.parse_args()

    print('=' * 70)
    print('LightRAG Offline Cache Downloader')
    print('=' * 70)

    try:
        success_count, failed_models = download_tiktoken_cache(args.cache_dir, args.models)

        print('\n' + '=' * 70)
        print('Download Complete')
        print('=' * 70)

        # Exit with error code if all downloads failed
        if success_count == 0:
            print('\n✗ All downloads failed. Please check your internet connection.')
            sys.exit(1)
        # Exit with warning code if some downloads failed
        elif failed_models:
            print(f'\n⚠ Some downloads failed ({len(failed_models)}/{success_count + len(failed_models)})')
            sys.exit(2)
        else:
            print('\n✓ All cache files downloaded successfully!')
            sys.exit(0)

    except KeyboardInterrupt:
        print('\n\n✗ Download interrupted by user')
        sys.exit(130)
    except Exception as e:
        print(f'\n\n✗ Error: {e}')
        import traceback

        traceback.print_exc()
        sys.exit(1)
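

# Exit-code contract implemented by main(): 0 = all models cached, 1 = all
# downloads failed, 2 = partial failure, 130 = interrupted by the user.
# A wrapper can branch on it; a sketch (the script path is illustrative):
#
#     import subprocess, sys
#     rc = subprocess.run([sys.executable, 'download_cache.py']).returncode
#     if rc == 2:
#         print('partial cache; re-run to retry the failed models')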


if __name__ == '__main__':
    main()