""" Download all necessary cache files for offline deployment. This module provides a CLI command to download tiktoken model cache files for offline environments where internet access is not available. """ import os import sys from pathlib import Path def download_tiktoken_cache(cache_dir: str = None, models: list = None): """Download tiktoken models to local cache Args: cache_dir: Directory to store the cache files. If None, uses default location. models: List of model names to download. If None, downloads common models. Returns: Tuple of (success_count, failed_models) """ try: import tiktoken except ImportError: print("Error: tiktoken is not installed.") print("Install with: pip install tiktoken") sys.exit(1) # Set cache directory if provided if cache_dir: cache_dir = os.path.abspath(cache_dir) os.environ["TIKTOKEN_CACHE_DIR"] = cache_dir cache_path = Path(cache_dir) cache_path.mkdir(parents=True, exist_ok=True) print(f"Using cache directory: {cache_dir}") else: cache_dir = os.environ.get( "TIKTOKEN_CACHE_DIR", str(Path.home() / ".tiktoken_cache") ) print(f"Using default cache directory: {cache_dir}") # Common models used by LightRAG and OpenAI if models is None: models = [ "gpt-4o-mini", # Default model for LightRAG "gpt-4o", # GPT-4 Omni "gpt-4", # GPT-4 "gpt-3.5-turbo", # GPT-3.5 Turbo "text-embedding-ada-002", # Legacy embedding model "text-embedding-3-small", # Small embedding model "text-embedding-3-large", # Large embedding model ] print(f"\nDownloading {len(models)} tiktoken models...") print("=" * 70) success_count = 0 failed_models = [] for i, model in enumerate(models, 1): try: print(f"[{i}/{len(models)}] Downloading {model}...", end=" ", flush=True) encoding = tiktoken.encoding_for_model(model) # Trigger download by encoding a test string encoding.encode("test") print("✓ Done") success_count += 1 except KeyError as e: print(f"✗ Failed: Unknown model '{model}'") failed_models.append((model, str(e))) except Exception as e: print(f"✗ Failed: {e}") failed_models.append((model, str(e))) print("=" * 70) print(f"\n✓ Successfully cached {success_count}/{len(models)} models") if failed_models: print(f"\n✗ Failed to download {len(failed_models)} models:") for model, error in failed_models: print(f" - {model}: {error}") print(f"\nCache location: {cache_dir}") print("\nFor offline deployment:") print(" 1. Copy directory to offline server:") print(f" tar -czf tiktoken_cache.tar.gz {cache_dir}") print(" scp tiktoken_cache.tar.gz user@offline-server:/path/to/") print("") print(" 2. On offline server, extract and set environment variable:") print(" tar -xzf tiktoken_cache.tar.gz") print(" export TIKTOKEN_CACHE_DIR=/path/to/tiktoken_cache") print("") print(" 3. 
Or copy to default location:") print(f" cp -r {cache_dir} ~/.tiktoken_cache/") return success_count, failed_models def main(): """Main entry point for the CLI command""" import argparse parser = argparse.ArgumentParser( prog="lightrag-download-cache", description="Download cache files for LightRAG offline deployment", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Download to default location (~/.tiktoken_cache) lightrag-download-cache # Download to specific directory lightrag-download-cache --cache-dir ./offline_cache/tiktoken # Download specific models only lightrag-download-cache --models gpt-4o-mini gpt-4 For more information, visit: https://github.com/HKUDS/LightRAG """, ) parser.add_argument( "--cache-dir", help="Cache directory path (default: ~/.tiktoken_cache)", default=None, ) parser.add_argument( "--models", nargs="+", help="Specific models to download (default: common models)", default=None, ) parser.add_argument( "--version", action="version", version="%(prog)s (LightRAG cache downloader)" ) args = parser.parse_args() print("=" * 70) print("LightRAG Offline Cache Downloader") print("=" * 70) try: success_count, failed_models = download_tiktoken_cache( args.cache_dir, args.models ) print("\n" + "=" * 70) print("Download Complete") print("=" * 70) # Exit with error code if all downloads failed if success_count == 0: print("\n✗ All downloads failed. Please check your internet connection.") sys.exit(1) # Exit with warning code if some downloads failed elif failed_models: print( f"\n⚠ Some downloads failed ({len(failed_models)}/{success_count + len(failed_models)})" ) sys.exit(2) else: print("\n✓ All cache files downloaded successfully!") sys.exit(0) except KeyboardInterrupt: print("\n\n✗ Download interrupted by user") sys.exit(130) except Exception as e: print(f"\n\n✗ Error: {e}") import traceback traceback.print_exc() sys.exit(1) if __name__ == "__main__": main()
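
# ---------------------------------------------------------------------------
# Offline usage sketch (illustrative only, not executed by this module).
# Shows how an application on the offline server would consume the cache
# prepared above. Assumes a tiktoken version that recognizes the model name
# and that the cache directory has been copied to /path/to/tiktoken_cache:
#
#     import os
#     # Point tiktoken at the copied cache before any encoding is loaded.
#     os.environ["TIKTOKEN_CACHE_DIR"] = "/path/to/tiktoken_cache"
#
#     import tiktoken
#     enc = tiktoken.encoding_for_model("gpt-4o-mini")  # resolved from local cache
#     print(len(enc.encode("hello offline world")))     # no network access needed
# ---------------------------------------------------------------------------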