# LightRAG/lightrag/tenant_rag_manager.py

"""
Tenant-aware LightRAG instance manager with caching and isolation.

This module manages per-tenant and per-knowledge-base LightRAG instances,
handling initialization, caching, cleanup, and proper isolation between tenants.
"""

from typing import Dict, Optional, Tuple
from lightrag import LightRAG
from lightrag.services.tenant_service import TenantService
from lightrag.utils import logger
from lightrag.security import validate_identifier, validate_working_directory
import asyncio
import os


class TenantRAGManager:
    """
    Manages LightRAG instances per tenant/KB combination with caching and isolation.

    Features:
    - Automatic instance caching to avoid repeated initialization
    - Per-tenant isolation through separate working directories
    - Configurable max cached instances (LRU eviction)
    - Async-safe initialization with double-check locking
    - Proper resource cleanup on instance removal

    Invariants maintained by this class:
    - ``_instances`` maps ``(tenant_id, kb_id)`` to a live LightRAG instance.
    - ``_access_order`` lists the same keys from least to most recently used.
    - Every mutation that removes an entry happens while ``_lock`` is held,
      and the entry is dropped from the cache *before* its storages are
      finalized, so lookups never hand out an instance that is being torn down.
    """

    def __init__(
        self,
        base_working_dir: str,
        tenant_service: TenantService,
        template_rag: Optional[LightRAG] = None,
        max_cached_instances: int = 100,
    ):
        """
        Initialize the TenantRAGManager.

        Args:
            base_working_dir: Base directory for all tenant/KB data storage
            tenant_service: Service for retrieving tenant configuration
            template_rag: Template RAG instance to copy configuration from
            max_cached_instances: Maximum number of LightRAG instances to keep cached
        """
        self.base_working_dir = base_working_dir
        self.tenant_service = tenant_service
        self.template_rag = template_rag
        self.max_cached_instances = max_cached_instances
        self._instances: Dict[Tuple[str, str], LightRAG] = {}
        self._lock = asyncio.Lock()
        self._access_order: list[Tuple[str, str]] = []  # Track access order for LRU
        logger.info(
            f"TenantRAGManager initialized with base_dir={base_working_dir}, "
            f"max_instances={max_cached_instances}, template_rag={template_rag is not None}"
        )

    async def get_rag_instance(
        self,
        tenant_id: str,
        kb_id: str,
        user_id: Optional[str] = None,
    ) -> LightRAG:
        """
        Get or create a LightRAG instance for a tenant/KB combination.

        This method implements double-check locking to avoid race conditions
        when multiple requests try to initialize the same instance concurrently.
        Instances are cached and reused across requests for the same tenant/KB.

        SECURITY: User access to the tenant is verified on every call, including
        cache hits, before an instance is returned.

        NOTE: The tenant's existence/active flag is only checked when a new
        instance is created; a cached instance is not re-validated against it.

        Args:
            tenant_id: The tenant ID (must be valid UUID)
            kb_id: The knowledge base ID (must be valid UUID)
            user_id: User identifier from JWT token (required for security validation)

        Returns:
            LightRAG: A properly initialized LightRAG instance for this tenant/KB

        Raises:
            ValueError: If the tenant does not exist or is inactive
            PermissionError: If user does not have access to the tenant
            HTTPException: If tenant_id or kb_id are invalid identifiers
        """
        # SECURITY: Validate identifier format to prevent injection attacks
        tenant_id = validate_identifier(tenant_id, "tenant_id")
        kb_id = validate_identifier(kb_id, "kb_id")

        cache_key = (tenant_id, kb_id)
        authorized = False

        # First check (fast path - no lock)
        if cache_key in self._instances:
            # SECURITY: a cached instance must not bypass the access check.
            await self._authorize_user(tenant_id, user_id)
            authorized = True
            # The await above yields control, so the entry may have been
            # evicted meanwhile; re-read it instead of trusting the first look.
            instance = self._instances.get(cache_key)
            if instance is not None:
                self._touch(cache_key)
                logger.debug(f"Cache hit for tenant={tenant_id}, kb={kb_id}")
                return instance

        # Acquire lock for initialization
        async with self._lock:
            # Second check (double-check locking pattern)
            instance = self._instances.get(cache_key)
            if instance is not None:
                if not authorized:
                    await self._authorize_user(tenant_id, user_id)
                self._touch(cache_key)
                logger.debug(
                    f"Cache hit (after lock) for tenant={tenant_id}, kb={kb_id}"
                )
                return instance

            logger.info(f"Creating new RAG instance for tenant={tenant_id}, kb={kb_id}")

            # Get tenant configuration. This runs before the access check so a
            # missing/inactive tenant is reported as ValueError.
            tenant = await self.tenant_service.get_tenant(tenant_id)
            if not tenant or not tenant.is_active:
                raise ValueError(f"Tenant {tenant_id} not found or inactive")

            if not authorized:
                await self._authorize_user(tenant_id, user_id)

            # SECURITY: Create and validate tenant-specific working directory
            # This prevents path traversal attacks
            tenant_working_dir, composite_workspace = validate_working_directory(
                self.base_working_dir, tenant_id, kb_id
            )
            os.makedirs(tenant_working_dir, exist_ok=True)

            try:
                instance = self._build_instance(
                    tenant, tenant_working_dir, composite_workspace
                )
                # Initialize the instance's storages
                await instance.initialize_storages()
            except Exception as e:
                logger.error(
                    f"Error creating RAG instance for tenant={tenant_id}, kb={kb_id}: {e}"
                )
                raise

            # Make room only once the new instance is known to be usable.
            await self._evict_until_room()

            # Cache the instance
            self._instances[cache_key] = instance
            self._access_order.append(cache_key)
            logger.info(
                f"RAG instance created and cached for tenant={tenant_id}, kb={kb_id}"
            )
            return instance

    def _touch(self, cache_key: Tuple[str, str]) -> None:
        """Mark ``cache_key`` as the most recently used entry in the LRU order."""
        if cache_key in self._access_order:
            self._access_order.remove(cache_key)
        self._access_order.append(cache_key)

    @staticmethod
    def _user_auth_required() -> bool:
        """Return whether requests without a user_id must be rejected.

        Reads ``REQUIRE_USER_AUTH`` from ``lightrag.api.config``; if that
        module cannot be imported, authentication is treated as optional.
        """
        # SEC-003 FIX: Check if user authentication is required
        try:
            from lightrag.api.config import REQUIRE_USER_AUTH
        except ImportError:
            return False
        return bool(REQUIRE_USER_AUTH)

    async def _authorize_user(self, tenant_id: str, user_id: Optional[str]) -> None:
        """Verify that ``user_id`` may access ``tenant_id``.

        Raises:
            PermissionError: If the user lacks access, or if no user_id was
                given while user authentication is required.
        """
        if user_id:
            has_access = await self.tenant_service.verify_user_access(
                user_id, tenant_id
            )
            if not has_access:
                logger.warning(
                    f"Access denied: user={user_id} attempted to access tenant={tenant_id}"
                )
                raise PermissionError(f"Access denied to tenant {tenant_id}")
            return

        if self._user_auth_required():
            logger.error(
                f"Access denied: user_id required but not provided for tenant={tenant_id}"
            )
            raise PermissionError("User authentication required for tenant access")

        logger.warning(
            "No user_id provided for tenant access - allowing for backward compatibility"
        )

    def _build_instance(
        self, tenant, tenant_working_dir: str, composite_workspace: str
    ) -> LightRAG:
        """Construct (but do not initialize) a LightRAG instance for a tenant.

        ``tenant`` is the object returned by ``tenant_service.get_tenant``; its
        ``config`` supplies ``top_k``, ``cosine_threshold``, an optional
        ``chunk_top_k`` and optional storage overrides in ``custom_metadata``.
        """
        config = tenant.config
        # Tolerate a missing or None custom_metadata instead of crashing.
        metadata = getattr(config, "custom_metadata", None) or {}

        # Tenant-specific query settings shared by both construction paths.
        tenant_settings = {
            "top_k": config.top_k,
            "chunk_top_k": getattr(config, "chunk_top_k", 40),
            "cosine_threshold": config.cosine_threshold,
        }

        template = self.template_rag
        if template:
            # Copy configuration from template RAG; tenant metadata may
            # override the storage backends.
            return LightRAG(
                working_dir=tenant_working_dir,
                workspace=composite_workspace,
                llm_model_func=template.llm_model_func,
                llm_model_name=template.llm_model_name,
                llm_model_max_async=template.llm_model_max_async,
                llm_model_kwargs=template.llm_model_kwargs,
                embedding_func=template.embedding_func,
                default_llm_timeout=template.default_llm_timeout,
                default_embedding_timeout=template.default_embedding_timeout,
                kv_storage=metadata.get("kv_storage") or template.kv_storage,
                vector_storage=metadata.get("vector_storage")
                or template.vector_storage,
                graph_storage=metadata.get("graph_storage") or template.graph_storage,
                doc_status_storage=template.doc_status_storage,
                vector_db_storage_cls_kwargs=template.vector_db_storage_cls_kwargs,
                enable_llm_cache=template.enable_llm_cache,
                enable_llm_cache_for_entity_extract=template.enable_llm_cache_for_entity_extract,
                rerank_model_func=template.rerank_model_func,
                chunk_token_size=template.chunk_token_size,
                chunk_overlap_token_size=template.chunk_overlap_token_size,
                max_parallel_insert=template.max_parallel_insert,
                max_graph_nodes=template.max_graph_nodes,
                addon_params=template.addon_params,
                ollama_server_infos=getattr(template, "ollama_server_infos", None),
                # Override with tenant-specific settings
                **tenant_settings,
            )

        # Fallback to basic configuration (will likely fail without embedding_func)
        return LightRAG(
            working_dir=tenant_working_dir,
            workspace=composite_workspace,
            kv_storage=metadata.get("kv_storage") or "JsonKVStorage",
            vector_storage=metadata.get("vector_storage") or "NanoVectorDBStorage",
            graph_storage=metadata.get("graph_storage") or "NetworkXStorage",
            **tenant_settings,
        )

    async def _evict_until_room(self) -> None:
        """Evict least recently used instances until one more entry fits.

        Must be called with ``self._lock`` held. Stale keys in the LRU order
        (no matching cached instance) are skipped rather than counted as an
        eviction.
        """
        while (
            len(self._instances) >= self.max_cached_instances and self._access_order
        ):
            oldest_key = self._access_order.pop(0)
            # Remove from the cache before finalizing so concurrent lookups
            # cannot receive an instance that is being torn down.
            evicted = self._instances.pop(oldest_key, None)
            if evicted is None:
                continue
            logger.info(
                f"Evicting LRU instance: tenant={oldest_key[0]}, kb={oldest_key[1]}"
            )
            try:
                await evicted.finalize_storages()
            except Exception as e:
                # Best effort: a failing finalize must not block new instances.
                logger.error(f"Error finalizing evicted instance: {e}")

    async def cleanup_instance(self, tenant_id: str, kb_id: str) -> None:
        """
        Clean up and remove a cached instance.

        This method should be called when a knowledge base is deleted or
        a tenant is removed to ensure proper resource cleanup. It is a no-op
        when no instance is cached for the given pair.

        Args:
            tenant_id: The tenant ID
            kb_id: The knowledge base ID
        """
        cache_key = (tenant_id, kb_id)
        async with self._lock:
            # Drop the entry first so it cannot be served while finalizing.
            instance = self._instances.pop(cache_key, None)
            if instance is None:
                return
            if cache_key in self._access_order:
                self._access_order.remove(cache_key)
            logger.info(f"Cleaning up RAG instance for tenant={tenant_id}, kb={kb_id}")
            try:
                await instance.finalize_storages()
            except Exception as e:
                logger.error(f"Error finalizing instance during cleanup: {e}")

    async def cleanup_tenant_instances(self, tenant_id: str) -> None:
        """
        Clean up all cached instances for a specific tenant.

        This method should be called when a tenant is deleted to ensure
        all its knowledge bases are properly cleaned up.

        Args:
            tenant_id: The tenant ID
        """
        async with self._lock:
            # Detach every matching entry before the first await so none of
            # them can be returned by a concurrent lookup during finalization.
            removed = []
            for key in [k for k in self._instances if k[0] == tenant_id]:
                removed.append((key, self._instances.pop(key)))
                if key in self._access_order:
                    self._access_order.remove(key)

            for key, instance in removed:
                logger.info(
                    f"Cleaning up RAG instance for tenant={key[0]}, kb={key[1]}"
                )
                try:
                    await instance.finalize_storages()
                except Exception as e:
                    logger.error(
                        f"Error finalizing instance during tenant cleanup: {e}"
                    )

    async def cleanup_all(self) -> None:
        """
        Clean up all cached instances.

        This should be called during application shutdown to ensure
        all resources are properly released.
        """
        async with self._lock:
            logger.info(f"Cleaning up all {len(self._instances)} cached RAG instances")
            # Snapshot and empty the cache first so lookups stop returning
            # instances that are about to be finalized.
            removed = list(self._instances.items())
            self._instances.clear()
            self._access_order.clear()
            for key, instance in removed:
                try:
                    await instance.finalize_storages()
                except Exception as e:
                    logger.error(f"Error finalizing instance {key}: {e}")

    def get_instance_count(self) -> int:
        """Get the current number of cached instances."""
        return len(self._instances)

    def get_cached_keys(self) -> list[Tuple[str, str]]:
        """Get all currently cached tenant/KB combinations."""
        return list(self._instances.keys())

    def __repr__(self) -> str:
        """String representation of the manager state."""
        return (
            f"TenantRAGManager(instances={len(self._instances)}, "
            f"max_cached={self.max_cached_instances})"
        )