LightRAG/lightrag/tenant_rag_manager.py
2025-12-05 14:31:13 +08:00

329 lines
14 KiB
Python

"""
Tenant-aware LightRAG instance manager with caching and isolation.
This module manages per-tenant and per-knowledge-base LightRAG instances,
handling initialization, caching, cleanup, and proper isolation between tenants.
"""
from typing import Dict, Optional, Tuple
from lightrag import LightRAG
from lightrag.services.tenant_service import TenantService
from lightrag.utils import logger
from lightrag.security import validate_identifier, validate_working_directory
import asyncio
import os
class TenantRAGManager:
    """
    Manages LightRAG instances per tenant/KB combination with caching and isolation.

    Features:
    - Instances are created on first use and reused for later requests
    - Each tenant/KB pair gets the working directory and workspace returned by
      ``validate_working_directory``
    - At most ``max_cached_instances`` instances are kept; the least recently
      used one is finalized and dropped when room is needed
    - Creation, eviction and cleanup are serialized by one ``asyncio.Lock``
    - User access is verified on every lookup, including cache hits
    """

    def __init__(
        self,
        base_working_dir: str,
        tenant_service: TenantService,
        template_rag: Optional[LightRAG] = None,
        max_cached_instances: int = 100,
    ):
        """
        Initialize the TenantRAGManager.

        Args:
            base_working_dir: Base directory for all tenant/KB data storage
            tenant_service: Service used to look up tenants and verify user access
            template_rag: Template RAG instance to copy configuration from
            max_cached_instances: Maximum number of LightRAG instances to keep cached
        """
        self.base_working_dir = base_working_dir
        self.tenant_service = tenant_service
        self.template_rag = template_rag
        self.max_cached_instances = max_cached_instances
        # (tenant_id, kb_id) -> live LightRAG instance
        self._instances: Dict[Tuple[str, str], LightRAG] = {}
        self._lock = asyncio.Lock()
        # Cache keys ordered from least to most recently used
        self._access_order: list[Tuple[str, str]] = []
        logger.info(
            f"TenantRAGManager initialized with base_dir={base_working_dir}, "
            f"max_instances={max_cached_instances}, template_rag={template_rag is not None}"
        )

    async def get_rag_instance(
        self,
        tenant_id: str,
        kb_id: str,
        user_id: Optional[str] = None,
    ) -> LightRAG:
        """
        Get or create a LightRAG instance for a tenant/KB combination.

        The cache is checked once without the lock and once more after the
        lock is acquired, so concurrent requests for the same tenant/KB do not
        initialize the instance twice.

        SECURITY: user access is verified on every call, not only when the
        instance is first created. A cached instance is never returned to a
        caller that fails the access check.

        NOTE(review): the tenant's ``is_active`` flag is only read when an
        instance is created; callers are expected to use
        ``cleanup_tenant_instances`` when a tenant is deactivated or removed.

        Args:
            tenant_id: The tenant ID (passed through ``validate_identifier``)
            kb_id: The knowledge base ID (passed through ``validate_identifier``)
            user_id: User identifier checked with
                ``tenant_service.verify_user_access``

        Returns:
            LightRAG: The initialized instance for this tenant/KB

        Raises:
            ValueError: If the tenant does not exist or is inactive (creation path)
            PermissionError: If the user has no access to the tenant, or no
                user_id was given while REQUIRE_USER_AUTH is enabled
            Exception: Whatever ``validate_identifier`` raises for a malformed
                identifier (defined outside this file)
        """
        tenant_id = validate_identifier(tenant_id, "tenant_id")
        kb_id = validate_identifier(kb_id, "kb_id")
        cache_key = (tenant_id, kb_id)
        # Remember a successful check so one call never authorizes twice.
        authorized = False

        # First check (fast path - no lock)
        instance = self._instances.get(cache_key)
        if instance is not None:
            await self._authorize_user(tenant_id, user_id)
            authorized = True
            # The access check awaited, so the entry may have been evicted or
            # replaced meanwhile; only hand it out if it is still the cached one.
            if self._instances.get(cache_key) is instance:
                self._touch(cache_key)
                logger.debug(f"Cache hit for tenant={tenant_id}, kb={kb_id}")
                return instance

        async with self._lock:
            # Second check: another task may have created it while we waited.
            instance = self._instances.get(cache_key)
            if instance is not None:
                if not authorized:
                    await self._authorize_user(tenant_id, user_id)
                self._touch(cache_key)
                logger.debug(
                    f"Cache hit (after lock) for tenant={tenant_id}, kb={kb_id}"
                )
                return instance

            logger.info(f"Creating new RAG instance for tenant={tenant_id}, kb={kb_id}")

            # Tenant existence is checked before access so that a missing
            # tenant keeps surfacing as ValueError.
            tenant = await self.tenant_service.get_tenant(tenant_id)
            if not tenant or not tenant.is_active:
                raise ValueError(f"Tenant {tenant_id} not found or inactive")

            if not authorized:
                await self._authorize_user(tenant_id, user_id)

            # The directory/workspace pair comes from the security helper
            # rather than being assembled from raw identifiers here.
            tenant_working_dir, composite_workspace = validate_working_directory(
                self.base_working_dir, tenant_id, kb_id
            )
            os.makedirs(tenant_working_dir, exist_ok=True)

            try:
                instance = self._build_instance(
                    tenant, tenant_working_dir, composite_workspace
                )
                await instance.initialize_storages()

                # Make room only after the new instance initialized successfully.
                await self._evict_lru_if_full()

                self._instances[cache_key] = instance
                self._touch(cache_key)
                logger.info(
                    f"RAG instance created and cached for tenant={tenant_id}, kb={kb_id}"
                )
                return instance
            except Exception as e:
                logger.error(
                    f"Error creating RAG instance for tenant={tenant_id}, kb={kb_id}: {e}"
                )
                raise

    async def _authorize_user(self, tenant_id: str, user_id: Optional[str]) -> None:
        """Raise PermissionError unless the caller may use ``tenant_id``."""
        # SEC-003: a missing user_id is only tolerated when the API config does
        # not demand authentication. If the config module cannot be imported the
        # check falls back to "not required" (kept for backward compatibility).
        try:
            from lightrag.api.config import REQUIRE_USER_AUTH

            require_auth = REQUIRE_USER_AUTH
        except ImportError:
            require_auth = False

        if user_id:
            has_access = await self.tenant_service.verify_user_access(
                user_id, tenant_id
            )
            if not has_access:
                logger.warning(
                    f"Access denied: user={user_id} attempted to access tenant={tenant_id}"
                )
                raise PermissionError(f"Access denied to tenant {tenant_id}")
        elif require_auth:
            logger.error(
                f"Access denied: user_id required but not provided for tenant={tenant_id}"
            )
            raise PermissionError("User authentication required for tenant access")
        else:
            logger.warning(
                "No user_id provided for tenant access - allowing for backward compatibility"
            )

    def _touch(self, cache_key: Tuple[str, str]) -> None:
        """Mark ``cache_key`` as the most recently used entry."""
        if cache_key in self._access_order:
            self._access_order.remove(cache_key)
        self._access_order.append(cache_key)

    def _build_instance(
        self, tenant, tenant_working_dir: str, composite_workspace: str
    ) -> LightRAG:
        """Construct (but do not initialize) the LightRAG instance for a tenant."""
        if self.template_rag:
            template = self.template_rag
            overrides = tenant.config.custom_metadata
            # Copy configuration from the template; storage backends named in
            # the tenant's custom metadata take precedence over the template's.
            return LightRAG(
                working_dir=tenant_working_dir,
                workspace=composite_workspace,
                llm_model_func=template.llm_model_func,
                llm_model_name=template.llm_model_name,
                llm_model_max_async=template.llm_model_max_async,
                llm_model_kwargs=template.llm_model_kwargs,
                embedding_func=template.embedding_func,
                default_llm_timeout=template.default_llm_timeout,
                default_embedding_timeout=template.default_embedding_timeout,
                kv_storage=overrides.get("kv_storage") or template.kv_storage,
                vector_storage=overrides.get("vector_storage")
                or template.vector_storage,
                graph_storage=overrides.get("graph_storage")
                or template.graph_storage,
                doc_status_storage=template.doc_status_storage,
                vector_db_storage_cls_kwargs=template.vector_db_storage_cls_kwargs,
                enable_llm_cache=template.enable_llm_cache,
                enable_llm_cache_for_entity_extract=template.enable_llm_cache_for_entity_extract,
                rerank_model_func=template.rerank_model_func,
                chunk_token_size=template.chunk_token_size,
                chunk_overlap_token_size=template.chunk_overlap_token_size,
                max_parallel_insert=template.max_parallel_insert,
                max_graph_nodes=template.max_graph_nodes,
                addon_params=template.addon_params,
                ollama_server_infos=getattr(template, "ollama_server_infos", None),
                # Override with tenant-specific settings
                top_k=tenant.config.top_k,
                chunk_top_k=getattr(tenant.config, "chunk_top_k", 40),
                cosine_threshold=tenant.config.cosine_threshold,
            )

        # Fallback to basic configuration (will likely fail without embedding_func)
        return LightRAG(
            working_dir=tenant_working_dir,
            workspace=composite_workspace,
            kv_storage=tenant.config.custom_metadata.get(
                "kv_storage", "JsonKVStorage"
            ),
            vector_storage=tenant.config.custom_metadata.get(
                "vector_storage", "NanoVectorDBStorage"
            ),
            graph_storage=tenant.config.custom_metadata.get(
                "graph_storage", "NetworkXStorage"
            ),
            top_k=tenant.config.top_k,
            chunk_top_k=getattr(tenant.config, "chunk_top_k", 40),
            cosine_threshold=tenant.config.cosine_threshold,
        )

    async def _evict_lru_if_full(self) -> None:
        """Finalize and drop least recently used instances while the cache is full.

        Must be called with ``self._lock`` held.
        """
        while (
            len(self._instances) >= self.max_cached_instances and self._access_order
        ):
            oldest_key = self._access_order.pop(0)
            # Remove from the dict before awaiting so the bookkeeping stays
            # consistent even if finalization is interrupted.
            evicted = self._instances.pop(oldest_key, None)
            if evicted is None:
                # Stale ordering entry; try the next oldest key.
                continue
            logger.info(
                f"Evicting LRU instance: tenant={oldest_key[0]}, kb={oldest_key[1]}"
            )
            await self._finalize_quietly(evicted, "Error finalizing evicted instance")

    async def _finalize_quietly(self, instance: LightRAG, failure_message: str) -> None:
        """Finalize an instance's storages, logging instead of raising on failure."""
        try:
            await instance.finalize_storages()
        except Exception as e:
            logger.error(f"{failure_message}: {e}")

    async def cleanup_instance(self, tenant_id: str, kb_id: str) -> None:
        """
        Clean up and remove a cached instance.

        This method should be called when a knowledge base is deleted or
        a tenant is removed to ensure proper resource cleanup.

        NOTE(review): the key is matched verbatim, while ``get_rag_instance``
        caches under the values returned by ``validate_identifier`` - confirm
        that helper returns its input unchanged.

        Args:
            tenant_id: The tenant ID
            kb_id: The knowledge base ID
        """
        cache_key = (tenant_id, kb_id)
        async with self._lock:
            instance = self._instances.pop(cache_key, None)
            # Drop the ordering entry even when no instance was cached.
            if cache_key in self._access_order:
                self._access_order.remove(cache_key)
            if instance is None:
                return
            logger.info(
                f"Cleaning up RAG instance for tenant={tenant_id}, kb={kb_id}"
            )
            await self._finalize_quietly(
                instance, "Error finalizing instance during cleanup"
            )

    async def cleanup_tenant_instances(self, tenant_id: str) -> None:
        """
        Clean up all cached instances for a specific tenant.

        This method should be called when a tenant is deleted to ensure
        all its knowledge bases are properly cleaned up.

        Args:
            tenant_id: The tenant ID
        """
        async with self._lock:
            # Snapshot the keys first; the dict is mutated inside the loop.
            keys_to_remove = [k for k in self._instances if k[0] == tenant_id]
            for key in keys_to_remove:
                logger.info(
                    f"Cleaning up RAG instance for tenant={key[0]}, kb={key[1]}"
                )
                instance = self._instances.pop(key)
                if key in self._access_order:
                    self._access_order.remove(key)
                await self._finalize_quietly(
                    instance, "Error finalizing instance during tenant cleanup"
                )

    async def cleanup_all(self) -> None:
        """
        Clean up all cached instances.

        This should be called during application shutdown to ensure
        all resources are properly released.
        """
        async with self._lock:
            logger.info(f"Cleaning up all {len(self._instances)} cached RAG instances")
            for key, instance in list(self._instances.items()):
                await self._finalize_quietly(
                    instance, f"Error finalizing instance {key}"
                )
            self._instances.clear()
            self._access_order.clear()

    def get_instance_count(self) -> int:
        """Get the current number of cached instances."""
        return len(self._instances)

    def get_cached_keys(self) -> list[Tuple[str, str]]:
        """Get all currently cached tenant/KB combinations."""
        return list(self._instances.keys())

    def __repr__(self) -> str:
        """String representation of the manager state."""
        return (
            f"TenantRAGManager(instances={len(self._instances)}, "
            f"max_cached={self.max_cached_instances})"
        )