329 lines
14 KiB
Python
329 lines
14 KiB
Python
"""
|
|
Tenant-aware LightRAG instance manager with caching and isolation.
|
|
|
|
This module manages per-tenant and per-knowledge-base LightRAG instances,
|
|
handling initialization, caching, cleanup, and proper isolation between tenants.
|
|
"""
|
|
|
|
from typing import Dict, Optional, Tuple
|
|
from lightrag import LightRAG
|
|
from lightrag.services.tenant_service import TenantService
|
|
from lightrag.utils import logger
|
|
from lightrag.security import validate_identifier, validate_working_directory
|
|
import asyncio
|
|
import os
|
|
|
|
|
|
class TenantRAGManager:
    """
    Manages LightRAG instances per tenant/KB combination with caching and isolation.

    Features:
    - Instance caching keyed by ``(tenant_id, kb_id)`` to avoid repeated initialization
    - Per-tenant isolation through separate working directories
    - Configurable max cached instances (least-recently-used eviction)
    - Creation serialized behind an ``asyncio.Lock`` with a re-check after
      the lock is acquired
    - User access is verified on every lookup, including cache hits
    - Instances are removed from the cache *before* their storages are
      finalized, so a concurrent lookup never receives an instance that is
      being torn down
    """

    def __init__(
        self,
        base_working_dir: str,
        tenant_service: "TenantService",
        template_rag: Optional["LightRAG"] = None,
        max_cached_instances: int = 100,
    ):
        """
        Initialize the TenantRAGManager.

        Args:
            base_working_dir: Base directory for all tenant/KB data storage
            tenant_service: Service used to look up tenants and verify user access
            template_rag: Template RAG instance to copy configuration from
            max_cached_instances: Maximum number of LightRAG instances to keep cached
        """
        self.base_working_dir = base_working_dir
        self.tenant_service = tenant_service
        self.template_rag = template_rag
        self.max_cached_instances = max_cached_instances
        self._instances: Dict[Tuple[str, str], LightRAG] = {}
        self._lock = asyncio.Lock()
        # Cache keys ordered from least to most recently used.
        self._access_order: list[Tuple[str, str]] = []
        logger.info(
            f"TenantRAGManager initialized with base_dir={base_working_dir}, "
            f"max_instances={max_cached_instances}, template_rag={template_rag is not None}"
        )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _touch(self, cache_key: Tuple[str, str]) -> None:
        """Mark ``cache_key`` as the most recently used entry."""
        if cache_key in self._access_order:
            self._access_order.remove(cache_key)
        self._access_order.append(cache_key)

    @staticmethod
    def _user_auth_required() -> bool:
        """Return ``REQUIRE_USER_AUTH`` from the API config, or False if it cannot be imported."""
        try:
            from lightrag.api.config import REQUIRE_USER_AUTH
        except ImportError:
            return False
        return REQUIRE_USER_AUTH

    async def _authorize_user(
        self,
        tenant_id: str,
        user_id: Optional[str],
        quiet: bool = False,
    ) -> None:
        """
        Raise ``PermissionError`` unless the caller may use ``tenant_id``.

        Args:
            tenant_id: Already-validated tenant identifier
            user_id: Caller identity; a falsy value means "anonymous"
            quiet: When True, skip the backward-compatibility warning that is
                logged for permitted anonymous access (used on cache hits so
                the warning is not emitted on every request)

        Raises:
            PermissionError: If the tenant service denies the user, or if no
                user was supplied while user authentication is required
        """
        if user_id:
            has_access = await self.tenant_service.verify_user_access(
                user_id, tenant_id
            )
            if not has_access:
                logger.warning(
                    f"Access denied: user={user_id} attempted to access tenant={tenant_id}"
                )
                raise PermissionError(f"Access denied to tenant {tenant_id}")
            return

        if self._user_auth_required():
            logger.error(
                f"Access denied: user_id required but not provided for tenant={tenant_id}"
            )
            raise PermissionError("User authentication required for tenant access")

        if not quiet:
            logger.warning(
                "No user_id provided for tenant access - allowing for backward compatibility"
            )

    async def _finalize_quietly(self, instance: "LightRAG", error_prefix: str) -> None:
        """Finalize ``instance``'s storages; log (do not raise) any ``Exception``."""
        try:
            await instance.finalize_storages()
        except Exception as e:
            logger.error(f"{error_prefix}: {e}")

    async def _evict_lru_if_full(self) -> None:
        """Evict the least recently used instance when the cache is at capacity.

        Must be called with ``self._lock`` held.
        """
        if len(self._instances) < self.max_cached_instances:
            return
        if not self._access_order:
            return
        oldest_key = self._access_order.pop(0)
        # Drop the entry first so a concurrent lock-free lookup cannot pick up
        # an instance whose storages are in the middle of being finalized.
        evicted = self._instances.pop(oldest_key, None)
        if evicted is None:
            return
        logger.info(
            f"Evicting LRU instance: tenant={oldest_key[0]}, kb={oldest_key[1]}"
        )
        await self._finalize_quietly(evicted, "Error finalizing evicted instance")

    def _build_instance(
        self,
        tenant,
        tenant_working_dir: str,
        composite_workspace: str,
    ) -> "LightRAG":
        """
        Construct (but do not initialize) a LightRAG instance for a tenant.

        When a template RAG is configured its settings are copied, with the
        tenant's ``custom_metadata`` storage names taking precedence when set.
        Without a template, default storage class names are used.
        """
        tenant_config = tenant.config
        custom_metadata = tenant_config.custom_metadata
        template = self.template_rag

        if template:
            return LightRAG(
                working_dir=tenant_working_dir,
                workspace=composite_workspace,
                llm_model_func=template.llm_model_func,
                llm_model_name=template.llm_model_name,
                llm_model_max_async=template.llm_model_max_async,
                llm_model_kwargs=template.llm_model_kwargs,
                embedding_func=template.embedding_func,
                default_llm_timeout=template.default_llm_timeout,
                default_embedding_timeout=template.default_embedding_timeout,
                kv_storage=custom_metadata.get("kv_storage") or template.kv_storage,
                vector_storage=custom_metadata.get("vector_storage")
                or template.vector_storage,
                graph_storage=custom_metadata.get("graph_storage")
                or template.graph_storage,
                doc_status_storage=template.doc_status_storage,
                vector_db_storage_cls_kwargs=template.vector_db_storage_cls_kwargs,
                enable_llm_cache=template.enable_llm_cache,
                enable_llm_cache_for_entity_extract=template.enable_llm_cache_for_entity_extract,
                rerank_model_func=template.rerank_model_func,
                chunk_token_size=template.chunk_token_size,
                chunk_overlap_token_size=template.chunk_overlap_token_size,
                max_parallel_insert=template.max_parallel_insert,
                max_graph_nodes=template.max_graph_nodes,
                addon_params=template.addon_params,
                ollama_server_infos=getattr(template, "ollama_server_infos", None),
                # Override with tenant-specific settings
                top_k=tenant_config.top_k,
                chunk_top_k=getattr(tenant_config, "chunk_top_k", 40),
                cosine_threshold=tenant_config.cosine_threshold,
            )

        # Fallback to basic configuration (will likely fail without embedding_func)
        return LightRAG(
            working_dir=tenant_working_dir,
            workspace=composite_workspace,
            kv_storage=custom_metadata.get("kv_storage", "JsonKVStorage"),
            vector_storage=custom_metadata.get(
                "vector_storage", "NanoVectorDBStorage"
            ),
            graph_storage=custom_metadata.get("graph_storage", "NetworkXStorage"),
            top_k=tenant_config.top_k,
            chunk_top_k=getattr(tenant_config, "chunk_top_k", 40),
            cosine_threshold=tenant_config.cosine_threshold,
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_rag_instance(
        self,
        tenant_id: str,
        kb_id: str,
        user_id: Optional[str] = None,
    ) -> "LightRAG":
        """
        Get or create a LightRAG instance for a tenant/KB combination.

        Instances are cached and reused across requests for the same
        tenant/KB. Creation happens under ``self._lock`` and the cache is
        re-checked once the lock is held, so concurrent requests for the same
        key initialize it only once.

        SECURITY: user access is verified on every call - for cached
        instances as well as newly created ones - before an instance is
        returned.

        Args:
            tenant_id: The tenant ID (checked with ``validate_identifier``)
            kb_id: The knowledge base ID (checked with ``validate_identifier``)
            user_id: User identifier used for the tenant access check

        Returns:
            LightRAG: An initialized LightRAG instance for this tenant/KB

        Raises:
            ValueError: If the tenant does not exist or is inactive
            PermissionError: If the user does not have access to the tenant,
                or no user was given while user authentication is required
            Exception: Whatever ``validate_identifier`` raises for malformed
                identifiers (previously documented as HTTPException - confirm
                against ``lightrag.security``)
        """
        # SECURITY: Validate identifier format to prevent injection attacks
        tenant_id = validate_identifier(tenant_id, "tenant_id")
        kb_id = validate_identifier(kb_id, "kb_id")
        cache_key = (tenant_id, kb_id)

        # Fast path - no lock. Access is still verified before returning.
        if cache_key in self._instances:
            await self._authorize_user(tenant_id, user_id, quiet=True)
            # The await above may have yielded to a cleanup/eviction, so the
            # cache is read again instead of trusting the earlier check.
            instance = self._instances.get(cache_key)
            if instance is not None:
                self._touch(cache_key)
                logger.debug(f"Cache hit for tenant={tenant_id}, kb={kb_id}")
                return instance

        async with self._lock:
            # Second check: another request may have created it meanwhile.
            instance = self._instances.get(cache_key)
            if instance is not None:
                await self._authorize_user(tenant_id, user_id, quiet=True)
                self._touch(cache_key)
                logger.debug(
                    f"Cache hit (after lock) for tenant={tenant_id}, kb={kb_id}"
                )
                return instance

            logger.info(f"Creating new RAG instance for tenant={tenant_id}, kb={kb_id}")

            # Get tenant configuration
            tenant = await self.tenant_service.get_tenant(tenant_id)
            if not tenant or not tenant.is_active:
                raise ValueError(f"Tenant {tenant_id} not found or inactive")

            # SECURITY: Verify user has access to this tenant
            await self._authorize_user(tenant_id, user_id)

            # SECURITY: Create and validate tenant-specific working directory
            # This prevents path traversal attacks
            tenant_working_dir, composite_workspace = validate_working_directory(
                self.base_working_dir, tenant_id, kb_id
            )
            os.makedirs(tenant_working_dir, exist_ok=True)

            try:
                instance = self._build_instance(
                    tenant, tenant_working_dir, composite_workspace
                )

                # Initialize the instance's storages
                await instance.initialize_storages()

                # Make room before inserting the new entry
                await self._evict_lru_if_full()

                # Cache the instance
                self._instances[cache_key] = instance
                self._access_order.append(cache_key)
                logger.info(
                    f"RAG instance created and cached for tenant={tenant_id}, kb={kb_id}"
                )
                return instance

            except Exception as e:
                logger.error(
                    f"Error creating RAG instance for tenant={tenant_id}, kb={kb_id}: {e}"
                )
                raise

    async def cleanup_instance(self, tenant_id: str, kb_id: str) -> None:
        """
        Clean up and remove a cached instance.

        This method should be called when a knowledge base is deleted or
        a tenant is removed to ensure proper resource cleanup. It does
        nothing when the tenant/KB pair is not cached. Errors raised while
        finalizing storages are logged, not propagated.

        Args:
            tenant_id: The tenant ID
            kb_id: The knowledge base ID
        """
        cache_key = (tenant_id, kb_id)
        async with self._lock:
            # Remove from the cache before finalizing so lock-free lookups
            # cannot return an instance that is being shut down.
            instance = self._instances.pop(cache_key, None)
            if instance is None:
                return
            if cache_key in self._access_order:
                self._access_order.remove(cache_key)
            logger.info(
                f"Cleaning up RAG instance for tenant={tenant_id}, kb={kb_id}"
            )
            await self._finalize_quietly(
                instance, "Error finalizing instance during cleanup"
            )

    async def cleanup_tenant_instances(self, tenant_id: str) -> None:
        """
        Clean up all cached instances for a specific tenant.

        This method should be called when a tenant is deleted to ensure
        all its knowledge bases are properly cleaned up. Errors raised while
        finalizing storages are logged, not propagated.

        Args:
            tenant_id: The tenant ID
        """
        async with self._lock:
            # Phase 1: detach every instance of the tenant from the cache.
            removed = []
            for key in [k for k in self._instances if k[0] == tenant_id]:
                removed.append((key, self._instances.pop(key)))
                if key in self._access_order:
                    self._access_order.remove(key)

            # Phase 2: finalize the detached instances.
            for key, instance in removed:
                logger.info(
                    f"Cleaning up RAG instance for tenant={key[0]}, kb={key[1]}"
                )
                await self._finalize_quietly(
                    instance, "Error finalizing instance during tenant cleanup"
                )

    async def cleanup_all(self) -> None:
        """
        Clean up all cached instances.

        This should be called during application shutdown to ensure
        all resources are properly released. Errors raised while finalizing
        storages are logged, not propagated.
        """
        async with self._lock:
            logger.info(f"Cleaning up all {len(self._instances)} cached RAG instances")
            # Empty the cache first, then finalize what was in it.
            drained = list(self._instances.items())
            self._instances.clear()
            self._access_order.clear()
            for key, instance in drained:
                await self._finalize_quietly(
                    instance, f"Error finalizing instance {key}"
                )

    def get_instance_count(self) -> int:
        """Get the current number of cached instances."""
        return len(self._instances)

    def get_cached_keys(self) -> list[Tuple[str, str]]:
        """Get all currently cached tenant/KB combinations."""
        return list(self._instances.keys())

    def __repr__(self) -> str:
        """String representation of the manager state."""
        return (
            f"TenantRAGManager(instances={len(self._instances)}, "
            f"max_cached={self.max_cached_instances})"
        )
|