#!/usr/bin/env python """ Test script for Workspace Isolation Feature Comprehensive test suite covering workspace isolation in LightRAG: 1. Pipeline Status Isolation - Data isolation between workspaces 2. Lock Mechanism - Parallel execution for different workspaces, serial for same workspace 3. Backward Compatibility - Legacy code without workspace parameters 4. Multi-Workspace Concurrency - Concurrent operations on different workspaces 5. NamespaceLock Re-entrance Protection - Prevents deadlocks 6. Different Namespace Lock Isolation - Locks isolated by namespace 7. Error Handling - Invalid workspace configurations 8. Update Flags Workspace Isolation - Update flags properly isolated 9. Empty Workspace Standardization - Empty workspace handling 10. (removed) Legacy JsonKVStorage workspace isolation 11. LightRAG End-to-End Workspace Isolation - Complete E2E test with two instances Total: 11 test scenarios """ import asyncio import os import shutil import time from pathlib import Path import numpy as np import pytest from lightrag.kg.shared_storage import ( clear_all_update_flags, finalize_share_data, get_all_update_flags_status, get_default_workspace, get_final_namespace, get_namespace_data, get_namespace_lock, get_update_flag, initialize_pipeline_status, initialize_share_data, set_all_update_flags, set_default_workspace, ) # ============================================================================= # Test Configuration # ============================================================================= # Test configuration is handled via pytest fixtures in conftest.py # - Use CLI options: --keep-artifacts, --stress-test, --test-workers=N # - Or environment variables: LIGHTRAG_KEEP_ARTIFACTS, LIGHTRAG_STRESS_TEST, LIGHTRAG_TEST_WORKERS # Priority: CLI options > Environment variables > Default values # ============================================================================= # Pytest Fixtures # 
# =============================================================================


@pytest.fixture(autouse=True)
def setup_shared_data():
    """Initialize shared data before each test and finalize it afterwards."""
    initialize_share_data()
    yield
    finalize_share_data()


async def _measure_lock_parallelism(
    workload: list[tuple[str, str, str]], hold_time: float = 0.05
) -> tuple[int, list[tuple[str, str]], dict[str, float]]:
    """Run a lock acquisition workload and capture peak concurrency and timeline.

    Every workload entry becomes one coroutine that acquires the lock returned
    by ``get_namespace_lock(namespace, workspace)``, holds it for ``hold_time``
    seconds and releases it. All coroutines are started together with
    ``asyncio.gather``.

    Args:
        workload: List of (name, workspace, namespace) tuples
        hold_time: How long each worker holds the lock (seconds)

    Returns:
        Tuple of (max_parallel, timeline, metrics) where:
        - max_parallel: Peak number of concurrent lock holders
        - timeline: List of (name, event) tuples tracking execution order
        - metrics: Dict with ``total_duration``, ``max_concurrency``,
          ``avg_hold_time`` and ``num_workers``
    """
    running = 0
    max_parallel = 0
    timeline: list[tuple[str, str]] = []
    # perf_counter is monotonic, so the duration cannot be skewed by clock adjustments
    start_time = time.perf_counter()

    async def worker(name: str, workspace: str, namespace: str) -> None:
        nonlocal running, max_parallel
        lock = get_namespace_lock(namespace, workspace)
        async with lock:
            running += 1
            try:
                max_parallel = max(max_parallel, running)
                timeline.append((name, 'start'))
                await asyncio.sleep(hold_time)
                timeline.append((name, 'end'))
            finally:
                # Keep the holder count correct even if the worker is cancelled or fails
                running -= 1

    await asyncio.gather(*(worker(*args) for args in workload))

    metrics = {
        'total_duration': time.perf_counter() - start_time,
        'max_concurrency': max_parallel,
        'avg_hold_time': hold_time,
        'num_workers': len(workload),
    }
    return max_parallel, timeline, metrics


def _assert_no_timeline_overlap(timeline: list[tuple[str, str]]) -> None:
    """Ensure that timeline events never overlap for sequential execution.

    This function implements a finite state machine that validates:
    - No overlapping lock acquisitions (only one task active at a time)
    - Proper lock release order (task releases its own lock)
    - All locks are properly released
    - Every event is either "start" or "end"

    Args:
        timeline: List of (name, event) tuples where event is "start" or "end"

    Raises:
        AssertionError: If timeline shows overlapping execution, improper
            locking, or an unknown event name
    """
    active_task = None
    for name, event in timeline:
        if event == 'start':
            if active_task is not None:
                raise AssertionError(f"Task '{name}' started before '{active_task}' released the lock")
            active_task = name
        elif event == 'end':
            if active_task != name:
                raise AssertionError(f"Task '{name}' finished while '{active_task}' was expected to hold the lock")
            active_task = None
        else:
            # A malformed event must not be silently interpreted as a release
            raise AssertionError(f"Task '{name}' recorded unknown timeline event '{event}'")

    if active_task is not None:
        raise AssertionError(f"Task '{active_task}' did not release the lock properly")


# =============================================================================
# Test 1: Pipeline Status Isolation Test
# =============================================================================


@pytest.mark.offline
@pytest.mark.asyncio
async def test_pipeline_status_isolation():
    """
    Test that pipeline status is isolated between different workspaces.
    """
    # Purpose: Ensure pipeline_status shared data remains unique per workspace.
    # Scope: initialize_pipeline_status and get_namespace_data interactions.
    print('\n' + '=' * 60)
    print('TEST 1: Pipeline Status Isolation')
    print('=' * 60)

    # Initialize shared storage
    initialize_share_data()

    # Initialize pipeline status for two different workspaces
    workspace1 = 'test_workspace_1'
    workspace2 = 'test_workspace_2'

    await initialize_pipeline_status(workspace1)
    await initialize_pipeline_status(workspace2)

    # Get pipeline status data for both workspaces
    data1 = await get_namespace_data('pipeline_status', workspace=workspace1)
    data2 = await get_namespace_data('pipeline_status', workspace=workspace2)

    # Verify they are independent objects
    assert data1 is not data2, 'Pipeline status data objects are the same (should be different)'

    # Modify workspace1's data and verify workspace2 is not affected
    data1['test_key'] = 'workspace1_value'

    # Re-fetch to ensure we get the latest data
    data1_check = await get_namespace_data('pipeline_status', workspace=workspace1)
    data2_check = await get_namespace_data('pipeline_status', workspace=workspace2)

    assert 'test_key' in data1_check, 'test_key not found in workspace1'
    assert data1_check['test_key'] == 'workspace1_value', (
        f'workspace1 test_key value incorrect: {data1_check.get("test_key")}'
    )
    assert 'test_key' not in data2_check, f'test_key leaked to workspace2: {data2_check.get("test_key")}'

    print('✅ PASSED: Pipeline Status Isolation')
    print(' Different workspaces have isolated pipeline status')


# =============================================================================
# Test 2: Lock Mechanism Test (No Deadlocks)
# =============================================================================


@pytest.mark.offline
@pytest.mark.asyncio
async def test_lock_mechanism(stress_test_mode, parallel_workers):
    """
    Test that the new keyed lock mechanism works correctly without deadlocks.
    Tests both parallel execution for different workspaces and serialization
    for the same workspace.
    """
    # Purpose: Validate that keyed locks isolate workspaces while serializing
    # requests within the same workspace. Scope: get_namespace_lock scheduling
    # semantics for both cross-workspace and single-workspace cases.
    print('\n' + '=' * 60)
    print('TEST 2: Lock Mechanism (No Deadlocks)')
    print('=' * 60)

    # Test 2.1: Different workspaces should run in parallel
    print('\nTest 2.1: Different workspaces locks should be parallel')

    # Support stress testing with configurable number of workers.
    # At least two workers are required, otherwise the ">= 2" overlap
    # assertion below could never be satisfied.
    num_workers = max(2, parallel_workers) if stress_test_mode else 3
    # Index-based names stay readable and unique for any worker count
    parallel_workload = [(f'ws_{i}', f'ws_{i}', 'test_namespace') for i in range(num_workers)]

    max_parallel, timeline_parallel, metrics = await _measure_lock_parallelism(parallel_workload)
    assert max_parallel >= 2, (
        'Locks for distinct workspaces should overlap; '
        f'observed max concurrency: {max_parallel}, timeline={timeline_parallel}'
    )

    print('✅ PASSED: Lock Mechanism - Parallel (Different Workspaces)')
    print(f' Locks overlapped for different workspaces (max concurrency={max_parallel})')
    print(f' Performance: {metrics["total_duration"]:.3f}s for {metrics["num_workers"]} workers')

    # Test 2.2: Same workspace should serialize
    print('\nTest 2.2: Same workspace locks should serialize')
    serial_workload = [
        ('serial_run_1', 'ws_same', 'test_namespace'),
        ('serial_run_2', 'ws_same', 'test_namespace'),
    ]
    (
        max_parallel_serial,
        timeline_serial,
        metrics_serial,
    ) = await _measure_lock_parallelism(serial_workload)
    assert max_parallel_serial == 1, (
        f'Same workspace locks should not overlap; observed {max_parallel_serial} with timeline {timeline_serial}'
    )
    _assert_no_timeline_overlap(timeline_serial)

    print('✅ PASSED: Lock Mechanism - Serial (Same Workspace)')
    print(' Same workspace operations executed sequentially with no overlap')
    print(f' Performance: {metrics_serial["total_duration"]:.3f}s for {metrics_serial["num_workers"]} tasks')


# =============================================================================
# Test 3: Backward Compatibility Test
# =============================================================================
@pytest.mark.offline
@pytest.mark.asyncio
async def test_backward_compatibility():
    """
    Test that legacy code without workspace parameter still works correctly.
    """
    # Purpose: Validate backward-compatible defaults when workspace arguments
    # are omitted. Scope: get_final_namespace, set/get_default_workspace and
    # initialize_pipeline_status fallback behavior.
    print('\n' + '=' * 60)
    print('TEST 3: Backward Compatibility')
    print('=' * 60)

    # Test 3.1: get_final_namespace with None should use default workspace
    print('\nTest 3.1: get_final_namespace with workspace=None')

    set_default_workspace('my_default_workspace')
    final_ns = get_final_namespace('pipeline_status')
    expected = 'my_default_workspace:pipeline_status'

    assert final_ns == expected, f'Expected {expected}, got {final_ns}'

    print('✅ PASSED: Backward Compatibility - get_final_namespace')
    print(f' Correctly uses default workspace: {final_ns}')

    # Test 3.2: get_default_workspace
    print('\nTest 3.2: get/set default workspace')

    set_default_workspace('test_default')
    retrieved = get_default_workspace()

    assert retrieved == 'test_default', f"Expected 'test_default', got {retrieved}"

    print('✅ PASSED: Backward Compatibility - default workspace')
    print(f' Default workspace set/get correctly: {retrieved}')

    # Test 3.3: Empty workspace handling
    print('\nTest 3.3: Empty workspace handling')

    set_default_workspace('')
    final_ns_empty = get_final_namespace('pipeline_status', workspace=None)
    expected_empty = 'pipeline_status'  # Should be just the namespace without ':'

    assert final_ns_empty == expected_empty, f"Expected '{expected_empty}', got '{final_ns_empty}'"

    print('✅ PASSED: Backward Compatibility - empty workspace')
    print(f" Empty workspace handled correctly: '{final_ns_empty}'")

    # Test 3.4: None workspace with default set
    print('\nTest 3.4: initialize_pipeline_status with workspace=None')
    set_default_workspace('compat_test_workspace')
    initialize_share_data()
    await initialize_pipeline_status(workspace=None)  # Should use default

    # Try to get data using the default workspace explicitly
    data = await get_namespace_data('pipeline_status', workspace='compat_test_workspace')

    assert data is not None, 'Failed to initialize pipeline status with default workspace'

    print('✅ PASSED: Backward Compatibility - pipeline init with None')
    print(' Pipeline status initialized with default workspace')


# =============================================================================
# Test 4: Multi-Workspace Concurrency Test
# =============================================================================


@pytest.mark.offline
@pytest.mark.asyncio
async def test_multi_workspace_concurrency():
    """
    Test that multiple workspaces can operate concurrently without interference.
    Simulates concurrent operations on different workspaces.
    """
    # Purpose: Simulate concurrent workloads touching pipeline_status across
    # workspaces. Scope: initialize_pipeline_status, get_namespace_lock, and
    # shared dictionary mutation while ensuring isolation.
    print('\n' + '=' * 60)
    print('TEST 4: Multi-Workspace Concurrency')
    print('=' * 60)

    initialize_share_data()

    async def workspace_operations(workspace_id):
        """Simulate operations on a specific workspace"""
        print(f'\n [{workspace_id}] Starting operations')

        # Initialize pipeline status
        await initialize_pipeline_status(workspace_id)

        # Get lock and perform operations
        lock = get_namespace_lock('test_operations', workspace_id)
        async with lock:
            # Get workspace data
            data = await get_namespace_data('pipeline_status', workspace=workspace_id)

            # Modify data (the timestamp is a wall-clock value on purpose)
            data[f'{workspace_id}_key'] = f'{workspace_id}_value'
            data['timestamp'] = time.time()

            # Simulate some work
            await asyncio.sleep(0.1)

        print(f' [{workspace_id}] Completed operations')
        return workspace_id

    # Run multiple workspaces concurrently
    workspaces = ['concurrent_ws_1', 'concurrent_ws_2', 'concurrent_ws_3']

    # perf_counter is monotonic, which is what an elapsed-time measurement needs
    start = time.perf_counter()
    results_list = await asyncio.gather(*[workspace_operations(ws) for ws in workspaces])
    elapsed = time.perf_counter() - start

    print(f'\n All workspaces completed in {elapsed:.2f}s')

    # Verify all workspaces completed
    assert set(results_list) == set(workspaces), 'Not all workspaces completed'

    print('✅ PASSED: Multi-Workspace Concurrency - Execution')
    print(f' All {len(workspaces)} workspaces completed successfully in {elapsed:.2f}s')

    # Verify data isolation - each workspace should have its own data
    print('\n Verifying data isolation...')

    for ws in workspaces:
        data = await get_namespace_data('pipeline_status', workspace=ws)
        expected_key = f'{ws}_key'
        expected_value = f'{ws}_value'

        assert expected_key in data, f'Data not properly isolated for {ws}: missing {expected_key}'
        assert data[expected_key] == expected_value, (
            f'Data not properly isolated for {ws}: {expected_key}={data[expected_key]} (expected {expected_value})'
        )
        print(f' [{ws}] Data correctly isolated: {expected_key}={data[expected_key]}')

    print('✅ PASSED: Multi-Workspace Concurrency - Data Isolation')
    print(' All workspaces have properly isolated data')


# =============================================================================
# Test 5: NamespaceLock Re-entrance Protection
# =============================================================================


@pytest.mark.offline
@pytest.mark.asyncio
async def test_namespace_lock_reentrance():
    """
    Test that NamespaceLock prevents re-entrance in the same coroutine
    and allows concurrent use in different coroutines.
    """
    # Purpose: Ensure NamespaceLock enforces single entry per coroutine while
    # allowing concurrent reuse through ContextVar isolation. Scope: lock
    # re-entrance checks and concurrent gather semantics.
    print('\n' + '=' * 60)
    print('TEST 5: NamespaceLock Re-entrance Protection')
    print('=' * 60)

    # Test 5.1: Same coroutine re-entrance should fail
    print('\nTest 5.1: Same coroutine re-entrance should raise RuntimeError')

    lock = get_namespace_lock('test_reentrance', 'test_ws')

    # pytest.raises fails the test when no RuntimeError is raised, or when the
    # message does not contain "already acquired" (case-insensitive).
    with pytest.raises(RuntimeError, match=r'(?i)already acquired') as exc_info:
        async with lock:
            print(' Acquired lock first time')
            # Try to acquire the same lock again in the same coroutine
            async with lock:
                print(' ERROR: Should not reach here - re-entrance succeeded!')

    print(f' ✓ Re-entrance correctly blocked: {exc_info.value}')

    print('✅ PASSED: NamespaceLock Re-entrance Protection')
    print(' Re-entrance correctly raises RuntimeError')

    # Test 5.2: Same NamespaceLock instance in different coroutines should succeed
    print('\nTest 5.2: Same NamespaceLock instance in different coroutines')

    shared_lock = get_namespace_lock('test_concurrent', 'test_ws')
    concurrent_results = []

    async def use_shared_lock(coroutine_id):
        """Use the same NamespaceLock instance"""
        async with shared_lock:
            concurrent_results.append(f'coroutine_{coroutine_id}_start')
            await asyncio.sleep(0.1)
            concurrent_results.append(f'coroutine_{coroutine_id}_end')

    # This should work because each coroutine gets its own ContextVar
    await asyncio.gather(
        use_shared_lock(1),
        use_shared_lock(2),
    )

    # Both coroutines should have completed
    expected_entries = 4  # 2 starts + 2 ends
    assert len(concurrent_results) == expected_entries, (
        f'Expected {expected_entries} entries, got {len(concurrent_results)}'
    )

    print('✅ PASSED: NamespaceLock Concurrent Reuse')
    print(f' Same NamespaceLock instance used successfully in {expected_entries // 2} concurrent coroutines')


# =============================================================================
# Test 6: Different Namespace Lock Isolation
# =============================================================================


@pytest.mark.offline
@pytest.mark.asyncio
async def test_different_namespace_lock_isolation():
    """
    Locks that share a workspace but differ in namespace must not block each other.
    """
    # Purpose: Confirm that namespace isolation is enforced even when workspace
    # is the same. Scope: get_namespace_lock behavior when namespaces differ.
    banner = '=' * 60
    print(f'\n{banner}')
    print('TEST 6: Different Namespace Lock Isolation')
    print(banner)

    print('\nTesting locks with same workspace but different namespaces')

    # One worker per namespace, all inside the same workspace
    namespace_jobs = [(f'ns_{suffix}', 'same_ws', f'namespace_{suffix}') for suffix in 'abc']

    peak, events, stats = await _measure_lock_parallelism(namespace_jobs)

    assert peak >= 2, (
        'Different namespaces within the same workspace should run concurrently; '
        f'observed max concurrency {peak} with timeline {events}'
    )

    print('✅ PASSED: Different Namespace Lock Isolation')
    print(f' Different namespace locks ran in parallel (max concurrency={peak})')
    print(f' Performance: {stats["total_duration"]:.3f}s for {stats["num_workers"]} namespaces')


# =============================================================================
# Test 7: Error Handling
# =============================================================================


@pytest.mark.offline
@pytest.mark.asyncio
async def test_error_handling():
    """
    Exercise workspace normalization and namespace derivation edge cases.
    """
    # Purpose: Validate guardrails for workspace normalization and namespace
    # derivation. Scope: set_default_workspace conversions and get_final_namespace
    # failure paths when configuration is invalid.
    banner = '=' * 60
    print(f'\n{banner}')
    print('TEST 7: Error Handling')
    print(banner)

    # Test 7.0: no default workspace configured and none passed -> ValueError
    print('\nTest 7.0: Missing workspace raises ValueError')
    with pytest.raises(ValueError):
        get_final_namespace('test_namespace', workspace=None)

    # Test 7.1: a None default workspace is read back as the empty string
    print('\nTest 7.1: set_default_workspace(None) converts to empty string')

    set_default_workspace(None)
    current_default = get_default_workspace()

    assert current_default == '', f"Expected empty string, got: '{current_default}'"

    print('✅ PASSED: Error Handling - None to Empty String')
    print(f" set_default_workspace(None) correctly converts to empty string: '{current_default}'")

    # Test 7.2: an explicit empty workspace yields the bare namespace (no colon)
    print('\nTest 7.2: Empty string workspace creates valid namespace')

    resolved_ns = get_final_namespace('test_namespace', workspace='')
    assert resolved_ns == 'test_namespace', f"Unexpected namespace: '{resolved_ns}'"

    print('✅ PASSED: Error Handling - Empty Workspace Namespace')
    print(f" Empty workspace creates valid namespace: '{resolved_ns}'")

    # Leave the default workspace as an empty string for other tests
    set_default_workspace('')


# =============================================================================
# Test 8: Update Flags Workspace Isolation
# =============================================================================


@pytest.mark.offline
@pytest.mark.asyncio
async def test_update_flags_workspace_isolation():
    """
    Setting, clearing and querying update flags must only touch the addressed workspace.
    """
    # Purpose: Confirm update flag setters/readers respect workspace scoping.
    # Scope: set_all_update_flags, clear_all_update_flags, get_all_update_flags_status,
    # and get_update_flag interactions across namespaces.
    banner = '=' * 60
    print(f'\n{banner}')
    print('TEST 8: Update Flags Workspace Isolation')
    print(banner)

    initialize_share_data()

    workspace1 = 'update_flags_ws1'
    workspace2 = 'update_flags_ws2'
    test_namespace = 'test_update_flags_ns'

    # Both workspaces need their pipeline status namespace initialized first
    for ws in (workspace1, workspace2):
        await initialize_pipeline_status(ws)

    # ------------------------------------------------------------------
    # Test 8.1: set_all_update_flags isolation
    # ------------------------------------------------------------------
    print('\nTest 8.1: set_all_update_flags workspace isolation')

    # One flag per workspace (simulating workers)
    ws1_flag = await get_update_flag(test_namespace, workspace=workspace1)
    ws2_flag = await get_update_flag(test_namespace, workspace=workspace2)

    # Freshly created flags start out False
    assert ws1_flag.value is False, 'Flag1 initial value should be False'
    assert ws2_flag.value is False, 'Flag2 initial value should be False'

    # Raise the flags of workspace1 only
    await set_all_update_flags(test_namespace, workspace=workspace1)

    assert ws1_flag.value is True, f'Flag1 should be True after set_all_update_flags, got {ws1_flag.value}'
    assert ws2_flag.value is False, f'Flag2 should still be False, got {ws2_flag.value}'

    print('✅ PASSED: Update Flags - set_all_update_flags Isolation')
    print(f' set_all_update_flags isolated: ws1={ws1_flag.value}, ws2={ws2_flag.value}')

    # ------------------------------------------------------------------
    # Test 8.2: clear_all_update_flags isolation
    # ------------------------------------------------------------------
    print('\nTest 8.2: clear_all_update_flags workspace isolation')

    # Raise the flags in both workspaces
    await set_all_update_flags(test_namespace, workspace=workspace1)
    await set_all_update_flags(test_namespace, workspace=workspace2)

    assert ws1_flag.value is True, 'Flag1 should be True'
    assert ws2_flag.value is True, 'Flag2 should be True'

    # Clear workspace1 only; workspace2 must keep its flag
    await clear_all_update_flags(test_namespace, workspace=workspace1)

    assert ws1_flag.value is False, f'Flag1 should be False after clear, got {ws1_flag.value}'
    assert ws2_flag.value is True, f'Flag2 should still be True, got {ws2_flag.value}'

    print('✅ PASSED: Update Flags - clear_all_update_flags Isolation')
    print(f' clear_all_update_flags isolated: ws1={ws1_flag.value}, ws2={ws2_flag.value}')

    # ------------------------------------------------------------------
    # Test 8.3: get_all_update_flags_status workspace filtering
    # ------------------------------------------------------------------
    print('\nTest 8.3: get_all_update_flags_status workspace filtering')

    # Register additional namespaces: two in workspace1, one in workspace2
    await get_update_flag('ns_a', workspace=workspace1)
    await get_update_flag('ns_b', workspace=workspace1)
    await get_update_flag('ns_c', workspace=workspace2)

    # Raise the flags of the workspace1 namespaces
    await set_all_update_flags('ns_a', workspace=workspace1)
    await set_all_update_flags('ns_b', workspace=workspace1)

    # Raise the flags of the workspace2 namespace
    await set_all_update_flags('ns_c', workspace=workspace2)

    # Query workspace1 only
    ws1_status = await get_all_update_flags_status(workspace=workspace1)

    # Keys mentioning workspace1 must be present, keys mentioning workspace2 must not
    workspace1_keys = [name for name in ws1_status if workspace1 in name]
    workspace2_keys = [name for name in ws1_status if workspace2 in name]

    assert len(workspace1_keys) > 0, f'workspace1 keys should be present, got {len(workspace1_keys)}'
    assert len(workspace2_keys) == 0, f'workspace2 keys should not be present, got {len(workspace2_keys)}'
    for name, flags in ws1_status.items():
        assert all(flags), f'All flags in {name} should be True, got {flags}'

    # Querying workspace2 should only surface workspace2 namespaces
    ws2_status = await get_all_update_flags_status(workspace=workspace2)
    expected_ws2_keys = {f'{workspace2}:{ns}' for ns in (test_namespace, 'ns_c')}
    assert set(ws2_status.keys()) == expected_ws2_keys, f'Unexpected namespaces for workspace2: {ws2_status.keys()}'
    for name, flags in ws2_status.items():
        assert all(flags), f'All flags in {name} should be True, got {flags}'

    print('✅ PASSED: Update Flags - get_all_update_flags_status Filtering')
    print(f' Status correctly filtered: ws1 keys={len(workspace1_keys)}, ws2 keys={len(workspace2_keys)}')
# =============================================================================
# Test 9: Empty Workspace Standardization
# =============================================================================


@pytest.mark.offline
@pytest.mark.asyncio
async def test_empty_workspace_standardization():
    """
    Check that an empty workspace is standardized to "" instead of "_".
    """
    # Purpose: Verify namespace formatting when workspace is an empty string.
    # Scope: get_final_namespace output and initialize_pipeline_status behavior
    # between empty and non-empty workspaces.
    banner = '=' * 60
    print(f'\n{banner}')
    print('TEST 9: Empty Workspace Standardization')
    print(banner)

    # Test 9.1: with an empty default workspace the namespace has no colon prefix
    print('\nTest 9.1: Empty string workspace namespace format')

    set_default_workspace('')
    resolved_ns = get_final_namespace('test_namespace', workspace=None)

    assert resolved_ns == 'test_namespace', (
        f"Unexpected namespace format: '{resolved_ns}' (expected 'test_namespace')"
    )

    print('✅ PASSED: Empty Workspace Standardization - Format')
    print(f" Empty workspace creates correct namespace: '{resolved_ns}'")

    # Test 9.2: empty and non-empty workspaces keep separate pipeline status data
    print('\nTest 9.2: Empty vs non-empty workspace behavior')

    initialize_share_data()

    # Empty workspace first
    await initialize_pipeline_status(workspace='')
    empty_ws_data = await get_namespace_data('pipeline_status', workspace='')

    # Then a named workspace
    await initialize_pipeline_status(workspace='test_ws')
    named_ws_data = await get_namespace_data('pipeline_status', workspace='test_ws')

    # The two lookups must not hand back the same object
    assert empty_ws_data is not named_ws_data, 'Empty and non-empty workspaces share data (should be independent)'

    print('✅ PASSED: Empty Workspace Standardization - Behavior')
    print(' Empty and non-empty workspaces have independent data')


# =============================================================================
# =============================================================================
# Test 10 removed (legacy JsonKVStorage)
# =============================================================================

# =============================================================================
# Test 11: LightRAG End-to-End Integration Test
# =============================================================================


def _print_workspace_files(label: str, directory: Path) -> None:
    """Print the name and size of every regular file directly inside *directory*."""
    print(f'\n Files in {label}/:')
    for entry in sorted(directory.glob('*')):
        if entry.is_file():
            print(f' - {entry.name} ({entry.stat().st_size} bytes)')


@pytest.mark.offline
@pytest.mark.asyncio
async def test_lightrag_end_to_end_workspace_isolation(keep_test_artifacts):
    """
    End-to-end test: Create two LightRAG instances with different workspaces,
    insert different data, and verify file separation.
    Uses mock LLM and embedding functions to avoid external API calls.

    Args:
        keep_test_artifacts: Fixture value; when truthy the temporary test
            directory is left on disk instead of being removed.
    """
    # Purpose: Validate that full LightRAG flows keep artifacts scoped per workspace.
    # Scope: LightRAG.initialize_storages + ainsert side effects plus filesystem
    # verification for generated storage files.
    import json

    print('\n' + '=' * 60)
    print('TEST 11: LightRAG End-to-End Workspace Isolation')
    print('=' * 60)

    embedding_dim = 384  # Dimension of the mock embedding vectors

    # Create temporary test directory under project temp/
    test_dir = Path(__file__).parent.parent / 'temp/test_lightrag_end_to_end_workspace_isolation'
    if test_dir.exists():
        shutil.rmtree(test_dir)
    test_dir.mkdir(parents=True, exist_ok=True)
    print(f'\n Using test directory: {test_dir}')

    try:
        # Factory function to create different mock LLM functions for each workspace
        def create_mock_llm_func(workspace_name):
            """Create a mock LLM function that returns different content based on workspace"""
            # Format: entity<|#|>entity_name<|#|>entity_type<|#|>entity_description
            # Format: relation<|#|>source_entity<|#|>target_entity<|#|>keywords<|#|>description
            if workspace_name == 'project_a':
                records = (
                    'entity<|#|>Artificial Intelligence<|#|>concept<|#|>'
                    'AI is a field of computer science focused on creating intelligent machines.',
                    'entity<|#|>Machine Learning<|#|>concept<|#|>'
                    'Machine Learning is a subset of AI that enables systems to learn from data.',
                    'relation<|#|>Machine Learning<|#|>Artificial Intelligence<|#|>subset, related field<|#|>'
                    'Machine Learning is a key component and subset of Artificial Intelligence.',
                    '<|COMPLETE|>',
                )
            else:  # project_b
                records = (
                    'entity<|#|>Deep Learning<|#|>concept<|#|>'
                    'Deep Learning is a subset of machine learning using neural networks with multiple layers.',
                    'entity<|#|>Neural Networks<|#|>concept<|#|>'
                    'Neural Networks are computing systems inspired by biological neural networks.',
                    'relation<|#|>Deep Learning<|#|>Neural Networks<|#|>uses, composed of<|#|>'
                    'Deep Learning uses multiple layers of Neural Networks to learn representations.',
                    '<|COMPLETE|>',
                )
            # One record per line
            response = '\n'.join(records)

            async def mock_llm_func(prompt, system_prompt=None, history_messages=None, **kwargs) -> str:
                # Add coroutine switching to simulate async I/O and allow concurrent execution
                await asyncio.sleep(0)
                return response

            return mock_llm_func

        # Mock embedding function
        async def mock_embedding_func(texts: list[str]) -> np.ndarray:
            # Add coroutine switching to simulate async I/O and allow concurrent execution
            await asyncio.sleep(0)
            return np.random.rand(len(texts), embedding_dim)

        # Test 11.1: Create two LightRAG instances with different workspaces
        print('\nTest 11.1: Create two LightRAG instances with different workspaces')

        from lightrag import LightRAG
        from lightrag.utils import EmbeddingFunc, Tokenizer

        class _SimpleTokenizerImpl:
            """Character-level tokenizer: one token per code point."""

            def encode(self, content: str) -> list[int]:
                return [ord(ch) for ch in content]

            def decode(self, tokens: list[int]) -> str:
                return ''.join(chr(t) for t in tokens)

        tokenizer = Tokenizer('mock-tokenizer', _SimpleTokenizerImpl())

        def build_rag(workspace_name):
            """Create a LightRAG instance for *workspace_name* sharing the same working dir."""
            return LightRAG(
                working_dir=str(test_dir),
                workspace=workspace_name,
                llm_model_func=create_mock_llm_func(workspace_name),
                embedding_func=EmbeddingFunc(
                    embedding_dim=embedding_dim,
                    max_token_size=8192,
                    func=mock_embedding_func,
                ),
                tokenizer=tokenizer,
            )

        rag1 = build_rag('project_a')
        rag2 = build_rag('project_b')

        # Initialize storages
        await rag1.initialize_storages()
        await rag2.initialize_storages()

        print(' RAG1 created: workspace=project_a')
        print(' RAG2 created: workspace=project_b')

        # Test 11.2: Insert different data to each RAG instance (CONCURRENTLY)
        print('\nTest 11.2: Insert different data to each RAG instance (concurrently)')

        text_for_project_a = (
            'This document is about Artificial Intelligence and Machine Learning. AI is transforming the world.'
        )
        text_for_project_b = (
            'This document is about Deep Learning and Neural Networks. Deep learning uses multiple layers.'
        )

        # Insert to both projects concurrently to test workspace isolation under concurrent load
        print(' Starting concurrent insert operations...')
        # perf_counter is monotonic, which is what an elapsed-time measurement needs
        start_time = time.perf_counter()
        await asyncio.gather(rag1.ainsert(text_for_project_a), rag2.ainsert(text_for_project_b))
        elapsed_time = time.perf_counter() - start_time

        print(f' Inserted to project_a: {len(text_for_project_a)} chars (concurrent)')
        print(f' Inserted to project_b: {len(text_for_project_b)} chars (concurrent)')
        print(f' Total concurrent execution time: {elapsed_time:.3f}s')

        # Test 11.3: Verify file structure
        print('\nTest 11.3: Verify workspace directory structure')

        project_a_dir = test_dir / 'project_a'
        project_b_dir = test_dir / 'project_b'

        project_a_exists = project_a_dir.exists()
        project_b_exists = project_b_dir.exists()

        print(f' project_a directory: {project_a_dir}')
        print(f' project_a exists: {project_a_exists}')
        print(f' project_b directory: {project_b_dir}')
        print(f' project_b exists: {project_b_exists}')

        assert project_a_exists, 'project_a directory should exist'
        assert project_b_exists, 'project_b directory should exist'

        # List files in each directory
        _print_workspace_files('project_a', project_a_dir)
        _print_workspace_files('project_b', project_b_dir)

        print('✅ PASSED: LightRAG E2E - File Structure')
        print(' Workspace directories correctly created and separated')

        # Test 11.4: Verify data isolation by checking file contents
        print('\nTest 11.4: Verify data isolation (check file contents)')

        # Check if full_docs storage files exist and contain different content
        docs_a_file = project_a_dir / 'kv_store_full_docs.json'
        docs_b_file = project_b_dir / 'kv_store_full_docs.json'

        if docs_a_file.exists() and docs_b_file.exists():
            # Read as UTF-8 explicitly so the result does not depend on the platform locale
            docs_a_content = json.loads(docs_a_file.read_text(encoding='utf-8'))
            docs_b_content = json.loads(docs_b_file.read_text(encoding='utf-8'))

            print(f' project_a doc count: {len(docs_a_content)}')
            print(f' project_b doc count: {len(docs_b_content)}')

            # Verify they contain different data
            assert docs_a_content != docs_b_content, 'Document storage not properly isolated'

            # Verify each workspace contains its own text content
            docs_a_str = json.dumps(docs_a_content)
            docs_b_str = json.dumps(docs_b_content)

            # Check project_a contains its text and NOT project_b's text
            assert 'Artificial Intelligence' in docs_a_str, "project_a should contain 'Artificial Intelligence'"
            assert 'Machine Learning' in docs_a_str, "project_a should contain 'Machine Learning'"
            assert 'Deep Learning' not in docs_a_str, "project_a should NOT contain 'Deep Learning' from project_b"
            assert 'Neural Networks' not in docs_a_str, "project_a should NOT contain 'Neural Networks' from project_b"

            # Check project_b contains its text and NOT project_a's text
            assert 'Deep Learning' in docs_b_str, "project_b should contain 'Deep Learning'"
            assert 'Neural Networks' in docs_b_str, "project_b should contain 'Neural Networks'"
            assert 'Artificial Intelligence' not in docs_b_str, (
                "project_b should NOT contain 'Artificial Intelligence' from project_a"
            )
            # Note: "Machine Learning" might appear in project_b's text, so we skip that check

            print('✅ PASSED: LightRAG E2E - Data Isolation')
            print(' Document storage correctly isolated between workspaces')
            print(' project_a contains only its own data')
            print(' project_b contains only its own data')
        else:
            # Deliberately lenient: the content check is skipped, not failed,
            # when the storage files have not been written.
            print(' Document storage files not found (may not be created yet)')
            print('✅ PASSED: LightRAG E2E - Data Isolation')
            print(' Skipped file content check (files not created)')

        print('\n ✓ Test complete - workspace isolation verified at E2E level')

    finally:
        # Cleanup test directory (unless keep_test_artifacts is set)
        if keep_test_artifacts:
            print(f'\n Kept test directory for inspection: {test_dir}')
        elif test_dir.exists():
            shutil.rmtree(test_dir)
            print(f'\n Cleaned up test directory: {test_dir}')