fix: prevent double-release in UnifiedLock.__aexit__ error recovery

Problem: When UnifiedLock.__aexit__ encountered an exception during async_lock.release(), the error recovery logic would incorrectly attempt to release async_lock again because it only checked main_lock_released flag. This could cause: - Double-release attempts on already-failed locks - Masking of original exceptions - Undefined behavior in lock state Root Cause: The recovery logic used only main_lock_released to determine whether to attempt async_lock release, without tracking whether async_lock.release() had already been attempted and failed. Fix: - Added async_lock_released flag to track async_lock release attempts - Updated recovery logic condition to check both main_lock_released AND async_lock_released before attempting async_lock release - This ensures async_lock.release() is only called once, even if it fails Testing: - Added test_aexit_no_double_release_on_async_lock_failure: Verifies async_lock.release() is called only once when it fails - Added test_aexit_recovery_on_main_lock_failure: Verifies recovery logic still works when main lock fails - All 5 UnifiedLock safety tests pass Impact: - Eliminates double-release bugs in multiprocess lock scenarios - Preserves correct error propagation - Maintains recovery logic for legitimate failure cases Files Modified: - lightrag/kg/shared_storage.py: Added async_lock_released tracking - tests/test_unified_lock_safety.py: Added 2 new tests (5 total now pass)
2025-11-23 16:34:08 +08:00 · 2025-11-23 16:34:08 +08:00 · 204a2535c8
commit 204a2535c8
parent 49bbb3a4d7
2 changed files with 108 additions and 2 deletions
--- a/lightrag/kg/shared_storage.py
+++ b/lightrag/kg/shared_storage.py
@ -206,6 +206,7 @@ class UnifiedLock(Generic[T]):

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        main_lock_released = False
+        async_lock_released = False
        try:
            # Release main lock first
            if self._lock is not None:
@ -229,6 +230,7 @@ class UnifiedLock(Generic[T]):
                    level="DEBUG",
                    enable_output=self._enable_logging,
                )
+                async_lock_released = True

        except Exception as e:
            direct_log(
@ -237,9 +239,10 @@ class UnifiedLock(Generic[T]):
                enable_output=True,
            )

-            # If main lock release failed but async lock hasn't been released, try to release it
+            # If main lock release failed but async lock hasn't been attempted yet, try to release it
            if (
                not main_lock_released
+                and not async_lock_released
                and not self._is_async
                and self._async_lock is not None
            ):
--- a/tests/test_unified_lock_safety.py
+++ b/tests/test_unified_lock_safety.py
@ -5,12 +5,16 @@ This test module verifies that UnifiedLock raises RuntimeError instead of
 allowing unprotected execution when the underlying lock is None, preventing
 false security and potential race conditions.

-Critical Bug: When self._lock is None, __aenter__ used to log WARNING but
+Critical Bug 1: When self._lock is None, __aenter__ used to log WARNING but
 still return successfully, allowing critical sections to run without lock
 protection, causing race conditions and data corruption.
+
+Critical Bug 2: In __aexit__, when async_lock.release() fails, the error
+recovery logic would attempt to release it again, causing double-release issues.
 """

 import pytest
+from unittest.mock import MagicMock, AsyncMock
 from lightrag.kg.shared_storage import UnifiedLock


@ -86,3 +90,102 @@ class TestUnifiedLockSafety:
            or "lock" in error_message.lower()
        )
        assert "initialize_share_data" in error_message or "None" in error_message
+
+    @pytest.mark.asyncio
+    async def test_aexit_no_double_release_on_async_lock_failure(self):
+        """
+        Test that __aexit__ doesn't attempt to release async_lock twice when it fails.
+
+        Scenario: async_lock.release() fails during normal release.
+        Expected: Recovery logic should NOT attempt to release async_lock again,
+                  preventing double-release issues.
+
+        This tests Bug 2 fix: async_lock_released tracking prevents double release.
+        """
+        # Create mock locks
+        main_lock = MagicMock()
+        main_lock.acquire = MagicMock()
+        main_lock.release = MagicMock()
+
+        async_lock = AsyncMock()
+        async_lock.acquire = AsyncMock()
+
+        # Make async_lock.release() fail
+        release_call_count = 0
+
+        def mock_release_fail():
+            nonlocal release_call_count
+            release_call_count += 1
+            raise RuntimeError("Async lock release failed")
+
+        async_lock.release = MagicMock(side_effect=mock_release_fail)
+
+        # Create UnifiedLock with both locks (sync mode with async_lock)
+        lock = UnifiedLock(
+            lock=main_lock,
+            is_async=False,
+            name="test_double_release",
+            enable_logging=False,
+        )
+        lock._async_lock = async_lock
+
+        # Try to use the lock - should fail during __aexit__
+        try:
+            async with lock:
+                pass
+        except RuntimeError as e:
+            # Should get the async lock release error
+            assert "Async lock release failed" in str(e)
+
+        # Verify async_lock.release() was called only ONCE, not twice
+        assert release_call_count == 1, (
+            f"async_lock.release() should be called only once, but was called {release_call_count} times"
+        )
+
+        # Main lock should have been released successfully
+        main_lock.release.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_aexit_recovery_on_main_lock_failure(self):
+        """
+        Test that __aexit__ recovery logic works when main lock release fails.
+
+        Scenario: main_lock.release() fails before async_lock is attempted.
+        Expected: Recovery logic should attempt to release async_lock to prevent
+                  resource leaks.
+
+        This verifies the recovery logic still works correctly with async_lock_released tracking.
+        """
+        # Create mock locks
+        main_lock = MagicMock()
+        main_lock.acquire = MagicMock()
+
+        # Make main_lock.release() fail
+        def mock_main_release_fail():
+            raise RuntimeError("Main lock release failed")
+
+        main_lock.release = MagicMock(side_effect=mock_main_release_fail)
+
+        async_lock = AsyncMock()
+        async_lock.acquire = AsyncMock()
+        async_lock.release = MagicMock()
+
+        # Create UnifiedLock with both locks (sync mode with async_lock)
+        lock = UnifiedLock(
+            lock=main_lock, is_async=False, name="test_recovery", enable_logging=False
+        )
+        lock._async_lock = async_lock
+
+        # Try to use the lock - should fail during __aexit__
+        try:
+            async with lock:
+                pass
+        except RuntimeError as e:
+            # Should get the main lock release error
+            assert "Main lock release failed" in str(e)
+
+        # Main lock release should have been attempted
+        main_lock.release.assert_called_once()
+
+        # Recovery logic should have attempted to release async_lock
+        async_lock.release.assert_called_once()