Merge pull request #1809 from danielaskdd/fix-redis

Enhance Redis connection handling with retries and timeouts
Daniel.y 2025-07-19 11:52:25 +08:00 committed by GitHub
commit 678da3e398
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 78 additions and 11 deletions

View file

@@ -216,6 +216,10 @@ QDRANT_URL=http://localhost:6333
### Redis
REDIS_URI=redis://localhost:6379
REDIS_SOCKET_TIMEOUT=30
REDIS_CONNECT_TIMEOUT=10
REDIS_MAX_CONNECTIONS=100
REDIS_RETRY_ATTEMPTS=3
# REDIS_WORKSPACE=forced_workspace_name
### Memgraph Configuration
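
The four new variables feed the connection-pool settings introduced in the Redis storage changes below. As a rough, self-contained sketch of that mapping (parameter names are redis-py's ConnectionPool arguments; the pool construction is simplified and not the project's exact wiring):

import os
from redis.asyncio import ConnectionPool, Redis

# Defaults mirror the values suggested above; REDIS_RETRY_ATTEMPTS is consumed
# by the retry decorator in the storage module, not by the pool itself.
pool = ConnectionPool.from_url(
    os.getenv("REDIS_URI", "redis://localhost:6379"),
    max_connections=int(os.getenv("REDIS_MAX_CONNECTIONS", "100")),
    socket_timeout=float(os.getenv("REDIS_SOCKET_TIMEOUT", "30")),
    socket_connect_timeout=float(os.getenv("REDIS_CONNECT_TIMEOUT", "10")),
)
redis_client = Redis(connection_pool=pool)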

View file

@@ -11,7 +11,7 @@ if not pm.is_installed("redis"):
# aioredis is a deprecated library, replaced with redis
from redis.asyncio import Redis, ConnectionPool # type: ignore
from redis.exceptions import RedisError, ConnectionError # type: ignore
from redis.exceptions import RedisError, ConnectionError, TimeoutError # type: ignore
from lightrag.utils import logger
from lightrag.base import (
@@ -22,14 +22,35 @@ from lightrag.base import (
)
import json
# Import tenacity for retry logic
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
before_sleep_log,
)
config = configparser.ConfigParser()
config.read("config.ini", "utf-8")
# Constants for Redis connection pool
MAX_CONNECTIONS = 100
SOCKET_TIMEOUT = 5.0
SOCKET_CONNECT_TIMEOUT = 3.0
# Constants for Redis connection pool with environment variable support
MAX_CONNECTIONS = int(os.getenv("REDIS_MAX_CONNECTIONS", "200"))
SOCKET_TIMEOUT = float(os.getenv("REDIS_SOCKET_TIMEOUT", "30.0"))
SOCKET_CONNECT_TIMEOUT = float(os.getenv("REDIS_CONNECT_TIMEOUT", "10.0"))
RETRY_ATTEMPTS = int(os.getenv("REDIS_RETRY_ATTEMPTS", "3"))
# Tenacity retry decorator for Redis operations
redis_retry = retry(
stop=stop_after_attempt(RETRY_ATTEMPTS),
wait=wait_exponential(multiplier=1, min=1, max=8),
retry=(
retry_if_exception_type(ConnectionError)
| retry_if_exception_type(TimeoutError)
| retry_if_exception_type(RedisError)
),
before_sleep=before_sleep_log(logger, "WARNING"),
)
class RedisConnectionManager:
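
For reference, the retry policy above in a self-contained form. The ping helper, the demo logger, and the local URL are illustrative assumptions, not part of this PR; a reachable Redis at the default address is assumed. With wait_exponential(multiplier=1, min=1, max=8), the waits between attempts are roughly 1s, 2s, 4s, capped at 8s, and the last error propagates once the attempt budget is exhausted.

import asyncio
import logging

from redis.asyncio import Redis
from redis.exceptions import ConnectionError, RedisError, TimeoutError
from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

logger = logging.getLogger("redis_retry_demo")

redis_retry = retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=8),
    retry=retry_if_exception_type((ConnectionError, TimeoutError, RedisError)),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)

@redis_retry
async def ping(client: Redis) -> bool:
    # Transient connection/timeout errors trigger a retried attempt with backoff;
    # after the final attempt the exception propagates to the caller.
    return await client.ping()

async def main() -> None:
    client = Redis.from_url("redis://localhost:6379")
    try:
        print(await ping(client))
    finally:
        await client.aclose()  # redis-py 5+; use close() on older versions

asyncio.run(main())
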
@@ -220,6 +241,7 @@ class RedisKVStorage(BaseKVStorage):
"""Ensure Redis resources are cleaned up when exiting context."""
await self.close()
@redis_retry
async def get_by_id(self, id: str) -> dict[str, Any] | None:
async with self._get_redis_connection() as redis:
try:
@@ -235,6 +257,7 @@ class RedisKVStorage(BaseKVStorage):
logger.error(f"JSON decode error for id {id}: {e}")
return None
@redis_retry
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
async with self._get_redis_connection() as redis:
try:
@@ -311,6 +334,7 @@ class RedisKVStorage(BaseKVStorage):
existing_ids = {keys_list[i] for i, exists in enumerate(results) if exists}
return set(keys) - existing_ids
@redis_retry
async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
if not data:
return
@@ -790,6 +814,7 @@ class RedisDocStatusStorage(DocStatusStorage):
"""Redis handles persistence automatically"""
pass
@redis_retry
async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
"""Insert or update document status data"""
if not data:
@@ -811,6 +836,7 @@ class RedisDocStatusStorage(DocStatusStorage):
logger.error(f"JSON decode error during upsert: {e}")
raise
@redis_retry
async def get_by_id(self, id: str) -> Union[dict[str, Any], None]:
async with self._get_redis_connection() as redis:
try:

View file

@@ -1127,7 +1127,7 @@ class LightRAG:
}
)
# Concurrency is controlled by graph db lock for individual entities and relationships
# Concurrency is controlled by keyed lock for individual entities and relationships
if file_extraction_stage_ok:
try:
# Get chunk_results from entity_relation_task

View file

@@ -480,8 +480,25 @@ async def _rebuild_knowledge_from_chunks(
pipeline_status["latest_message"] = status_message
pipeline_status["history_messages"].append(status_message)
# Execute all tasks in parallel with semaphore control
await asyncio.gather(*tasks)
# Execute all tasks in parallel with semaphore control and early failure detection
done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
# Check if any task raised an exception
for task in done:
if task.exception():
# If a task failed, cancel all pending tasks
for pending_task in pending:
pending_task.cancel()
# Wait for cancellation to complete
if pending:
await asyncio.wait(pending)
# Re-raise the exception to notify the caller
raise task.exception()
# If all tasks completed successfully, collect results
# (No need to collect results since these tasks don't return values)
# Final status report
status_message = f"KG rebuild completed: {rebuilt_entities_count} entities and {rebuilt_relationships_count} relationships rebuilt successfully."
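
The fail-fast pattern that replaces asyncio.gather above, isolated into a runnable demo (the worker coroutine and its failure are invented for illustration): asyncio.wait returns as soon as any task raises, the remaining tasks are cancelled and awaited, and the original exception is re-raised to the caller.

import asyncio

async def worker(i: int) -> None:
    # Task 2 fails quickly; the others would otherwise run for a full second.
    if i == 2:
        await asyncio.sleep(0.1)
        raise RuntimeError(f"task {i} failed")
    await asyncio.sleep(1.0)

async def run_all() -> None:
    tasks = [asyncio.create_task(worker(i)) for i in range(5)]
    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
    for task in done:
        if task.exception():
            for pending_task in pending:
                pending_task.cancel()
            if pending:
                # Let the cancellations settle before propagating the error.
                await asyncio.wait(pending)
            raise task.exception()

asyncio.run(run_all())  # raises RuntimeError: task 2 failed
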
@@ -1262,8 +1279,11 @@ async def merge_nodes_and_edges(
async with semaphore:
workspace = global_config.get("workspace", "")
namespace = f"{workspace}:GraphDB" if workspace else "GraphDB"
# Sort the edge_key components to ensure consistent lock key generation
sorted_edge_key = sorted([edge_key[0], edge_key[1]])
logger.info(f"Processing edge: {sorted_edge_key[0]} - {sorted_edge_key[1]}")
async with get_storage_keyed_lock(
f"{edge_key[0]}-{edge_key[1]}",
f"{sorted_edge_key[0]}-{sorted_edge_key[1]}",
namespace=namespace,
enable_logging=False,
):
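
The effect of sorting the edge components, shown in isolation (entity names are made up; get_storage_keyed_lock itself is unchanged): the same undirected edge may arrive as (A, B) in one task and as (B, A) in another, and sorting makes both forms produce the same lock key, so concurrent tasks always contend on a single lock for that edge.

def edge_lock_key(edge_key: tuple[str, str]) -> str:
    # Order-insensitive key: ("Alice", "Bob") and ("Bob", "Alice") lock the same resource.
    sorted_edge_key = sorted([edge_key[0], edge_key[1]])
    return f"{sorted_edge_key[0]}-{sorted_edge_key[1]}"

assert edge_lock_key(("Alice", "Bob")) == edge_lock_key(("Bob", "Alice")) == "Alice-Bob"
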
@@ -1310,8 +1330,25 @@
for edge_key, edges in all_edges.items():
tasks.append(asyncio.create_task(_locked_process_edges(edge_key, edges)))
# Execute all tasks in parallel with semaphore control
await asyncio.gather(*tasks)
# Execute all tasks in parallel with semaphore control and early failure detection
done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
# Check if any task raised an exception
for task in done:
if task.exception():
# If a task failed, cancel all pending tasks
for pending_task in pending:
pending_task.cancel()
# Wait for cancellation to complete
if pending:
await asyncio.wait(pending)
# Re-raise the exception to notify the caller
raise task.exception()
# If all tasks completed successfully, collect results
# (No need to collect results since these tasks don't return values)
async def extract_entities(