Merge pull request #1809 from danielaskdd/fix-redis

Enhance Redis connection handling with retries and timeouts

Commit: 678da3e398
4 changed files with 78 additions and 11 deletions
env.example

```diff
@@ -216,6 +216,10 @@ QDRANT_URL=http://localhost:6333

 ### Redis
 REDIS_URI=redis://localhost:6379
+REDIS_SOCKET_TIMEOUT=30
+REDIS_CONNECT_TIMEOUT=10
+REDIS_MAX_CONNECTIONS=100
+REDIS_RETRY_ATTEMPTS=3
 # REDIS_WORKSPACE=forced_workspace_name

 ### Memgraph Configuration
```
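These four knobs feed the Redis client setup in the storage implementation below. As a quick illustration, here is a minimal sketch, not the PR's actual wiring, of how such variables map onto redis-py's asyncio pool; `ConnectionPool.from_url`, `max_connections`, `socket_timeout`, and `socket_connect_timeout` are redis-py's own parameters:

```python
import os

from redis.asyncio import Redis, ConnectionPool

# Build a pool from the environment variables introduced above.
# socket_timeout / socket_connect_timeout are seconds, as floats.
pool = ConnectionPool.from_url(
    os.getenv("REDIS_URI", "redis://localhost:6379"),
    max_connections=int(os.getenv("REDIS_MAX_CONNECTIONS", "100")),
    socket_timeout=float(os.getenv("REDIS_SOCKET_TIMEOUT", "30")),
    socket_connect_timeout=float(os.getenv("REDIS_CONNECT_TIMEOUT", "10")),
)
redis = Redis(connection_pool=pool)
```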
lightrag/kg/redis_impl.py

```diff
@@ -11,7 +11,7 @@ if not pm.is_installed("redis"):

 # aioredis is a deprecated library, replaced with redis
 from redis.asyncio import Redis, ConnectionPool  # type: ignore
-from redis.exceptions import RedisError, ConnectionError  # type: ignore
+from redis.exceptions import RedisError, ConnectionError, TimeoutError  # type: ignore
 from lightrag.utils import logger

 from lightrag.base import (
```
```diff
@@ -22,14 +22,35 @@ from lightrag.base import (
 )
 import json

+# Import tenacity for retry logic
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+    before_sleep_log,
+)
+
 config = configparser.ConfigParser()
 config.read("config.ini", "utf-8")

-# Constants for Redis connection pool
-MAX_CONNECTIONS = 100
-SOCKET_TIMEOUT = 5.0
-SOCKET_CONNECT_TIMEOUT = 3.0
+# Constants for Redis connection pool with environment variable support
+MAX_CONNECTIONS = int(os.getenv("REDIS_MAX_CONNECTIONS", "200"))
+SOCKET_TIMEOUT = float(os.getenv("REDIS_SOCKET_TIMEOUT", "30.0"))
+SOCKET_CONNECT_TIMEOUT = float(os.getenv("REDIS_CONNECT_TIMEOUT", "10.0"))
+RETRY_ATTEMPTS = int(os.getenv("REDIS_RETRY_ATTEMPTS", "3"))
+
+# Tenacity retry decorator for Redis operations
+redis_retry = retry(
+    stop=stop_after_attempt(RETRY_ATTEMPTS),
+    wait=wait_exponential(multiplier=1, min=1, max=8),
+    retry=(
+        retry_if_exception_type(ConnectionError)
+        | retry_if_exception_type(TimeoutError)
+        | retry_if_exception_type(RedisError)
+    ),
+    before_sleep=before_sleep_log(logger, "WARNING"),
+)


 class RedisConnectionManager:
```
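For reference, the same tenacity pattern as a self-contained, runnable sketch. Two details worth noting: redis-py's `ConnectionError` and `TimeoutError` both subclass `RedisError`, so the `RedisError` clause above already covers them; and tenacity's `before_sleep_log` expects a numeric level such as `logging.WARNING`, which the sketch passes instead of the string `"WARNING"` used above. The sketch retries on Python's built-in exception types, and `flaky_ping` is a hypothetical stand-in for a Redis call:

```python
import asyncio
import logging

from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

logger = logging.getLogger(__name__)

# Same policy as above: three attempts, exponential backoff between
# 1 s and 8 s, retrying only on connection-style failures.
demo_retry = retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=8),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)

@demo_retry
async def flaky_ping() -> None:
    # Hypothetical stand-in for a real Redis command.
    raise ConnectionError("transient network failure")

# asyncio.run(flaky_ping())  # logs two warnings, then raises tenacity.RetryError
```

Because neither the diff nor the sketch sets `reraise=True`, tenacity wraps the final failure in a `RetryError` once the attempts are exhausted.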
```diff
@@ -220,6 +241,7 @@ class RedisKVStorage(BaseKVStorage):
         """Ensure Redis resources are cleaned up when exiting context."""
         await self.close()

+    @redis_retry
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         async with self._get_redis_connection() as redis:
             try:
```
```diff
@@ -235,6 +257,7 @@ class RedisKVStorage(BaseKVStorage):
                 logger.error(f"JSON decode error for id {id}: {e}")
                 return None

+    @redis_retry
     async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
         async with self._get_redis_connection() as redis:
             try:
```
```diff
@@ -311,6 +334,7 @@ class RedisKVStorage(BaseKVStorage):
         existing_ids = {keys_list[i] for i, exists in enumerate(results) if exists}
         return set(keys) - existing_ids

+    @redis_retry
     async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
         if not data:
             return
```
```diff
@@ -790,6 +814,7 @@ class RedisDocStatusStorage(DocStatusStorage):
         """Redis handles persistence automatically"""
         pass

+    @redis_retry
     async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
         """Insert or update document status data"""
         if not data:
```
```diff
@@ -811,6 +836,7 @@ class RedisDocStatusStorage(DocStatusStorage):
             logger.error(f"JSON decode error during upsert: {e}")
             raise

+    @redis_retry
     async def get_by_id(self, id: str) -> Union[dict[str, Any], None]:
         async with self._get_redis_connection() as redis:
             try:
```
lightrag/lightrag.py

```diff
@@ -1127,7 +1127,7 @@ class LightRAG:
                 }
             )

-            # Concurrency is controlled by graph db lock for individual entities and relationships
+            # Concurrency is controlled by keyed lock for individual entities and relationships
             if file_extraction_stage_ok:
                 try:
                     # Get chunk_results from entity_relation_task
```
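The keyed lock that the updated comment refers to can be pictured with a minimal sketch (a hypothetical helper; the project's actual one is `get_storage_keyed_lock`, which also takes a namespace): one `asyncio.Lock` per key, so work on different entities proceeds in parallel while writes to the same entity serialize:

```python
import asyncio
from collections import defaultdict

# One asyncio.Lock per key: operations on different keys run in parallel,
# while operations sharing a key queue behind the same lock.
_locks: defaultdict[str, asyncio.Lock] = defaultdict(asyncio.Lock)

async def update_entity(name: str) -> None:
    async with _locks[name]:
        # Critical section: read-modify-write for this one entity.
        await asyncio.sleep(0)

# await asyncio.gather(update_entity("apple"), update_entity("apple"))
# The second call waits for the first; update_entity("banana") would not.
```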
lightrag/operate.py

```diff
@@ -480,8 +480,25 @@ async def _rebuild_knowledge_from_chunks(
         pipeline_status["latest_message"] = status_message
         pipeline_status["history_messages"].append(status_message)

-    # Execute all tasks in parallel with semaphore control
-    await asyncio.gather(*tasks)
+    # Execute all tasks in parallel with semaphore control and early failure detection
+    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
+
+    # Check if any task raised an exception
+    for task in done:
+        if task.exception():
+            # If a task failed, cancel all pending tasks
+            for pending_task in pending:
+                pending_task.cancel()
+
+            # Wait for cancellation to complete
+            if pending:
+                await asyncio.wait(pending)
+
+            # Re-raise the exception to notify the caller
+            raise task.exception()
+
+    # If all tasks completed successfully, collect results
+    # (No need to collect results since these tasks don't return values)

     # Final status report
     status_message = f"KG rebuild completed: {rebuilt_entities_count} entities and {rebuilt_relationships_count} relationships rebuilt successfully."
```
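The motivation for the change: with default settings, `asyncio.gather(*tasks)` propagates the first exception to the awaiter but leaves the remaining tasks running in the background, whereas `asyncio.wait(..., return_when=asyncio.FIRST_EXCEPTION)` returns at the first failure so the rest can be cancelled explicitly before re-raising. A runnable sketch of the pattern, with a hypothetical `worker` standing in for one rebuild task:

```python
import asyncio

async def worker(i: int) -> None:
    # Hypothetical stand-in for one rebuild task.
    if i == 2:
        raise RuntimeError(f"task {i} failed")
    await asyncio.sleep(i)

async def run_all() -> None:
    tasks = [asyncio.create_task(worker(i)) for i in range(5)]
    # Return as soon as any task raises, instead of waiting for all of them.
    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
    for task in done:
        if task.exception():
            # Cancel the stragglers, let the cancellations settle, then re-raise.
            for p in pending:
                p.cancel()
            if pending:
                await asyncio.wait(pending)
            raise task.exception()

# asyncio.run(run_all())  # raises RuntimeError: task 2 failed
```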
```diff
@@ -1262,8 +1279,11 @@ async def merge_nodes_and_edges(
         async with semaphore:
             workspace = global_config.get("workspace", "")
             namespace = f"{workspace}:GraphDB" if workspace else "GraphDB"
+            # Sort the edge_key components to ensure consistent lock key generation
+            sorted_edge_key = sorted([edge_key[0], edge_key[1]])
+            logger.info(f"Processing edge: {sorted_edge_key[0]} - {sorted_edge_key[1]}")
             async with get_storage_keyed_lock(
-                f"{edge_key[0]}-{edge_key[1]}",
+                f"{sorted_edge_key[0]}-{sorted_edge_key[1]}",
                 namespace=namespace,
                 enable_logging=False,
             ):
```
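Why the sort matters: the same undirected edge can arrive as `(a, b)` in one task and `(b, a)` in another, and without a canonical order the two spellings yield different lock keys, letting two coroutines merge the same edge concurrently. A minimal illustration of the canonical key:

```python
# Without a canonical order, ("apple", "banana") and ("banana", "apple")
# would produce two different lock keys for the same undirected edge.
def edge_lock_key(a: str, b: str) -> str:
    u, v = sorted((a, b))
    return f"{u}-{v}"

assert edge_lock_key("banana", "apple") == edge_lock_key("apple", "banana") == "apple-banana"
```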
```diff
@@ -1310,8 +1330,25 @@ async def merge_nodes_and_edges(
     for edge_key, edges in all_edges.items():
         tasks.append(asyncio.create_task(_locked_process_edges(edge_key, edges)))

-    # Execute all tasks in parallel with semaphore control
-    await asyncio.gather(*tasks)
+    # Execute all tasks in parallel with semaphore control and early failure detection
+    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
+
+    # Check if any task raised an exception
+    for task in done:
+        if task.exception():
+            # If a task failed, cancel all pending tasks
+            for pending_task in pending:
+                pending_task.cancel()
+
+            # Wait for cancellation to complete
+            if pending:
+                await asyncio.wait(pending)
+
+            # Re-raise the exception to notify the caller
+            raise task.exception()
+
+    # If all tasks completed successfully, collect results
+    # (No need to collect results since these tasks don't return values)


 async def extract_entities(
```