Add VDB error handling with retries for data consistency

- Add safe_vdb_operation_with_exception util
- Wrap VDB ops in entity/relationship code
- Ensure exceptions propagate on failure
- Add retry logic with configurable delays
This commit is contained in:
yangdx 2025-09-03 21:15:09 +08:00
parent 61fb2444f0
commit 7ef2f0dff6
2 changed files with 233 additions and 81 deletions

View file

@ -30,6 +30,7 @@ from .utils import (
pick_by_vector_similarity, pick_by_vector_similarity,
process_chunks_unified, process_chunks_unified,
build_file_path, build_file_path,
safe_vdb_operation_with_exception,
) )
from .base import ( from .base import (
BaseGraphStorage, BaseGraphStorage,
@ -930,7 +931,8 @@ async def _rebuild_single_entity(
async def _update_entity_storage( async def _update_entity_storage(
final_description: str, entity_type: str, file_paths: set[str] final_description: str, entity_type: str, file_paths: set[str]
): ):
# Update entity in graph storage try:
# Update entity in graph storage (critical path)
updated_entity_data = { updated_entity_data = {
**current_entity, **current_entity,
"description": final_description, "description": final_description,
@ -942,12 +944,11 @@ async def _rebuild_single_entity(
} }
await knowledge_graph_inst.upsert_node(entity_name, updated_entity_data) await knowledge_graph_inst.upsert_node(entity_name, updated_entity_data)
# Update entity in vector database # Update entity in vector database (equally critical)
entity_vdb_id = compute_mdhash_id(entity_name, prefix="ent-") entity_vdb_id = compute_mdhash_id(entity_name, prefix="ent-")
entity_content = f"{entity_name}\n{final_description}" entity_content = f"{entity_name}\n{final_description}"
await entities_vdb.upsert(
{ vdb_data = {
entity_vdb_id: { entity_vdb_id: {
"content": entity_content, "content": entity_content,
"entity_name": entity_name, "entity_name": entity_name,
@ -957,8 +958,21 @@ async def _rebuild_single_entity(
"file_path": updated_entity_data["file_path"], "file_path": updated_entity_data["file_path"],
} }
} }
# Use safe operation wrapper - VDB failure must throw exception
await safe_vdb_operation_with_exception(
operation=lambda: entities_vdb.upsert(vdb_data),
operation_name="rebuild_entity_upsert",
entity_name=entity_name,
max_retries=3,
retry_delay=0.1,
) )
except Exception as e:
error_msg = f"Failed to update entity storage for `{entity_name}`: {e}"
logger.error(error_msg)
raise # Re-raise exception
# Collect all entity data from relevant chunks # Collect all entity data from relevant chunks
all_entity_data = [] all_entity_data = []
for chunk_id in chunk_ids: for chunk_id in chunk_ids:
@ -1145,6 +1159,7 @@ async def _rebuild_single_relationship(
await knowledge_graph_inst.upsert_edge(src, tgt, updated_relationship_data) await knowledge_graph_inst.upsert_edge(src, tgt, updated_relationship_data)
# Update relationship in vector database # Update relationship in vector database
try:
rel_vdb_id = compute_mdhash_id(src + tgt, prefix="rel-") rel_vdb_id = compute_mdhash_id(src + tgt, prefix="rel-")
rel_vdb_id_reverse = compute_mdhash_id(tgt + src, prefix="rel-") rel_vdb_id_reverse = compute_mdhash_id(tgt + src, prefix="rel-")
@ -1158,8 +1173,7 @@ async def _rebuild_single_relationship(
# Insert new vector record # Insert new vector record
rel_content = f"{combined_keywords}\t{src}\n{tgt}\n{final_description}" rel_content = f"{combined_keywords}\t{src}\n{tgt}\n{final_description}"
await relationships_vdb.upsert( vdb_data = {
{
rel_vdb_id: { rel_vdb_id: {
"src_id": src, "src_id": src,
"tgt_id": tgt, "tgt_id": tgt,
@ -1171,8 +1185,21 @@ async def _rebuild_single_relationship(
"file_path": updated_relationship_data["file_path"], "file_path": updated_relationship_data["file_path"],
} }
} }
# Use safe operation wrapper - VDB failure must throw exception
await safe_vdb_operation_with_exception(
operation=lambda: relationships_vdb.upsert(vdb_data),
operation_name="rebuild_relationship_upsert",
entity_name=f"{src}-{tgt}",
max_retries=3,
retry_delay=0.2,
) )
except Exception as e:
error_msg = f"Failed to rebuild relationship storage for `{src}-{tgt}`: {e}"
logger.error(error_msg)
raise # Re-raise exception
async def _merge_nodes_then_upsert( async def _merge_nodes_then_upsert(
entity_name: str, entity_name: str,
@ -1516,6 +1543,8 @@ async def merge_nodes_and_edges(
async with get_storage_keyed_lock( async with get_storage_keyed_lock(
[entity_name], namespace=namespace, enable_logging=False [entity_name], namespace=namespace, enable_logging=False
): ):
try:
# Graph database operation (critical path, must succeed)
entity_data = await _merge_nodes_then_upsert( entity_data = await _merge_nodes_then_upsert(
entity_name, entity_name,
entities, entities,
@ -1525,19 +1554,58 @@ async def merge_nodes_and_edges(
pipeline_status_lock, pipeline_status_lock,
llm_response_cache, llm_response_cache,
) )
if entity_vdb is not None:
# Vector database operation (equally critical, must succeed)
if entity_vdb is not None and entity_data:
data_for_vdb = { data_for_vdb = {
compute_mdhash_id(entity_data["entity_name"], prefix="ent-"): { compute_mdhash_id(
entity_data["entity_name"], prefix="ent-"
): {
"entity_name": entity_data["entity_name"], "entity_name": entity_data["entity_name"],
"entity_type": entity_data["entity_type"], "entity_type": entity_data["entity_type"],
"content": f"{entity_data['entity_name']}\n{entity_data['description']}", "content": f"{entity_data['entity_name']}\n{entity_data['description']}",
"source_id": entity_data["source_id"], "source_id": entity_data["source_id"],
"file_path": entity_data.get("file_path", "unknown_source"), "file_path": entity_data.get(
"file_path", "unknown_source"
),
} }
} }
await entity_vdb.upsert(data_for_vdb)
# Use safe operation wrapper - VDB failure must throw exception
await safe_vdb_operation_with_exception(
operation=lambda: entity_vdb.upsert(data_for_vdb),
operation_name="entity_upsert",
entity_name=entity_name,
max_retries=3,
retry_delay=0.1,
)
return entity_data return entity_data
except Exception as e:
# Any database operation failure is critical
error_msg = (
f"Critical error in entity processing for `{entity_name}`: {e}"
)
logger.error(error_msg)
# Try to update pipeline status, but don't let status update failure affect main exception
try:
if (
pipeline_status is not None
and pipeline_status_lock is not None
):
async with pipeline_status_lock:
pipeline_status["latest_message"] = error_msg
pipeline_status["history_messages"].append(error_msg)
except Exception as status_error:
logger.error(
f"Failed to update pipeline status: {status_error}"
)
# Re-raise the original exception
raise
# Create entity processing tasks # Create entity processing tasks
entity_tasks = [] entity_tasks = []
for entity_name, entities in all_nodes.items(): for entity_name, entities in all_nodes.items():
@ -1584,7 +1652,10 @@ async def merge_nodes_and_edges(
namespace=namespace, namespace=namespace,
enable_logging=False, enable_logging=False,
): ):
try:
added_entities = [] # Track entities added during edge processing added_entities = [] # Track entities added during edge processing
# Graph database operation (critical path, must succeed)
edge_data = await _merge_edges_then_upsert( edge_data = await _merge_edges_then_upsert(
edge_key[0], edge_key[0],
edge_key[1], edge_key[1],
@ -1600,6 +1671,7 @@ async def merge_nodes_and_edges(
if edge_data is None: if edge_data is None:
return None, [] return None, []
# Vector database operation (equally critical, must succeed)
if relationships_vdb is not None: if relationships_vdb is not None:
data_for_vdb = { data_for_vdb = {
compute_mdhash_id( compute_mdhash_id(
@ -1610,13 +1682,46 @@ async def merge_nodes_and_edges(
"keywords": edge_data["keywords"], "keywords": edge_data["keywords"],
"content": f"{edge_data['src_id']}\t{edge_data['tgt_id']}\n{edge_data['keywords']}\n{edge_data['description']}", "content": f"{edge_data['src_id']}\t{edge_data['tgt_id']}\n{edge_data['keywords']}\n{edge_data['description']}",
"source_id": edge_data["source_id"], "source_id": edge_data["source_id"],
"file_path": edge_data.get("file_path", "unknown_source"), "file_path": edge_data.get(
"file_path", "unknown_source"
),
"weight": edge_data.get("weight", 1.0), "weight": edge_data.get("weight", 1.0),
} }
} }
await relationships_vdb.upsert(data_for_vdb)
# Use safe operation wrapper - VDB failure must throw exception
await safe_vdb_operation_with_exception(
operation=lambda: relationships_vdb.upsert(data_for_vdb),
operation_name="relationship_upsert",
entity_name=f"{edge_data['src_id']}-{edge_data['tgt_id']}",
max_retries=3,
retry_delay=0.1,
)
return edge_data, added_entities return edge_data, added_entities
except Exception as e:
# Any database operation failure is critical
error_msg = f"Critical error in relationship processing for `{sorted_edge_key}`: {e}"
logger.error(error_msg)
# Try to update pipeline status, but don't let status update failure affect main exception
try:
if (
pipeline_status is not None
and pipeline_status_lock is not None
):
async with pipeline_status_lock:
pipeline_status["latest_message"] = error_msg
pipeline_status["history_messages"].append(error_msg)
except Exception as status_error:
logger.error(
f"Failed to update pipeline status: {status_error}"
)
# Re-raise the original exception
raise
# Create relationship processing tasks # Create relationship processing tasks
edge_tasks = [] edge_tasks = []
for edge_key, edges in all_edges.items(): for edge_key, edges in all_edges.items():

View file

@ -14,7 +14,7 @@ from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from functools import wraps from functools import wraps
from hashlib import md5 from hashlib import md5
from typing import Any, Protocol, Callable, TYPE_CHECKING, List from typing import Any, Protocol, Callable, TYPE_CHECKING, List, Optional
import numpy as np import numpy as np
from dotenv import load_dotenv from dotenv import load_dotenv
@ -57,6 +57,53 @@ except ImportError:
) )
async def safe_vdb_operation_with_exception(
    operation: Callable,
    operation_name: str,
    entity_name: str = "",
    max_retries: int = 3,
    retry_delay: float = 0.2,
    logger_func: Optional[Callable] = None
) -> None:
    """
    Safely execute a vector database operation with retries and mandatory failure propagation.

    The operation is attempted up to ``max_retries`` times. Transient failures are
    logged and retried after ``retry_delay`` seconds. If every attempt fails, the
    ORIGINAL exception from the final attempt is re-raised (type and traceback
    preserved) so callers can still match on its concrete class — wrapping it in a
    bare ``Exception`` would destroy that information.

    Args:
        operation: Zero-argument callable returning an awaitable (the VDB call),
            e.g. ``lambda: vdb.upsert(data)``.
        operation_name: Operation name used in log messages.
        entity_name: Entity/relationship identifier used in log messages.
        max_retries: Maximum number of attempts (must be >= 1 for the operation
            to run at all).
        retry_delay: Delay in seconds between attempts; 0 disables sleeping.
        logger_func: Logging callable taking a single message string; defaults
            to ``logger.warning``.

    Raises:
        Exception: The original exception from the last failed attempt, or a
            generic ``Exception`` when ``max_retries <= 0`` (no attempt made).
    """
    log_func = logger_func or logger.warning
    last_exception: Optional[Exception] = None

    for attempt in range(max_retries):
        try:
            await operation()
            return  # Success, return immediately
        except Exception as e:
            last_exception = e
            if attempt == max_retries - 1:
                # Final attempt failed: log, then re-raise the ORIGINAL exception
                # so upstream handlers keep its concrete type and traceback.
                log_func(
                    f"VDB {operation_name} failed for {entity_name} after {max_retries} attempts: {e}"
                )
                raise
            log_func(
                f"VDB {operation_name} attempt {attempt + 1} failed for {entity_name}: {e}, retrying..."
            )
            if retry_delay > 0:
                await asyncio.sleep(retry_delay)

    # Reachable only when max_retries <= 0: the loop body never ran, so no
    # operation was attempted at all. Fail loudly rather than silently succeed.
    raise Exception(f"Max retries exceeded for {operation_name}") from last_exception
def get_env_value( def get_env_value(
env_key: str, default: any, value_type: type = str, special_none: bool = False env_key: str, default: any, value_type: type = str, special_none: bool = False
) -> any: ) -> any: