feat: Ensure thread safety for graph write operations

Add a lock to delete, adelete_by_entity, and adelete_by_relation methods to prevent race conditions and ensure data consistency during concurrent modifications to the knowledge graph.
This commit is contained in:
yangdx 2025-06-23 09:57:56 +08:00
parent c947b20bb1
commit 9fae0eadff

View file

@@ -35,6 +35,7 @@ from lightrag.kg import (
from lightrag.kg.shared_storage import ( from lightrag.kg.shared_storage import (
get_namespace_data, get_namespace_data,
get_pipeline_status_lock, get_pipeline_status_lock,
get_graph_db_lock,
) )
from .base import ( from .base import (
@@ -1728,109 +1729,113 @@ class LightRAG:
relationships_to_delete = set() relationships_to_delete = set()
relationships_to_rebuild = {} # (src, tgt) -> remaining_chunk_ids relationships_to_rebuild = {} # (src, tgt) -> remaining_chunk_ids
# Process entities # Use graph database lock to ensure atomic merges and updates
all_labels = await self.chunk_entity_relation_graph.get_all_labels() graph_db_lock = get_graph_db_lock(enable_logging=False)
for node_label in all_labels: async with graph_db_lock:
node_data = await self.chunk_entity_relation_graph.get_node(node_label)
if node_data and "source_id" in node_data:
# Split source_id using GRAPH_FIELD_SEP
sources = set(node_data["source_id"].split(GRAPH_FIELD_SEP))
remaining_sources = sources - chunk_ids
if not remaining_sources: # Process entities
entities_to_delete.add(node_label) all_labels = await self.chunk_entity_relation_graph.get_all_labels()
logger.debug( for node_label in all_labels:
f"Entity {node_label} marked for deletion - no remaining sources" node_data = await self.chunk_entity_relation_graph.get_node(node_label)
) if node_data and "source_id" in node_data:
elif remaining_sources != sources: # Split source_id using GRAPH_FIELD_SEP
# Entity needs to be rebuilt from remaining chunks sources = set(node_data["source_id"].split(GRAPH_FIELD_SEP))
entities_to_rebuild[node_label] = remaining_sources remaining_sources = sources - chunk_ids
logger.debug(
f"Entity {node_label} will be rebuilt from {len(remaining_sources)} remaining chunks"
)
# Process relationships if not remaining_sources:
for node_label in all_labels: entities_to_delete.add(node_label)
node_edges = await self.chunk_entity_relation_graph.get_node_edges( logger.debug(
node_label f"Entity {node_label} marked for deletion - no remaining sources"
) )
if node_edges: elif remaining_sources != sources:
for src, tgt in node_edges: # Entity needs to be rebuilt from remaining chunks
edge_data = await self.chunk_entity_relation_graph.get_edge( entities_to_rebuild[node_label] = remaining_sources
src, tgt logger.debug(
) f"Entity {node_label} will be rebuilt from {len(remaining_sources)} remaining chunks"
if edge_data and "source_id" in edge_data: )
# Split source_id using GRAPH_FIELD_SEP
sources = set(edge_data["source_id"].split(GRAPH_FIELD_SEP))
remaining_sources = sources - chunk_ids
if not remaining_sources: # Process relationships
relationships_to_delete.add((src, tgt)) for node_label in all_labels:
logger.debug( node_edges = await self.chunk_entity_relation_graph.get_node_edges(
f"Relationship {src}-{tgt} marked for deletion - no remaining sources" node_label
)
elif remaining_sources != sources:
# Relationship needs to be rebuilt from remaining chunks
relationships_to_rebuild[(src, tgt)] = remaining_sources
logger.debug(
f"Relationship {src}-{tgt} will be rebuilt from {len(remaining_sources)} remaining chunks"
)
# 5. Delete chunks from storage
if chunk_ids:
await self.chunks_vdb.delete(chunk_ids)
await self.text_chunks.delete(chunk_ids)
logger.info(f"Deleted {len(chunk_ids)} chunks from storage")
# 6. Delete entities that have no remaining sources
if entities_to_delete:
# Delete from vector database
entity_vdb_ids = [
compute_mdhash_id(entity, prefix="ent-")
for entity in entities_to_delete
]
await self.entities_vdb.delete(entity_vdb_ids)
# Delete from graph
await self.chunk_entity_relation_graph.remove_nodes(
list(entities_to_delete)
)
logger.info(f"Deleted {len(entities_to_delete)} entities")
# 7. Delete relationships that have no remaining sources
if relationships_to_delete:
# Delete from vector database
rel_ids_to_delete = []
for src, tgt in relationships_to_delete:
rel_ids_to_delete.extend(
[
compute_mdhash_id(src + tgt, prefix="rel-"),
compute_mdhash_id(tgt + src, prefix="rel-"),
]
) )
await self.relationships_vdb.delete(rel_ids_to_delete) if node_edges:
for src, tgt in node_edges:
edge_data = await self.chunk_entity_relation_graph.get_edge(
src, tgt
)
if edge_data and "source_id" in edge_data:
# Split source_id using GRAPH_FIELD_SEP
sources = set(edge_data["source_id"].split(GRAPH_FIELD_SEP))
remaining_sources = sources - chunk_ids
# Delete from graph if not remaining_sources:
await self.chunk_entity_relation_graph.remove_edges( relationships_to_delete.add((src, tgt))
list(relationships_to_delete) logger.debug(
) f"Relationship {src}-{tgt} marked for deletion - no remaining sources"
logger.info(f"Deleted {len(relationships_to_delete)} relationships") )
elif remaining_sources != sources:
# Relationship needs to be rebuilt from remaining chunks
relationships_to_rebuild[(src, tgt)] = remaining_sources
logger.debug(
f"Relationship {src}-{tgt} will be rebuilt from {len(remaining_sources)} remaining chunks"
)
# 8. **OPTIMIZATION 2**: Rebuild entities and relationships from remaining chunks # 5. Delete chunks from storage
if entities_to_rebuild or relationships_to_rebuild: if chunk_ids:
logger.info( await self.chunks_vdb.delete(chunk_ids)
f"Rebuilding {len(entities_to_rebuild)} entities and {len(relationships_to_rebuild)} relationships..." await self.text_chunks.delete(chunk_ids)
) logger.info(f"Deleted {len(chunk_ids)} chunks from storage")
await _rebuild_knowledge_from_chunks(
entities_to_rebuild=entities_to_rebuild, # 6. Delete entities that have no remaining sources
relationships_to_rebuild=relationships_to_rebuild, if entities_to_delete:
knowledge_graph_inst=self.chunk_entity_relation_graph, # Delete from vector database
entities_vdb=self.entities_vdb, entity_vdb_ids = [
relationships_vdb=self.relationships_vdb, compute_mdhash_id(entity, prefix="ent-")
text_chunks=self.text_chunks, for entity in entities_to_delete
llm_response_cache=self.llm_response_cache, ]
global_config=asdict(self), await self.entities_vdb.delete(entity_vdb_ids)
)
# Delete from graph
await self.chunk_entity_relation_graph.remove_nodes(
list(entities_to_delete)
)
logger.info(f"Deleted {len(entities_to_delete)} entities")
# 7. Delete relationships that have no remaining sources
if relationships_to_delete:
# Delete from vector database
rel_ids_to_delete = []
for src, tgt in relationships_to_delete:
rel_ids_to_delete.extend(
[
compute_mdhash_id(src + tgt, prefix="rel-"),
compute_mdhash_id(tgt + src, prefix="rel-"),
]
)
await self.relationships_vdb.delete(rel_ids_to_delete)
# Delete from graph
await self.chunk_entity_relation_graph.remove_edges(
list(relationships_to_delete)
)
logger.info(f"Deleted {len(relationships_to_delete)} relationships")
# 8. **OPTIMIZATION 2**: Rebuild entities and relationships from remaining chunks
if entities_to_rebuild or relationships_to_rebuild:
logger.info(
f"Rebuilding {len(entities_to_rebuild)} entities and {len(relationships_to_rebuild)} relationships..."
)
await _rebuild_knowledge_from_chunks(
entities_to_rebuild=entities_to_rebuild,
relationships_to_rebuild=relationships_to_rebuild,
knowledge_graph_inst=self.chunk_entity_relation_graph,
entities_vdb=self.entities_vdb,
relationships_vdb=self.relationships_vdb,
text_chunks=self.text_chunks,
llm_response_cache=self.llm_response_cache,
global_config=asdict(self),
)
# 9. Delete original document and status # 9. Delete original document and status
await self.full_docs.delete([doc_id]) await self.full_docs.delete([doc_id])
@@ -1857,12 +1862,15 @@ class LightRAG:
""" """
from .utils_graph import adelete_by_entity from .utils_graph import adelete_by_entity
return await adelete_by_entity( # Use graph database lock to ensure atomic merges and updates
self.chunk_entity_relation_graph, graph_db_lock = get_graph_db_lock(enable_logging=False)
self.entities_vdb, async with graph_db_lock:
self.relationships_vdb, return await adelete_by_entity(
entity_name, self.chunk_entity_relation_graph,
) self.entities_vdb,
self.relationships_vdb,
entity_name,
)
def delete_by_entity(self, entity_name: str) -> None: def delete_by_entity(self, entity_name: str) -> None:
loop = always_get_an_event_loop() loop = always_get_an_event_loop()
@@ -1877,12 +1885,15 @@ class LightRAG:
""" """
from .utils_graph import adelete_by_relation from .utils_graph import adelete_by_relation
return await adelete_by_relation( # Use graph database lock to ensure atomic merges and updates
self.chunk_entity_relation_graph, graph_db_lock = get_graph_db_lock(enable_logging=False)
self.relationships_vdb, async with graph_db_lock:
source_entity, return await adelete_by_relation(
target_entity, self.chunk_entity_relation_graph,
) self.relationships_vdb,
source_entity,
target_entity,
)
def delete_by_relation(self, source_entity: str, target_entity: str) -> None: def delete_by_relation(self, source_entity: str, target_entity: str) -> None:
loop = always_get_an_event_loop() loop = always_get_an_event_loop()