feat: Ensure thread safety for graph write operations
Add a lock to delete, adelete_by_entity, and adelete_by_relation methods to prevent race conditions and ensure data consistency during concurrent modifications to the knowledge graph.
This commit is contained in:
parent
c947b20bb1
commit
9fae0eadff
1 changed files with 120 additions and 109 deletions
|
|
@ -35,6 +35,7 @@ from lightrag.kg import (
|
||||||
from lightrag.kg.shared_storage import (
|
from lightrag.kg.shared_storage import (
|
||||||
get_namespace_data,
|
get_namespace_data,
|
||||||
get_pipeline_status_lock,
|
get_pipeline_status_lock,
|
||||||
|
get_graph_db_lock,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
|
|
@ -1728,109 +1729,113 @@ class LightRAG:
|
||||||
relationships_to_delete = set()
|
relationships_to_delete = set()
|
||||||
relationships_to_rebuild = {} # (src, tgt) -> remaining_chunk_ids
|
relationships_to_rebuild = {} # (src, tgt) -> remaining_chunk_ids
|
||||||
|
|
||||||
# Process entities
|
# Use graph database lock to ensure atomic merges and updates
|
||||||
all_labels = await self.chunk_entity_relation_graph.get_all_labels()
|
graph_db_lock = get_graph_db_lock(enable_logging=False)
|
||||||
for node_label in all_labels:
|
async with graph_db_lock:
|
||||||
node_data = await self.chunk_entity_relation_graph.get_node(node_label)
|
|
||||||
if node_data and "source_id" in node_data:
|
|
||||||
# Split source_id using GRAPH_FIELD_SEP
|
|
||||||
sources = set(node_data["source_id"].split(GRAPH_FIELD_SEP))
|
|
||||||
remaining_sources = sources - chunk_ids
|
|
||||||
|
|
||||||
if not remaining_sources:
|
# Process entities
|
||||||
entities_to_delete.add(node_label)
|
all_labels = await self.chunk_entity_relation_graph.get_all_labels()
|
||||||
logger.debug(
|
for node_label in all_labels:
|
||||||
f"Entity {node_label} marked for deletion - no remaining sources"
|
node_data = await self.chunk_entity_relation_graph.get_node(node_label)
|
||||||
)
|
if node_data and "source_id" in node_data:
|
||||||
elif remaining_sources != sources:
|
# Split source_id using GRAPH_FIELD_SEP
|
||||||
# Entity needs to be rebuilt from remaining chunks
|
sources = set(node_data["source_id"].split(GRAPH_FIELD_SEP))
|
||||||
entities_to_rebuild[node_label] = remaining_sources
|
remaining_sources = sources - chunk_ids
|
||||||
logger.debug(
|
|
||||||
f"Entity {node_label} will be rebuilt from {len(remaining_sources)} remaining chunks"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Process relationships
|
if not remaining_sources:
|
||||||
for node_label in all_labels:
|
entities_to_delete.add(node_label)
|
||||||
node_edges = await self.chunk_entity_relation_graph.get_node_edges(
|
logger.debug(
|
||||||
node_label
|
f"Entity {node_label} marked for deletion - no remaining sources"
|
||||||
)
|
)
|
||||||
if node_edges:
|
elif remaining_sources != sources:
|
||||||
for src, tgt in node_edges:
|
# Entity needs to be rebuilt from remaining chunks
|
||||||
edge_data = await self.chunk_entity_relation_graph.get_edge(
|
entities_to_rebuild[node_label] = remaining_sources
|
||||||
src, tgt
|
logger.debug(
|
||||||
)
|
f"Entity {node_label} will be rebuilt from {len(remaining_sources)} remaining chunks"
|
||||||
if edge_data and "source_id" in edge_data:
|
)
|
||||||
# Split source_id using GRAPH_FIELD_SEP
|
|
||||||
sources = set(edge_data["source_id"].split(GRAPH_FIELD_SEP))
|
|
||||||
remaining_sources = sources - chunk_ids
|
|
||||||
|
|
||||||
if not remaining_sources:
|
# Process relationships
|
||||||
relationships_to_delete.add((src, tgt))
|
for node_label in all_labels:
|
||||||
logger.debug(
|
node_edges = await self.chunk_entity_relation_graph.get_node_edges(
|
||||||
f"Relationship {src}-{tgt} marked for deletion - no remaining sources"
|
node_label
|
||||||
)
|
|
||||||
elif remaining_sources != sources:
|
|
||||||
# Relationship needs to be rebuilt from remaining chunks
|
|
||||||
relationships_to_rebuild[(src, tgt)] = remaining_sources
|
|
||||||
logger.debug(
|
|
||||||
f"Relationship {src}-{tgt} will be rebuilt from {len(remaining_sources)} remaining chunks"
|
|
||||||
)
|
|
||||||
|
|
||||||
# 5. Delete chunks from storage
|
|
||||||
if chunk_ids:
|
|
||||||
await self.chunks_vdb.delete(chunk_ids)
|
|
||||||
await self.text_chunks.delete(chunk_ids)
|
|
||||||
logger.info(f"Deleted {len(chunk_ids)} chunks from storage")
|
|
||||||
|
|
||||||
# 6. Delete entities that have no remaining sources
|
|
||||||
if entities_to_delete:
|
|
||||||
# Delete from vector database
|
|
||||||
entity_vdb_ids = [
|
|
||||||
compute_mdhash_id(entity, prefix="ent-")
|
|
||||||
for entity in entities_to_delete
|
|
||||||
]
|
|
||||||
await self.entities_vdb.delete(entity_vdb_ids)
|
|
||||||
|
|
||||||
# Delete from graph
|
|
||||||
await self.chunk_entity_relation_graph.remove_nodes(
|
|
||||||
list(entities_to_delete)
|
|
||||||
)
|
|
||||||
logger.info(f"Deleted {len(entities_to_delete)} entities")
|
|
||||||
|
|
||||||
# 7. Delete relationships that have no remaining sources
|
|
||||||
if relationships_to_delete:
|
|
||||||
# Delete from vector database
|
|
||||||
rel_ids_to_delete = []
|
|
||||||
for src, tgt in relationships_to_delete:
|
|
||||||
rel_ids_to_delete.extend(
|
|
||||||
[
|
|
||||||
compute_mdhash_id(src + tgt, prefix="rel-"),
|
|
||||||
compute_mdhash_id(tgt + src, prefix="rel-"),
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
await self.relationships_vdb.delete(rel_ids_to_delete)
|
if node_edges:
|
||||||
|
for src, tgt in node_edges:
|
||||||
|
edge_data = await self.chunk_entity_relation_graph.get_edge(
|
||||||
|
src, tgt
|
||||||
|
)
|
||||||
|
if edge_data and "source_id" in edge_data:
|
||||||
|
# Split source_id using GRAPH_FIELD_SEP
|
||||||
|
sources = set(edge_data["source_id"].split(GRAPH_FIELD_SEP))
|
||||||
|
remaining_sources = sources - chunk_ids
|
||||||
|
|
||||||
# Delete from graph
|
if not remaining_sources:
|
||||||
await self.chunk_entity_relation_graph.remove_edges(
|
relationships_to_delete.add((src, tgt))
|
||||||
list(relationships_to_delete)
|
logger.debug(
|
||||||
)
|
f"Relationship {src}-{tgt} marked for deletion - no remaining sources"
|
||||||
logger.info(f"Deleted {len(relationships_to_delete)} relationships")
|
)
|
||||||
|
elif remaining_sources != sources:
|
||||||
|
# Relationship needs to be rebuilt from remaining chunks
|
||||||
|
relationships_to_rebuild[(src, tgt)] = remaining_sources
|
||||||
|
logger.debug(
|
||||||
|
f"Relationship {src}-{tgt} will be rebuilt from {len(remaining_sources)} remaining chunks"
|
||||||
|
)
|
||||||
|
|
||||||
# 8. **OPTIMIZATION 2**: Rebuild entities and relationships from remaining chunks
|
# 5. Delete chunks from storage
|
||||||
if entities_to_rebuild or relationships_to_rebuild:
|
if chunk_ids:
|
||||||
logger.info(
|
await self.chunks_vdb.delete(chunk_ids)
|
||||||
f"Rebuilding {len(entities_to_rebuild)} entities and {len(relationships_to_rebuild)} relationships..."
|
await self.text_chunks.delete(chunk_ids)
|
||||||
)
|
logger.info(f"Deleted {len(chunk_ids)} chunks from storage")
|
||||||
await _rebuild_knowledge_from_chunks(
|
|
||||||
entities_to_rebuild=entities_to_rebuild,
|
# 6. Delete entities that have no remaining sources
|
||||||
relationships_to_rebuild=relationships_to_rebuild,
|
if entities_to_delete:
|
||||||
knowledge_graph_inst=self.chunk_entity_relation_graph,
|
# Delete from vector database
|
||||||
entities_vdb=self.entities_vdb,
|
entity_vdb_ids = [
|
||||||
relationships_vdb=self.relationships_vdb,
|
compute_mdhash_id(entity, prefix="ent-")
|
||||||
text_chunks=self.text_chunks,
|
for entity in entities_to_delete
|
||||||
llm_response_cache=self.llm_response_cache,
|
]
|
||||||
global_config=asdict(self),
|
await self.entities_vdb.delete(entity_vdb_ids)
|
||||||
)
|
|
||||||
|
# Delete from graph
|
||||||
|
await self.chunk_entity_relation_graph.remove_nodes(
|
||||||
|
list(entities_to_delete)
|
||||||
|
)
|
||||||
|
logger.info(f"Deleted {len(entities_to_delete)} entities")
|
||||||
|
|
||||||
|
# 7. Delete relationships that have no remaining sources
|
||||||
|
if relationships_to_delete:
|
||||||
|
# Delete from vector database
|
||||||
|
rel_ids_to_delete = []
|
||||||
|
for src, tgt in relationships_to_delete:
|
||||||
|
rel_ids_to_delete.extend(
|
||||||
|
[
|
||||||
|
compute_mdhash_id(src + tgt, prefix="rel-"),
|
||||||
|
compute_mdhash_id(tgt + src, prefix="rel-"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
await self.relationships_vdb.delete(rel_ids_to_delete)
|
||||||
|
|
||||||
|
# Delete from graph
|
||||||
|
await self.chunk_entity_relation_graph.remove_edges(
|
||||||
|
list(relationships_to_delete)
|
||||||
|
)
|
||||||
|
logger.info(f"Deleted {len(relationships_to_delete)} relationships")
|
||||||
|
|
||||||
|
# 8. **OPTIMIZATION 2**: Rebuild entities and relationships from remaining chunks
|
||||||
|
if entities_to_rebuild or relationships_to_rebuild:
|
||||||
|
logger.info(
|
||||||
|
f"Rebuilding {len(entities_to_rebuild)} entities and {len(relationships_to_rebuild)} relationships..."
|
||||||
|
)
|
||||||
|
await _rebuild_knowledge_from_chunks(
|
||||||
|
entities_to_rebuild=entities_to_rebuild,
|
||||||
|
relationships_to_rebuild=relationships_to_rebuild,
|
||||||
|
knowledge_graph_inst=self.chunk_entity_relation_graph,
|
||||||
|
entities_vdb=self.entities_vdb,
|
||||||
|
relationships_vdb=self.relationships_vdb,
|
||||||
|
text_chunks=self.text_chunks,
|
||||||
|
llm_response_cache=self.llm_response_cache,
|
||||||
|
global_config=asdict(self),
|
||||||
|
)
|
||||||
|
|
||||||
# 9. Delete original document and status
|
# 9. Delete original document and status
|
||||||
await self.full_docs.delete([doc_id])
|
await self.full_docs.delete([doc_id])
|
||||||
|
|
@ -1857,12 +1862,15 @@ class LightRAG:
|
||||||
"""
|
"""
|
||||||
from .utils_graph import adelete_by_entity
|
from .utils_graph import adelete_by_entity
|
||||||
|
|
||||||
return await adelete_by_entity(
|
# Use graph database lock to ensure atomic merges and updates
|
||||||
self.chunk_entity_relation_graph,
|
graph_db_lock = get_graph_db_lock(enable_logging=False)
|
||||||
self.entities_vdb,
|
async with graph_db_lock:
|
||||||
self.relationships_vdb,
|
return await adelete_by_entity(
|
||||||
entity_name,
|
self.chunk_entity_relation_graph,
|
||||||
)
|
self.entities_vdb,
|
||||||
|
self.relationships_vdb,
|
||||||
|
entity_name,
|
||||||
|
)
|
||||||
|
|
||||||
def delete_by_entity(self, entity_name: str) -> None:
|
def delete_by_entity(self, entity_name: str) -> None:
|
||||||
loop = always_get_an_event_loop()
|
loop = always_get_an_event_loop()
|
||||||
|
|
@ -1877,12 +1885,15 @@ class LightRAG:
|
||||||
"""
|
"""
|
||||||
from .utils_graph import adelete_by_relation
|
from .utils_graph import adelete_by_relation
|
||||||
|
|
||||||
return await adelete_by_relation(
|
# Use graph database lock to ensure atomic merges and updates
|
||||||
self.chunk_entity_relation_graph,
|
graph_db_lock = get_graph_db_lock(enable_logging=False)
|
||||||
self.relationships_vdb,
|
async with graph_db_lock:
|
||||||
source_entity,
|
return await adelete_by_relation(
|
||||||
target_entity,
|
self.chunk_entity_relation_graph,
|
||||||
)
|
self.relationships_vdb,
|
||||||
|
source_entity,
|
||||||
|
target_entity,
|
||||||
|
)
|
||||||
|
|
||||||
def delete_by_relation(self, source_entity: str, target_entity: str) -> None:
|
def delete_by_relation(self, source_entity: str, target_entity: str) -> None:
|
||||||
loop = always_get_an_event_loop()
|
loop = always_get_an_event_loop()
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue