From 19014c647113a1b0a2d97ffad1b7826124974877 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 10 Sep 2025 17:06:57 +0800 Subject: [PATCH] feat: enhance entity/relationship merging with description length comparison - Implement description length comparison in gleaning merge logic (extract_entities) - Apply same logic to knowledge graph reconstruction (_rebuild_knowledge_from_chunks) - Prioritize entities/relationships with longer descriptions for better quality - Use list() instead of extend() for performance optimization when replacing --- lightrag/operate.py | 93 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 23 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index fa715762..c2ad079b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -538,21 +538,51 @@ async def _rebuild_knowledge_from_chunks( ) # Merge entities and relationships from this extraction result - # Only keep the first occurrence of each entity_name in the same chunk_id + # Compare description lengths and keep the better version for the same chunk_id for entity_name, entity_list in entities.items(): - if ( - entity_name not in chunk_entities[chunk_id] - or len(chunk_entities[chunk_id][entity_name]) == 0 - ): + if entity_name not in chunk_entities[chunk_id]: + # New entity for this chunk_id chunk_entities[chunk_id][entity_name].extend(entity_list) + elif len(chunk_entities[chunk_id][entity_name]) == 0: + # Empty list, add the new entities + chunk_entities[chunk_id][entity_name].extend(entity_list) + else: + # Compare description lengths and keep the better one + existing_desc_len = len( + chunk_entities[chunk_id][entity_name][0].get( + "description", "" + ) + or "" + ) + new_desc_len = len(entity_list[0].get("description", "") or "") - # Only keep the first occurrence of each rel_key in the same chunk_id + if new_desc_len > existing_desc_len: + # Replace with the new entity that has longer description + chunk_entities[chunk_id][entity_name] = list(entity_list) + # Otherwise keep existing version + + # Compare description lengths and keep the better version for the same chunk_id for rel_key, rel_list in relationships.items(): - if ( - rel_key not in chunk_relationships[chunk_id] - or len(chunk_relationships[chunk_id][rel_key]) == 0 - ): + if rel_key not in chunk_relationships[chunk_id]: + # New relationship for this chunk_id chunk_relationships[chunk_id][rel_key].extend(rel_list) + elif len(chunk_relationships[chunk_id][rel_key]) == 0: + # Empty list, add the new relationships + chunk_relationships[chunk_id][rel_key].extend(rel_list) + else: + # Compare description lengths and keep the better one + existing_desc_len = len( + chunk_relationships[chunk_id][rel_key][0].get( + "description", "" + ) + or "" + ) + new_desc_len = len(rel_list[0].get("description", "") or "") + + if new_desc_len > existing_desc_len: + # Replace with the new relationship that has longer description + chunk_relationships[chunk_id][rel_key] = list(rel_list) + # Otherwise keep existing version except Exception as e: status_message = ( @@ -2014,19 +2044,36 @@ async def extract_entities( completion_delimiter=context_base["completion_delimiter"], ) - # Merge results - only add entities and edges with new names - for entity_name, entities in glean_nodes.items(): - if ( - entity_name not in maybe_nodes - ): # Only accetp entities with new name in gleaning stage - maybe_nodes[entity_name] = [] # Explicitly create the list - maybe_nodes[entity_name].extend(entities) - for edge_key, edges in glean_edges.items(): - if ( - edge_key not in maybe_edges - ): # Only accetp edges with new name in gleaning stage - maybe_edges[edge_key] = [] # Explicitly create the list - maybe_edges[edge_key].extend(edges) + # Merge results - compare description lengths to choose better version + for entity_name, glean_entities in glean_nodes.items(): + if entity_name in maybe_nodes: + # Compare description lengths and keep the better one + original_desc_len = len( + maybe_nodes[entity_name][0].get("description", "") or "" + ) + glean_desc_len = len(glean_entities[0].get("description", "") or "") + + if glean_desc_len > original_desc_len: + maybe_nodes[entity_name] = list(glean_entities) + # Otherwise keep original version + else: + # New entity from gleaning stage + maybe_nodes[entity_name] = list(glean_entities) + + for edge_key, glean_edges in glean_edges.items(): + if edge_key in maybe_edges: + # Compare description lengths and keep the better one + original_desc_len = len( + maybe_edges[edge_key][0].get("description", "") or "" + ) + glean_desc_len = len(glean_edges[0].get("description", "") or "") + + if glean_desc_len > original_desc_len: + maybe_edges[edge_key] = list(glean_edges) + # Otherwise keep original version + else: + # New edge from gleaning stage + maybe_edges[edge_key] = list(glean_edges) # Batch update chunk's llm_cache_list with all collected cache keys if cache_keys_collector and text_chunks_storage: