Update operate.py

This commit is contained in:
zrguo 2025-06-09 19:47:29 +08:00
parent 9a71a10bc0
commit 3a9494ab60

View file

@ -453,9 +453,7 @@ async def _rebuild_single_entity(
# Helper function to update entity in both graph and vector storage # Helper function to update entity in both graph and vector storage
async def _update_entity_storage( async def _update_entity_storage(
final_description: str, final_description: str, entity_type: str, file_paths: set[str]
entity_type: str,
file_paths: set[str]
): ):
# Update entity in graph storage # Update entity in graph storage
updated_entity_data = { updated_entity_data = {
@ -463,7 +461,9 @@ async def _rebuild_single_entity(
"description": final_description, "description": final_description,
"entity_type": entity_type, "entity_type": entity_type,
"source_id": GRAPH_FIELD_SEP.join(chunk_ids), "source_id": GRAPH_FIELD_SEP.join(chunk_ids),
"file_path": GRAPH_FIELD_SEP.join(file_paths) if file_paths else current_entity.get("file_path", "unknown_source"), "file_path": GRAPH_FIELD_SEP.join(file_paths)
if file_paths
else current_entity.get("file_path", "unknown_source"),
} }
await knowledge_graph_inst.upsert_node(entity_name, updated_entity_data) await knowledge_graph_inst.upsert_node(entity_name, updated_entity_data)
@ -474,7 +474,9 @@ async def _rebuild_single_entity(
try: try:
await entities_vdb.delete([entity_vdb_id]) await entities_vdb.delete([entity_vdb_id])
except Exception as e: except Exception as e:
logger.debug(f"Could not delete old entity vector record {entity_vdb_id}: {e}") logger.debug(
f"Could not delete old entity vector record {entity_vdb_id}: {e}"
)
# Insert new vector record # Insert new vector record
entity_content = f"{entity_name}\n{final_description}" entity_content = f"{entity_name}\n{final_description}"
@ -510,36 +512,38 @@ async def _rebuild_single_entity(
all_entity_data.extend(chunk_entities[chunk_id][entity_name]) all_entity_data.extend(chunk_entities[chunk_id][entity_name])
if not all_entity_data: if not all_entity_data:
logger.warning(f"No cached entity data found for {entity_name}, trying to rebuild from relationships") logger.warning(
f"No cached entity data found for {entity_name}, trying to rebuild from relationships"
)
# Get all edges connected to this entity # Get all edges connected to this entity
edges = await knowledge_graph_inst.get_node_edges(entity_name) edges = await knowledge_graph_inst.get_node_edges(entity_name)
if not edges: if not edges:
logger.warning(f"No relationships found for entity {entity_name}") logger.warning(f"No relationships found for entity {entity_name}")
return return
# Collect relationship data to extract entity information # Collect relationship data to extract entity information
relationship_descriptions = [] relationship_descriptions = []
file_paths = set() file_paths = set()
# Get edge data for all connected relationships # Get edge data for all connected relationships
for src_id, tgt_id in edges: for src_id, tgt_id in edges:
edge_data = await knowledge_graph_inst.get_edge(src_id, tgt_id) edge_data = await knowledge_graph_inst.get_edge(src_id, tgt_id)
if edge_data: if edge_data:
if edge_data.get("description"): if edge_data.get("description"):
relationship_descriptions.append(edge_data["description"]) relationship_descriptions.append(edge_data["description"])
if edge_data.get("file_path"): if edge_data.get("file_path"):
edge_file_paths = edge_data["file_path"].split(GRAPH_FIELD_SEP) edge_file_paths = edge_data["file_path"].split(GRAPH_FIELD_SEP)
file_paths.update(edge_file_paths) file_paths.update(edge_file_paths)
# Generate description from relationships or fallback to current # Generate description from relationships or fallback to current
if relationship_descriptions: if relationship_descriptions:
combined_description = GRAPH_FIELD_SEP.join(relationship_descriptions) combined_description = GRAPH_FIELD_SEP.join(relationship_descriptions)
final_description = await _generate_final_description(combined_description) final_description = await _generate_final_description(combined_description)
else: else:
final_description = current_entity.get("description", "") final_description = current_entity.get("description", "")
entity_type = current_entity.get("entity_type", "UNKNOWN") entity_type = current_entity.get("entity_type", "UNKNOWN")
await _update_entity_storage(final_description, entity_type, file_paths) await _update_entity_storage(final_description, entity_type, file_paths)
return return
@ -635,11 +639,12 @@ async def _rebuild_single_relationship(
if keywords if keywords
else current_relationship.get("keywords", "") else current_relationship.get("keywords", "")
) )
avg_weight = ( # weight = (
sum(weights) / len(weights) # sum(weights) / len(weights)
if weights # if weights
else current_relationship.get("weight", 1.0) # else current_relationship.get("weight", 1.0)
) # )
weight = sum(weights) if weights else current_relationship.get("weight", 1.0)
# Use summary if description is too long # Use summary if description is too long
if len(combined_description) > global_config["summary_to_max_tokens"]: if len(combined_description) > global_config["summary_to_max_tokens"]:
@ -657,7 +662,7 @@ async def _rebuild_single_relationship(
**current_relationship, **current_relationship,
"description": final_description, "description": final_description,
"keywords": combined_keywords, "keywords": combined_keywords,
"weight": avg_weight, "weight": weight,
"source_id": GRAPH_FIELD_SEP.join(chunk_ids), "source_id": GRAPH_FIELD_SEP.join(chunk_ids),
"file_path": GRAPH_FIELD_SEP.join(file_paths) "file_path": GRAPH_FIELD_SEP.join(file_paths)
if file_paths if file_paths
@ -688,7 +693,7 @@ async def _rebuild_single_relationship(
"content": rel_content, "content": rel_content,
"keywords": combined_keywords, "keywords": combined_keywords,
"description": final_description, "description": final_description,
"weight": avg_weight, "weight": weight,
"file_path": updated_relationship_data["file_path"], "file_path": updated_relationship_data["file_path"],
} }
} }
@ -838,8 +843,7 @@ async def _merge_edges_then_upsert(
) )
# Process edges_data with None checks # Process edges_data with None checks
all_weights = [dp["weight"] for dp in edges_data] + already_weights weight = sum([dp["weight"] for dp in edges_data] + already_weights)
weight = sum(all_weights) / len(all_weights)
description = GRAPH_FIELD_SEP.join( description = GRAPH_FIELD_SEP.join(
sorted( sorted(
set( set(