cherry-pick 904b1f46
parent de3f5f10c2, commit 77a715f61b
2 changed files with 319 additions and 318 deletions
@@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000
# Default values for extraction settings
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
DEFAULT_MAX_GLEANING = 1
DEFAULT_ENTITY_NAME_MAX_LENGTH = 256

# Number of description fragments to trigger LLM summary
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
@@ -57,6 +57,8 @@ from lightrag.constants import (
    SOURCE_IDS_LIMIT_METHOD_KEEP,
    SOURCE_IDS_LIMIT_METHOD_FIFO,
    DEFAULT_FILE_PATH_MORE_PLACEHOLDER,
    DEFAULT_MAX_FILE_PATHS,
    DEFAULT_ENTITY_NAME_MAX_LENGTH,
)
from lightrag.kg.shared_storage import get_storage_keyed_lock
import time
@@ -68,6 +70,27 @@ from dotenv import load_dotenv
load_dotenv(dotenv_path=".env", override=False)


def _truncate_entity_identifier(
    identifier: str, limit: int, chunk_key: str, identifier_role: str
) -> str:
    """Truncate entity identifiers that exceed the configured length limit."""

    if len(identifier) <= limit:
        return identifier

    display_value = identifier[:limit]
    preview = identifier[:20] # Show first 20 characters as preview
    logger.warning(
        "%s: %s exceeded %d characters (len: %d, preview: '%s...')",
        chunk_key,
        identifier_role,
        limit,
        len(identifier),
        preview,
    )
    return display_value


def chunking_by_token_size(
    tokenizer: Tokenizer,
    content: str,
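For orientation, here is a minimal standalone sketch of what the new helper does, using the standard `logging` module in place of the project logger and a shortened name; it is illustrative only, not the committed implementation:

```python
import logging

logger = logging.getLogger(__name__)


def truncate_identifier(identifier: str, limit: int, chunk_key: str, role: str) -> str:
    """Return the identifier unchanged if it fits, otherwise a truncated copy, logging a warning."""
    if len(identifier) <= limit:
        return identifier
    logger.warning(
        "%s: %s exceeded %d characters (len: %d)", chunk_key, role, limit, len(identifier)
    )
    return identifier[:limit]


# Example: a 300-character name is cut down to the 256-character default added above.
name = "X" * 300
assert len(truncate_identifier(name, 256, "chunk-1", "Entity name")) == 256
```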
@@ -951,7 +974,14 @@ async def _process_extraction_result(
            record_attributes, chunk_key, timestamp, file_path
        )
        if entity_data is not None:
            maybe_nodes[entity_data["entity_name"]].append(entity_data)
            truncated_name = _truncate_entity_identifier(
                entity_data["entity_name"],
                DEFAULT_ENTITY_NAME_MAX_LENGTH,
                chunk_key,
                "Entity name",
            )
            entity_data["entity_name"] = truncated_name
            maybe_nodes[truncated_name].append(entity_data)
            continue

        # Try to parse as relationship
@@ -959,9 +989,21 @@
            record_attributes, chunk_key, timestamp, file_path
        )
        if relationship_data is not None:
            maybe_edges[
                (relationship_data["src_id"], relationship_data["tgt_id"])
            ].append(relationship_data)
            truncated_source = _truncate_entity_identifier(
                relationship_data["src_id"],
                DEFAULT_ENTITY_NAME_MAX_LENGTH,
                chunk_key,
                "Relationship source entity",
            )
            truncated_target = _truncate_entity_identifier(
                relationship_data["tgt_id"],
                DEFAULT_ENTITY_NAME_MAX_LENGTH,
                chunk_key,
                "Relationship target entity",
            )
            relationship_data["src_id"] = truncated_source
            relationship_data["tgt_id"] = truncated_target
            maybe_edges[(truncated_source, truncated_target)].append(relationship_data)

    return dict(maybe_nodes), dict(maybe_edges)
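Note that relationship endpoints go through the same truncation as entity names, so edge keys keep lining up with node keys after long identifiers are cut. A toy illustration (the 8-character limit and the names are invented for this example):

```python
LIMIT = 8  # assumed tiny limit, purely for illustration


def clip(name: str) -> str:
    return name if len(name) <= LIMIT else name[:LIMIT]


long_name = "VeryLongEntityName"
short_name = "Other"

maybe_nodes = {clip(long_name): [{"entity_name": clip(long_name)}]}
maybe_edges = {(clip(long_name), clip(short_name)): [{"src_id": clip(long_name), "tgt_id": clip(short_name)}]}

# The edge endpoint uses the same truncated key as the node entry, so later merging can match them.
assert clip(long_name) in maybe_nodes
assert (clip(long_name), clip(short_name)) in maybe_edges
```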
@@ -1025,7 +1067,7 @@ async def _rebuild_single_entity(
    async def _update_entity_storage(
        final_description: str,
        entity_type: str,
        file_paths: set[str],
        file_paths: list[str],
        source_chunk_ids: list[str],
        truncation_info: str = "",
    ):
@@ -1194,8 +1236,6 @@ async def _rebuild_single_entity(
            f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
        )

    file_paths = set(file_paths_list)

    # Remove duplicates while preserving order
    description_list = list(dict.fromkeys(descriptions))
    entity_types = list(dict.fromkeys(entity_types))
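The `list(dict.fromkeys(...))` idiom used here relies on Python dictionaries preserving insertion order (guaranteed since Python 3.7): duplicates are dropped and the first occurrence keeps its position. For example:

```python
descriptions = ["alpha", "beta", "alpha", "gamma", "beta"]
print(list(dict.fromkeys(descriptions)))  # ['alpha', 'beta', 'gamma']
```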
@@ -1230,7 +1270,7 @@ async def _rebuild_single_entity(
    await _update_entity_storage(
        final_description,
        entity_type,
        file_paths,
        file_paths_list,
        limited_chunk_ids,
        truncation_info,
    )
@@ -1353,8 +1393,6 @@ async def _rebuild_single_relationship(
            f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
        )

    file_paths = set(file_paths_list)

    # Remove duplicates while preserving order
    description_list = list(dict.fromkeys(descriptions))
    keywords = list(dict.fromkeys(keywords))
@@ -1397,8 +1435,8 @@
        "keywords": combined_keywords,
        "weight": weight,
        "source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids),
        "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp])
        if file_paths
        "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths_list if fp])
        if file_paths_list
        else current_relationship.get("file_path", "unknown_source"),
        "truncate": truncation_info,
    }
@@ -1483,6 +1521,7 @@ async def _merge_nodes_then_upsert(
    already_description = []
    already_file_paths = []

    # 1. Get existing node data from knowledge graph
    already_node = await knowledge_graph_inst.get_node(entity_name)
    if already_node:
        already_entity_types.append(already_node["entity_type"])
@@ -1490,14 +1529,6 @@
        already_file_paths.extend(already_node["file_path"].split(GRAPH_FIELD_SEP))
        already_description.extend(already_node["description"].split(GRAPH_FIELD_SEP))

    entity_type = sorted(
        Counter(
            [dp["entity_type"] for dp in nodes_data] + already_entity_types
        ).items(),
        key=lambda x: x[1],
        reverse=True,
    )[0][0] # Get the entity type with the highest count

    new_source_ids = [dp["source_id"] for dp in nodes_data if dp.get("source_id")]

    existing_full_source_ids = []
@@ -1513,6 +1544,7 @@
        chunk_id for chunk_id in already_source_ids if chunk_id
    ]

    # 2. Merging new source ids with existing ones
    full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids)

    if entity_chunks_storage is not None and full_source_ids:
@@ -1525,6 +1557,7 @@
            }
        )

    # 3. Finalize source_id by applying source ids limit
    limit_method = global_config.get("source_ids_limit_method")
    max_source_limit = global_config.get("max_source_ids_per_entity")
    source_ids = apply_source_ids_limit(
@@ -1534,7 +1567,7 @@
        identifier=f"`{entity_name}`",
    )

    # Only apply filtering in KEEP(ignore new) mode
    # 4. Only keep nodes that are not filtered out by apply_source_ids_limit when limit_method is KEEP
    if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP:
        allowed_source_ids = set(source_ids)
        filtered_nodes = []
@@ -1549,18 +1582,40 @@ async def _merge_nodes_then_upsert(
                continue
            filtered_nodes.append(dp)
        nodes_data = filtered_nodes
    else:
        # In FIFO mode, keep all node descriptions - truncation happens at source_ids level only
    else:  # In FIFO mode, keep all nodes - truncation happens at source_ids level only
        nodes_data = list(nodes_data)

    skip_summary_due_to_limit = (
    # 5. Check if we need to skip summary due to source_ids limit
    if (
        limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP
        and len(existing_full_source_ids) >= max_source_limit
        and not nodes_data
        and already_description
    )
    ):
        if already_node:
            logger.info(
                f"Skipped `{entity_name}`: KEEP old chunks {already_source_ids}/{len(full_source_ids)}"
            )
            existing_node_data = dict(already_node)
            return existing_node_data
        else:
            logger.error(f"Internal Error: already_node missing for `{entity_name}`")
            raise ValueError(
                f"Internal Error: already_node missing for `{entity_name}`"
            )

    # Deduplicate by description, keeping first occurrence
    # 6.1 Finalize source_id
    source_id = GRAPH_FIELD_SEP.join(source_ids)

    # 6.2 Finalize entity type by highest count
    entity_type = sorted(
        Counter(
            [dp["entity_type"] for dp in nodes_data] + already_entity_types
        ).items(),
        key=lambda x: x[1],
        reverse=True,
    )[0][0]

    # 7. Deduplicate nodes by description, keeping first occurrence in the same document
    unique_nodes = {}
    for dp in nodes_data:
        desc = dp.get("description")
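The entity type is settled by a simple majority vote over all collected type labels. Here is the same `Counter`-and-sort pattern in isolation, with invented sample values:

```python
from collections import Counter

new_types = ["person", "organization", "person"]  # types from freshly extracted nodes
already_entity_types = ["person"]                 # types already stored on the node

# The most frequent label wins; ties resolve by whichever label Counter yields first.
entity_type = sorted(
    Counter(new_types + already_entity_types).items(),
    key=lambda x: x[1],
    reverse=True,
)[0][0]
print(entity_type)  # person
```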
@@ -1569,159 +1624,122 @@ async def _merge_nodes_then_upsert(
        if desc not in unique_nodes:
            unique_nodes[desc] = dp

    # Sort description by timestamp, then by description length (largest to smallest) when timestamps are the same
    # Sort description by timestamp, then by description length when timestamps are the same
    sorted_nodes = sorted(
        unique_nodes.values(),
        key=lambda x: (x.get("timestamp", 0), -len(x.get("description", ""))),
    )
    sorted_descriptions = [dp["description"] for dp in sorted_nodes]

    truncation_info = ""
    dd_message = ""
    has_placeholder = False # Initialize to track placeholder in file paths

    # Combine already_description with the sorted new descriptions
    description_list = already_description + sorted_descriptions
    if not description_list:
        logger.error(f"Entity {entity_name} has no description")
        raise ValueError(f"Entity {entity_name} has no description")

    # 8. Get summary description and LLM usage status
    description, llm_was_used = await _handle_entity_relation_summary(
        "Entity",
        entity_name,
        description_list,
        GRAPH_FIELD_SEP,
        global_config,
        llm_response_cache,
    )

    # 9. Build file_path within MAX_FILE_PATHS
    file_paths_list = []
    seen_paths = set()
    has_placeholder = False # Indicating file_path has been truncated before

    max_file_paths = global_config.get("max_file_paths", DEFAULT_MAX_FILE_PATHS)
    file_path_placeholder = global_config.get(
        "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER
    )

    # Collect from already_file_paths, excluding placeholder
    for fp in already_file_paths:
        if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders
            has_placeholder = True
            continue
        if fp and fp not in seen_paths:
            file_paths_list.append(fp)
            seen_paths.add(fp)

    # Collect from new data
    for dp in nodes_data:
        file_path_item = dp.get("file_path")
        if file_path_item and file_path_item not in seen_paths:
            file_paths_list.append(file_path_item)
            seen_paths.add(file_path_item)

    # Apply count limit
    if len(file_paths_list) > max_file_paths:
        limit_method = global_config.get(
            "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP
        )
        file_path_placeholder = global_config.get(
            "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER
        )
        # Add + sign to indicate actual file count is higher
        original_count_str = (
            f"{len(file_paths_list)}+" if has_placeholder else str(len(file_paths_list))
        )

        if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
            # FIFO: keep tail (newest), discard head
            file_paths_list = file_paths_list[-max_file_paths:]
            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
        else:
            # KEEP: keep head (earliest), discard tail
            file_paths_list = file_paths_list[:max_file_paths]
            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")

        logger.info(
            f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
        )
    # Finalize file_path
    file_path = GRAPH_FIELD_SEP.join(file_paths_list)

    # 10. Log based on actual LLM usage
    num_fragment = len(description_list)
    already_fragment = len(already_description)
    deduplicated_num = already_fragment + len(nodes_data) - num_fragment
    if deduplicated_num > 0:
        dd_message = f"dd:{deduplicated_num}"
    if llm_was_used:
        status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}"
    else:
        status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}"

    if skip_summary_due_to_limit:
        description = (
            already_node.get("description", "(no description)")
            if already_node
            else "(no description)"
    truncation_info = truncation_info_log = ""
    if len(source_ids) < len(full_source_ids):
        # Add truncation info from apply_source_ids_limit if truncation occurred
        truncation_info_log = f"{limit_method} {len(source_ids)}/{len(full_source_ids)}"
        if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
            truncation_info = truncation_info_log
        else:
            truncation_info = "KEEP Old"

    deduplicated_num = already_fragment + len(nodes_data) - num_fragment
    dd_message = ""
    if deduplicated_num > 0:
        # Duplicated description detected across multiple chunks for the same entity
        dd_message = f"dd {deduplicated_num}"

    if dd_message or truncation_info_log:
        status_message += (
            f" ({', '.join(filter(None, [truncation_info_log, dd_message]))})"
        )
        status_message = f"Skip merge for `{entity_name}`: KEEP limit reached"
        logger.debug(status_message)

    # Add message to pipeline status when merge happens
    if already_fragment > 0 or llm_was_used:
        logger.info(status_message)
        if pipeline_status is not None and pipeline_status_lock is not None:
            async with pipeline_status_lock:
                pipeline_status["latest_message"] = status_message
                pipeline_status["history_messages"].append(status_message)
        existing_node_data = dict(already_node or {})
        if not existing_node_data:
            existing_node_data = {
                "entity_id": entity_name,
                "entity_type": entity_type,
                "description": description,
                "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids),
                "file_path": GRAPH_FIELD_SEP.join(already_file_paths),
                "created_at": int(time.time()),
                "truncate": "",
            }
        existing_node_data["entity_name"] = entity_name
        return existing_node_data
    elif num_fragment > 0:
        # Get summary and LLM usage status
        description, llm_was_used = await _handle_entity_relation_summary(
            "Entity",
            entity_name,
            description_list,
            GRAPH_FIELD_SEP,
            global_config,
            llm_response_cache,
        )

        # Log based on actual LLM usage
        if llm_was_used:
            status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}"
        else:
            status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}"

        # Add truncation info from apply_source_ids_limit if truncation occurred
        if len(source_ids) < len(full_source_ids):
            # Add + sign if has_placeholder is True, indicating actual file count is higher
            full_source_count_str = (
                f"{len(full_source_ids)}+"
                if has_placeholder
                else str(len(full_source_ids))
            )
            truncation_info = (
                f"{limit_method}:{len(source_ids)}/{full_source_count_str}"
            )

        if dd_message or truncation_info:
            status_message += (
                f" ({', '.join(filter(None, [truncation_info, dd_message]))})"
            )

        if already_fragment > 0 or llm_was_used:
            logger.info(status_message)
            if pipeline_status is not None and pipeline_status_lock is not None:
                async with pipeline_status_lock:
                    pipeline_status["latest_message"] = status_message
                    pipeline_status["history_messages"].append(status_message)
        else:
            logger.debug(status_message)

    else:
        logger.error(f"Entity {entity_name} has no description")
        description = "(no description)"

    source_id = GRAPH_FIELD_SEP.join(source_ids)

    # Build file_path with count limit
    if skip_summary_due_to_limit:
        # Skip limit, keep original file_path
        file_path = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp)
    else:
        # Collect and apply limit
        file_paths_list = []
        seen_paths = set()
        has_placeholder = False # Track if already_file_paths contains placeholder

        # Get placeholder to filter it out
        file_path_placeholder = global_config.get(
            "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER
        )

        # Collect from already_file_paths, excluding placeholder
        for fp in already_file_paths:
            # Check if this is a placeholder record
            if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders
                has_placeholder = True
                continue
            if fp and fp not in seen_paths:
                file_paths_list.append(fp)
                seen_paths.add(fp)

        # Collect from new data
        for dp in nodes_data:
            file_path_item = dp.get("file_path")
            if file_path_item and file_path_item not in seen_paths:
                file_paths_list.append(file_path_item)
                seen_paths.add(file_path_item)

        # Apply count limit
        max_file_paths = global_config.get("max_file_paths")

        if len(file_paths_list) > max_file_paths:
            limit_method = global_config.get(
                "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP
            )
            file_path_placeholder = global_config.get(
                "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER
            )
            original_count = len(file_paths_list)

            if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
                # FIFO: keep tail (newest), discard head
                file_paths_list = file_paths_list[-max_file_paths:]
            else:
                # KEEP: keep head (earliest), discard tail
                file_paths_list = file_paths_list[:max_file_paths]

            file_paths_list.append(
                f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..."
            )
            logger.info(
                f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
            )

        file_path = GRAPH_FIELD_SEP.join(file_paths_list)
    logger.debug(status_message)

    # 11. Update both graph and vector db
    node_data = dict(
        entity_id=entity_name,
        entity_type=entity_type,
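To make the two limiting strategies concrete, here is a small self-contained sketch of how a de-duplicated path list is capped; the function name, placeholder text, and limits are made up for illustration and are not the project's defaults:

```python
def limit_file_paths(paths: list[str], max_paths: int, method: str, placeholder: str) -> list[str]:
    """Cap a path list and append a marker so readers can tell that items were dropped."""
    if len(paths) <= max_paths:
        return paths
    original_count = len(paths)
    if method == "FIFO":
        kept = paths[-max_paths:]  # FIFO: keep the newest entries at the tail
    else:
        kept = paths[:max_paths]   # KEEP: keep the earliest entries at the head
    kept.append(f"...{placeholder}({method}:{max_paths}/{original_count})...")
    return kept


paths = [f"doc_{i}.txt" for i in range(1, 6)]
print(limit_file_paths(paths, 3, "FIFO", "more_files"))
# ['doc_3.txt', 'doc_4.txt', 'doc_5.txt', '...more_files(FIFO:3/5)...']
print(limit_file_paths(paths, 3, "KEEP", "more_files"))
# ['doc_1.txt', 'doc_2.txt', 'doc_3.txt', '...more_files(KEEP:3/5)...']
```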
@@ -1782,6 +1800,7 @@ async def _merge_edges_then_upsert(
    already_keywords = []
    already_file_paths = []

    # 1. Get existing edge data from graph storage
    if await knowledge_graph_inst.has_edge(src_id, tgt_id):
        already_edge = await knowledge_graph_inst.get_edge(src_id, tgt_id)
        # Handle the case where get_edge returns None or missing fields
@@ -1831,6 +1850,7 @@
        chunk_id for chunk_id in already_source_ids if chunk_id
    ]

    # 2. Merge new source ids with existing ones
    full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids)

    if relation_chunks_storage is not None and full_source_ids:
@@ -1843,6 +1863,7 @@
            }
        )

    # 3. Finalize source_id by applying source ids limit
    limit_method = global_config.get("source_ids_limit_method")
    max_source_limit = global_config.get("max_source_ids_per_relation")
    source_ids = apply_source_ids_limit(
@@ -1855,7 +1876,7 @@
        global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP
    )

    # Only apply filtering in KEEP(ignore new) mode
    # 4. Only keep edges with source_id in the final source_ids list if in KEEP mode
    if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP:
        allowed_source_ids = set(source_ids)
        filtered_edges = []
@@ -1870,21 +1891,51 @@ async def _merge_edges_then_upsert(
                continue
            filtered_edges.append(dp)
        edges_data = filtered_edges
    else:
        # In FIFO mode, keep all edge descriptions - truncation happens at source_ids level only
    else:  # In FIFO mode, keep all edges - truncation happens at source_ids level only
        edges_data = list(edges_data)

    skip_summary_due_to_limit = (
    # 5. Check if we need to skip summary due to source_ids limit
    if (
        limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP
        and len(existing_full_source_ids) >= max_source_limit
        and not edges_data
        and already_description
    )
    ):
        if already_edge:
            logger.info(
                f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks {already_source_ids}/{len(full_source_ids)}"
            )
            existing_edge_data = dict(already_edge)
            return existing_edge_data
        else:
            logger.error(
                f"Internal Error: already_node missing for `{src_id}`~`{tgt_id}`"
            )
            raise ValueError(
                f"Internal Error: already_node missing for `{src_id}`~`{tgt_id}`"
            )

    # Process edges_data with None checks
    # 6.1 Finalize source_id
    source_id = GRAPH_FIELD_SEP.join(source_ids)

    # 6.2 Finalize weight by summing new edges and existing weights
    weight = sum([dp["weight"] for dp in edges_data] + already_weights)

    # Deduplicate by description, keeping first occurrence
    # 6.2 Finalize keywords by merging existing and new keywords
    all_keywords = set()
    # Process already_keywords (which are comma-separated)
    for keyword_str in already_keywords:
        if keyword_str: # Skip empty strings
            all_keywords.update(k.strip() for k in keyword_str.split(",") if k.strip())
    # Process new keywords from edges_data
    for edge in edges_data:
        if edge.get("keywords"):
            all_keywords.update(
                k.strip() for k in edge["keywords"].split(",") if k.strip()
            )
    # Join all unique keywords with commas
    keywords = ",".join(sorted(all_keywords))

    # 7. Deduplicate by description, keeping first occurrence in the same document
    unique_edges = {}
    for dp in edges_data:
        description_value = dp.get("description")
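The keyword merge treats every stored value as a comma-separated list: it splits on commas, strips whitespace, drops empties and duplicates via a set, then rejoins in sorted order. A compact illustration with invented values:

```python
already_keywords = ["supply chain, logistics", ""]
new_edge_keywords = ["logistics,  risk", "risk,resilience"]

all_keywords = set()
for keyword_str in already_keywords + new_edge_keywords:
    if keyword_str:  # skip empty strings
        all_keywords.update(k.strip() for k in keyword_str.split(",") if k.strip())

print(",".join(sorted(all_keywords)))  # logistics,resilience,risk,supply chain
```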
@@ -1900,170 +1951,119 @@ async def _merge_edges_then_upsert(
    )
    sorted_descriptions = [dp["description"] for dp in sorted_edges]

    truncation_info = ""
    dd_message = ""
    has_placeholder = False # Initialize to track placeholder in file paths

    # Combine already_description with sorted new descriptions
    description_list = already_description + sorted_descriptions
    if not description_list:
        logger.error(f"Relation {src_id}~{tgt_id} has no description")
        raise ValueError(f"Relation {src_id}~{tgt_id} has no description")

    num_fragment = len(description_list)
    already_fragment = len(already_description)
    deduplicated_num = already_fragment + len(edges_data) - num_fragment
    if deduplicated_num > 0:
        dd_message = f"dd:{deduplicated_num}"
    # 8. Get summary description and LLM usage status
    description, llm_was_used = await _handle_entity_relation_summary(
        "Relation",
        f"({src_id}, {tgt_id})",
        description_list,
        GRAPH_FIELD_SEP,
        global_config,
        llm_response_cache,
    )

    if skip_summary_due_to_limit:
        description = (
            already_edge.get("description", "(no description)")
            if already_edge
            else "(no description)"
    # 9. Build file_path within MAX_FILE_PATHS limit
    file_paths_list = []
    seen_paths = set()
    has_placeholder = False # Track if already_file_paths contains placeholder

    max_file_paths = global_config.get("max_file_paths", DEFAULT_MAX_FILE_PATHS)
    file_path_placeholder = global_config.get(
        "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER
    )

    # Collect from already_file_paths, excluding placeholder
    for fp in already_file_paths:
        # Check if this is a placeholder record
        if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders
            has_placeholder = True
            continue
        if fp and fp not in seen_paths:
            file_paths_list.append(fp)
            seen_paths.add(fp)

    # Collect from new data
    for dp in edges_data:
        file_path_item = dp.get("file_path")
        if file_path_item and file_path_item not in seen_paths:
            file_paths_list.append(file_path_item)
            seen_paths.add(file_path_item)

    # Apply count limit
    max_file_paths = global_config.get("max_file_paths")

    if len(file_paths_list) > max_file_paths:
        limit_method = global_config.get(
            "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP
        )
        status_message = f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached"
        logger.debug(status_message)
        if pipeline_status is not None and pipeline_status_lock is not None:
            async with pipeline_status_lock:
                pipeline_status["latest_message"] = status_message
                pipeline_status["history_messages"].append(status_message)
        existing_edge_data = dict(already_edge or {})
        if not existing_edge_data:
            existing_edge_data = {
                "description": description,
                "keywords": GRAPH_FIELD_SEP.join(already_keywords),
                "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids),
                "file_path": GRAPH_FIELD_SEP.join(already_file_paths),
                "weight": sum(already_weights) if already_weights else 0.0,
                "truncate": "",
                "created_at": int(time.time()),
            }
        existing_edge_data.setdefault("created_at", int(time.time()))
        existing_edge_data["src_id"] = src_id
        existing_edge_data["tgt_id"] = tgt_id
        return existing_edge_data
    elif num_fragment > 0:
        # Get summary and LLM usage status
        description, llm_was_used = await _handle_entity_relation_summary(
            "Relation",
            f"({src_id}, {tgt_id})",
            description_list,
            GRAPH_FIELD_SEP,
            global_config,
            llm_response_cache,
        )

        # Log based on actual LLM usage
        if llm_was_used:
            status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}"
        else:
            status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}"

        # Add truncation info from apply_source_ids_limit if truncation occurred
        if len(source_ids) < len(full_source_ids):
            # Add + sign if has_placeholder is True, indicating actual file count is higher
            full_source_count_str = (
                f"{len(full_source_ids)}+"
                if has_placeholder
                else str(len(full_source_ids))
            )
            truncation_info = (
                f"{limit_method}:{len(source_ids)}/{full_source_count_str}"
            )

        if dd_message or truncation_info:
            status_message += (
                f" ({', '.join(filter(None, [truncation_info, dd_message]))})"
            )

        if already_fragment > 0 or llm_was_used:
            logger.info(status_message)
            if pipeline_status is not None and pipeline_status_lock is not None:
                async with pipeline_status_lock:
                    pipeline_status["latest_message"] = status_message
                    pipeline_status["history_messages"].append(status_message)
        else:
            logger.debug(status_message)

    else:
        logger.error(f"Edge {src_id} - {tgt_id} has no description")
        description = "(no description)"

    # Split all existing and new keywords into individual terms, then combine and deduplicate
    all_keywords = set()
    # Process already_keywords (which are comma-separated)
    for keyword_str in already_keywords:
        if keyword_str: # Skip empty strings
            all_keywords.update(k.strip() for k in keyword_str.split(",") if k.strip())
    # Process new keywords from edges_data
    for edge in edges_data:
        if edge.get("keywords"):
            all_keywords.update(
                k.strip() for k in edge["keywords"].split(",") if k.strip()
            )
    # Join all unique keywords with commas
    keywords = ",".join(sorted(all_keywords))

    source_id = GRAPH_FIELD_SEP.join(source_ids)

    # Build file_path with count limit
    if skip_summary_due_to_limit:
        # Skip limit, keep original file_path
        file_path = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp)
    else:
        # Collect and apply limit
        file_paths_list = []
        seen_paths = set()
        has_placeholder = False # Track if already_file_paths contains placeholder

        # Get placeholder to filter it out
        file_path_placeholder = global_config.get(
            "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER
        )

        # Collect from already_file_paths, excluding placeholder
        for fp in already_file_paths:
            # Check if this is a placeholder record
            if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders
                has_placeholder = True
                continue
            if fp and fp not in seen_paths:
                file_paths_list.append(fp)
                seen_paths.add(fp)
        # Add + sign to indicate actual file count is higher
        original_count_str = (
            f"{len(file_paths_list)}+" if has_placeholder else str(len(file_paths_list))
        )

        # Collect from new data
        for dp in edges_data:
            file_path_item = dp.get("file_path")
            if file_path_item and file_path_item not in seen_paths:
                file_paths_list.append(file_path_item)
                seen_paths.add(file_path_item)
        if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
            # FIFO: keep tail (newest), discard head
            file_paths_list = file_paths_list[-max_file_paths:]
            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
        else:
            # KEEP: keep head (earliest), discard tail
            file_paths_list = file_paths_list[:max_file_paths]
            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")

        # Apply count limit
        max_file_paths = global_config.get("max_file_paths")
        logger.info(
            f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
        )
    # Finalize file_path
    file_path = GRAPH_FIELD_SEP.join(file_paths_list)

        if len(file_paths_list) > max_file_paths:
            limit_method = global_config.get(
                "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP
            )
            file_path_placeholder = global_config.get(
                "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER
            )
            original_count = len(file_paths_list)
    # 10. Log based on actual LLM usage
    num_fragment = len(description_list)
    already_fragment = len(already_description)
    if llm_was_used:
        status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}"
    else:
        status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}"

            if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
                # FIFO: keep tail (newest), discard head
                file_paths_list = file_paths_list[-max_file_paths:]
            else:
                # KEEP: keep head (earliest), discard tail
                file_paths_list = file_paths_list[:max_file_paths]
    truncation_info = truncation_info_log = ""
    if len(source_ids) < len(full_source_ids):
        # Add truncation info from apply_source_ids_limit if truncation occurred
        truncation_info_log = f"{limit_method} {len(source_ids)}/{len(full_source_ids)}"
        if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
            truncation_info = truncation_info_log
        else:
            truncation_info = "KEEP Old"

            file_paths_list.append(
                f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..."
            )
            logger.info(
                f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
            )
    deduplicated_num = already_fragment + len(edges_data) - num_fragment
    dd_message = ""
    if deduplicated_num > 0:
        # Duplicated description detected across multiple chunks for the same entity
        dd_message = f"dd {deduplicated_num}"

        file_path = GRAPH_FIELD_SEP.join(file_paths_list)
    if dd_message or truncation_info_log:
        status_message += (
            f" ({', '.join(filter(None, [truncation_info_log, dd_message]))})"
        )

    # Add message to pipeline status when merge happens
    if already_fragment > 0 or llm_was_used:
        logger.info(status_message)
        if pipeline_status is not None and pipeline_status_lock is not None:
            async with pipeline_status_lock:
                pipeline_status["latest_message"] = status_message
                pipeline_status["history_messages"].append(status_message)
    else:
        logger.debug(status_message)

    # 11. Update both graph and vector db
    for need_insert_id in [src_id, tgt_id]:
        if not (await knowledge_graph_inst.has_node(need_insert_id)):
            node_created_at = int(time.time())