Raphaël MANSUY 2025-12-04 19:18:37 +08:00
parent ef2355a7ac
commit 18a8f57b89


@@ -1026,7 +1026,7 @@ async def _rebuild_single_entity(
     async def _update_entity_storage(
         final_description: str,
         entity_type: str,
-        file_paths: list[str],
+        file_paths: set[str],
         source_chunk_ids: list[str],
         truncation_info: str = "",
     ):
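This hunk retypes `file_paths` from `list[str]` to `set[str]`, making deduplication part of the helper's contract. A minimal sketch of the semantic difference, with made-up values (nothing below is project code):

```python
# A set collapses duplicate file paths automatically; the chunk IDs
# stay a list because their order is meaningful.
file_paths: set[str] = set()
for path in ["a.md", "b.md", "a.md"]:
    file_paths.add(path)  # the second "a.md" is a no-op
assert file_paths == {"a.md", "b.md"}

source_chunk_ids: list[str] = ["chunk-1", "chunk-2", "chunk-1"]
assert len(source_chunk_ids) == 3  # a list keeps duplicates and order
```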
@@ -1195,6 +1195,8 @@ async def _rebuild_single_entity(
             f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
         )
+    file_paths = set(file_paths_list)
+
     # Remove duplicates while preserving order
     description_list = list(dict.fromkeys(descriptions))
     entity_types = list(dict.fromkeys(entity_types))
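The surrounding code uses `dict.fromkeys` for its order-preserving deduplication, which works because dict keys remember first-seen insertion order (guaranteed since Python 3.7), unlike `set()`. A small illustration with made-up values:

```python
descriptions = ["desc A", "desc B", "desc A", "desc C"]
# Keys keep first-seen order; converting back to a list drops duplicates
# without reshuffling anything.
deduped = list(dict.fromkeys(descriptions))
assert deduped == ["desc A", "desc B", "desc C"]
```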
@@ -1229,7 +1231,7 @@ async def _rebuild_single_entity(
     await _update_entity_storage(
         final_description,
         entity_type,
-        file_paths_list,
+        file_paths,
         limited_chunk_ids,
         truncation_info,
     )
@@ -1352,6 +1354,8 @@ async def _rebuild_single_relationship(
             f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
         )
+    file_paths = set(file_paths_list)
+
     # Remove duplicates while preserving order
     description_list = list(dict.fromkeys(descriptions))
     keywords = list(dict.fromkeys(keywords))
@@ -1394,8 +1398,8 @@ async def _rebuild_single_relationship(
         "keywords": combined_keywords,
         "weight": weight,
         "source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids),
-        "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths_list if fp])
-        if file_paths_list
+        "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp])
+        if file_paths
         else current_relationship.get("file_path", "unknown_source"),
         "truncate": truncation_info,
     }
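A sketch of the `"file_path"` expression above, isolated into a helper so it can be run standalone. `GRAPH_FIELD_SEP`'s real value lives in the repo; `"<SEP>"` here is an assumption, and the inputs are made up:

```python
GRAPH_FIELD_SEP = "<SEP>"  # assumed value, for illustration only

def render_file_path(file_paths: set[str], current_relationship: dict) -> str:
    # Falsy entries (empty strings) are filtered out; an empty set falls
    # back to whatever is already stored on the relationship.
    return (
        GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp])
        if file_paths
        else current_relationship.get("file_path", "unknown_source")
    )

print(render_file_path({"a.md", "b.md"}, {}))  # a.md<SEP>b.md (set order varies)
print(render_file_path(set(), {}))             # unknown_source
```

One side effect of the `list -> set` switch: iterating a set has no stable order, so two runs can serialize the same paths in different order; sorting before joining would make the stored field deterministic.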
@@ -1646,12 +1650,11 @@ async def _merge_nodes_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             # FIFO: keep tail (newest), discard head
             file_paths_list = file_paths_list[-max_file_paths:]
-            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
         else:
             # KEEP: keep head (earliest), discard tail
             file_paths_list = file_paths_list[:max_file_paths]
-            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
+        file_paths_list.append(f"...{file_path_placeholder}({limit_method})...")
         logger.info(
             f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
         )
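This hunk collapses the two per-branch `append` calls into a single placeholder tagged with `limit_method`, so both truncation policies leave the same kind of marker. A standalone sketch of the policy; the two constants mirror names in the diff but their values are assumptions:

```python
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"  # assumed value
file_path_placeholder = "more"         # assumed value

def limit_file_paths(paths: list[str], max_file_paths: int, limit_method: str) -> list[str]:
    if len(paths) <= max_file_paths:
        return paths
    if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
        paths = paths[-max_file_paths:]  # FIFO: keep tail (newest)
    else:
        paths = paths[:max_file_paths]   # KEEP: keep head (earliest)
    # One placeholder for both branches, tagged with the method used.
    paths.append(f"...{file_path_placeholder}({limit_method})...")
    return paths

print(limit_file_paths(["p1", "p2", "p3", "p4"], 2, "FIFO"))
# ['p3', 'p4', '...more(FIFO)...']
```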
@@ -1673,7 +1676,7 @@ async def _merge_nodes_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             truncation_info = truncation_info_log
         else:
-            truncation_info = "KEEP Old"
+            truncation_info = "Keep Old Chunks"
 
     deduplicated_num = already_fragment + len(nodes_data) - num_fragment
     dd_message = ""
@@ -1968,12 +1971,15 @@ async def _merge_edges_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             # FIFO: keep tail (newest), discard head
             file_paths_list = file_paths_list[-max_file_paths:]
-            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
         else:
             # KEEP: keep head (earliest), discard tail
             file_paths_list = file_paths_list[:max_file_paths]
-            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
+        # original_count_str carries a trailing + when has_placeholder is
+        # True, indicating the actual file count is higher
+        file_paths_list.append(
+            f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count_str})..."
+        )
         logger.info(
             f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
         )
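The edge-side placeholder goes one step further than the node-side one: it also records how many paths were kept out of how many existed, e.g. `...more(FIFO:30/120+)...`. A sketch of how that string could be produced; the helper, its arguments, and the `+` convention are assumptions inferred from the comment above, not project code:

```python
def edge_placeholder(limit_method: str, max_file_paths: int,
                     original_count: int, has_placeholder: bool) -> str:
    # Assumed convention: a trailing + means the incoming list already
    # held a placeholder, so the true count exceeds the number shown.
    original_count_str = f"{original_count}+" if has_placeholder else str(original_count)
    return f"...more({limit_method}:{max_file_paths}/{original_count_str})..."

print(edge_placeholder("FIFO", 30, 120, True))  # ...more(FIFO:30/120+)...
```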
@@ -1995,7 +2001,7 @@ async def _merge_edges_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             truncation_info = truncation_info_log
         else:
-            truncation_info = "KEEP Old"
+            truncation_info = "Keep Old Chunks"
 
     deduplicated_num = already_fragment + len(edges_data) - num_fragment
     dd_message = ""