From 18a8f57b898f16aba8d120cd8abf6de289263af5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20MANSUY?=
Date: Thu, 4 Dec 2025 19:18:37 +0800
Subject: [PATCH] cherry-pick be3d274a

---
 lightrag/operate.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 4bac0289..60fa66a3 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1026,7 +1026,7 @@ async def _rebuild_single_entity(
     async def _update_entity_storage(
         final_description: str,
         entity_type: str,
-        file_paths: list[str],
+        file_paths: set[str],
         source_chunk_ids: list[str],
         truncation_info: str = "",
     ):
@@ -1195,6 +1195,8 @@ async def _rebuild_single_entity(
                 f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
             )
 
+        file_paths = set(file_paths_list)
+
         # Remove duplicates while preserving order
         description_list = list(dict.fromkeys(descriptions))
         entity_types = list(dict.fromkeys(entity_types))
@@ -1229,7 +1231,7 @@ async def _rebuild_single_entity(
         await _update_entity_storage(
             final_description,
             entity_type,
-            file_paths_list,
+            file_paths,
             limited_chunk_ids,
             truncation_info,
         )
@@ -1352,6 +1354,8 @@ async def _rebuild_single_relationship(
                 f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
             )
 
+        file_paths = set(file_paths_list)
+
         # Remove duplicates while preserving order
         description_list = list(dict.fromkeys(descriptions))
         keywords = list(dict.fromkeys(keywords))
@@ -1394,8 +1398,8 @@ async def _rebuild_single_relationship(
             "keywords": combined_keywords,
             "weight": weight,
             "source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids),
-            "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths_list if fp])
-            if file_paths_list
+            "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp])
+            if file_paths
             else current_relationship.get("file_path", "unknown_source"),
             "truncate": truncation_info,
         }
@@ -1646,12 +1650,11 @@ async def _merge_nodes_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             # FIFO: keep tail (newest), discard head
             file_paths_list = file_paths_list[-max_file_paths:]
-            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
         else:
             # KEEP: keep head (earliest), discard tail
             file_paths_list = file_paths_list[:max_file_paths]
-            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
 
+        file_paths_list.append(f"...{file_path_placeholder}({limit_method})...")
         logger.info(
             f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
         )
@@ -1673,7 +1676,7 @@ async def _merge_nodes_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             truncation_info = truncation_info_log
         else:
-            truncation_info = "KEEP Old"
+            truncation_info = "Keep Old Chunks"
 
     deduplicated_num = already_fragment + len(nodes_data) - num_fragment
     dd_message = ""
@@ -1968,12 +1971,15 @@ async def _merge_edges_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             # FIFO: keep tail (newest), discard head
             file_paths_list = file_paths_list[-max_file_paths:]
-            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
         else:
             # KEEP: keep head (earliest), discard tail
             file_paths_list = file_paths_list[:max_file_paths]
-            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
 
+        # Add + sign if has_placeholder is True, indicating actual file count is higher
+        file_paths_list.append(
+            f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count_str})..."
+        )
+
         logger.info(
             f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
         )
@@ -1995,7 +2001,7 @@ async def _merge_edges_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             truncation_info = truncation_info_log
         else:
-            truncation_info = "KEEP Old"
+            truncation_info = "Keep Old Chunks"
 
     deduplicated_num = already_fragment + len(edges_data) - num_fragment
     dd_message = ""
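
Note (reviewer commentary, not part of the patch): the two _merge_*_then_upsert hunks replace the per-branch "(FIFO)" / "(KEEP Old)" suffixes with one unified placeholder appended after truncation. Below is a minimal, self-contained sketch of that behavior. Only the variable names, the slicing, and the f-string placeholder format come from the diff; the constant's value, the placeholder default, and the demo data are assumptions for illustration.

# Illustrative sketch only; mirrors the truncation logic the patch unifies.
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"  # assumed value of the real constant


def limit_file_paths(
    file_paths_list: list[str],
    max_file_paths: int,
    limit_method: str,
    file_path_placeholder: str = "(omitted)",  # assumed default
) -> list[str]:
    """Truncate a file_path list and append one unified placeholder."""
    original_count_str = str(len(file_paths_list))
    if len(file_paths_list) <= max_file_paths:
        return file_paths_list
    if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
        # FIFO: keep tail (newest), discard head
        file_paths_list = file_paths_list[-max_file_paths:]
    else:
        # KEEP: keep head (earliest), discard tail
        file_paths_list = file_paths_list[:max_file_paths]
    # Unified placeholder (edge-hunk variant): method plus kept/original counts
    file_paths_list.append(
        f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count_str})..."
    )
    return file_paths_list


if __name__ == "__main__":
    paths = [f"doc_{i}.md" for i in range(5)]
    print(limit_file_paths(paths, 3, "FIFO"))
    # ['doc_2.md', 'doc_3.md', 'doc_4.md', '...(omitted)(FIFO:3/5)...']
    print(limit_file_paths(paths, 3, "KEEP"))
    # ['doc_0.md', 'doc_1.md', 'doc_2.md', '...(omitted)(KEEP:3/5)...']

The rebuild hunks additionally convert file_paths to a set before storage (file_paths = set(file_paths_list)), so _update_entity_storage receives deduplicated paths; order is no longer guaranteed at that point, which matches the signature change from list[str] to set[str].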