cherry-pick be3d274a
This commit is contained in:
parent
ef2355a7ac
commit
18a8f57b89
1 changed files with 16 additions and 10 deletions
|
|
@ -1026,7 +1026,7 @@ async def _rebuild_single_entity(
|
||||||
async def _update_entity_storage(
|
async def _update_entity_storage(
|
||||||
final_description: str,
|
final_description: str,
|
||||||
entity_type: str,
|
entity_type: str,
|
||||||
file_paths: list[str],
|
file_paths: set[str],
|
||||||
source_chunk_ids: list[str],
|
source_chunk_ids: list[str],
|
||||||
truncation_info: str = "",
|
truncation_info: str = "",
|
||||||
):
|
):
|
||||||
|
|
@ -1195,6 +1195,8 @@ async def _rebuild_single_entity(
|
||||||
f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
|
f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
file_paths = set(file_paths_list)
|
||||||
|
|
||||||
# Remove duplicates while preserving order
|
# Remove duplicates while preserving order
|
||||||
description_list = list(dict.fromkeys(descriptions))
|
description_list = list(dict.fromkeys(descriptions))
|
||||||
entity_types = list(dict.fromkeys(entity_types))
|
entity_types = list(dict.fromkeys(entity_types))
|
||||||
|
|
@ -1229,7 +1231,7 @@ async def _rebuild_single_entity(
|
||||||
await _update_entity_storage(
|
await _update_entity_storage(
|
||||||
final_description,
|
final_description,
|
||||||
entity_type,
|
entity_type,
|
||||||
file_paths_list,
|
file_paths,
|
||||||
limited_chunk_ids,
|
limited_chunk_ids,
|
||||||
truncation_info,
|
truncation_info,
|
||||||
)
|
)
|
||||||
|
|
@ -1352,6 +1354,8 @@ async def _rebuild_single_relationship(
|
||||||
f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
|
f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
file_paths = set(file_paths_list)
|
||||||
|
|
||||||
# Remove duplicates while preserving order
|
# Remove duplicates while preserving order
|
||||||
description_list = list(dict.fromkeys(descriptions))
|
description_list = list(dict.fromkeys(descriptions))
|
||||||
keywords = list(dict.fromkeys(keywords))
|
keywords = list(dict.fromkeys(keywords))
|
||||||
|
|
@ -1394,8 +1398,8 @@ async def _rebuild_single_relationship(
|
||||||
"keywords": combined_keywords,
|
"keywords": combined_keywords,
|
||||||
"weight": weight,
|
"weight": weight,
|
||||||
"source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids),
|
"source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids),
|
||||||
"file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths_list if fp])
|
"file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp])
|
||||||
if file_paths_list
|
if file_paths
|
||||||
else current_relationship.get("file_path", "unknown_source"),
|
else current_relationship.get("file_path", "unknown_source"),
|
||||||
"truncate": truncation_info,
|
"truncate": truncation_info,
|
||||||
}
|
}
|
||||||
|
|
@ -1646,12 +1650,11 @@ async def _merge_nodes_then_upsert(
|
||||||
if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
|
if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
|
||||||
# FIFO: keep tail (newest), discard head
|
# FIFO: keep tail (newest), discard head
|
||||||
file_paths_list = file_paths_list[-max_file_paths:]
|
file_paths_list = file_paths_list[-max_file_paths:]
|
||||||
file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
|
|
||||||
else:
|
else:
|
||||||
# KEEP: keep head (earliest), discard tail
|
# KEEP: keep head (earliest), discard tail
|
||||||
file_paths_list = file_paths_list[:max_file_paths]
|
file_paths_list = file_paths_list[:max_file_paths]
|
||||||
file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
|
|
||||||
|
|
||||||
|
file_paths_list.append(f"...{file_path_placeholder}({limit_method})...")
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
|
f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
|
||||||
)
|
)
|
||||||
|
|
@ -1673,7 +1676,7 @@ async def _merge_nodes_then_upsert(
|
||||||
if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
|
if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
|
||||||
truncation_info = truncation_info_log
|
truncation_info = truncation_info_log
|
||||||
else:
|
else:
|
||||||
truncation_info = "KEEP Old"
|
truncation_info = "Keep Old Chunks"
|
||||||
|
|
||||||
deduplicated_num = already_fragment + len(nodes_data) - num_fragment
|
deduplicated_num = already_fragment + len(nodes_data) - num_fragment
|
||||||
dd_message = ""
|
dd_message = ""
|
||||||
|
|
@ -1968,12 +1971,15 @@ async def _merge_edges_then_upsert(
|
||||||
if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
|
if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
|
||||||
# FIFO: keep tail (newest), discard head
|
# FIFO: keep tail (newest), discard head
|
||||||
file_paths_list = file_paths_list[-max_file_paths:]
|
file_paths_list = file_paths_list[-max_file_paths:]
|
||||||
file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
|
|
||||||
else:
|
else:
|
||||||
# KEEP: keep head (earliest), discard tail
|
# KEEP: keep head (earliest), discard tail
|
||||||
file_paths_list = file_paths_list[:max_file_paths]
|
file_paths_list = file_paths_list[:max_file_paths]
|
||||||
file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
|
|
||||||
|
|
||||||
|
# Add + sign if has_placeholder is True, indicating actual file count is higher
|
||||||
|
|
||||||
|
file_paths_list.append(
|
||||||
|
f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count_str})..."
|
||||||
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
|
f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
|
||||||
)
|
)
|
||||||
|
|
@ -1995,7 +2001,7 @@ async def _merge_edges_then_upsert(
|
||||||
if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
|
if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
|
||||||
truncation_info = truncation_info_log
|
truncation_info = truncation_info_log
|
||||||
else:
|
else:
|
||||||
truncation_info = "KEEP Old"
|
truncation_info = "Keep Old Chunks"
|
||||||
|
|
||||||
deduplicated_num = already_fragment + len(edges_data) - num_fragment
|
deduplicated_num = already_fragment + len(edges_data) - num_fragment
|
||||||
dd_message = ""
|
dd_message = ""
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue