From a5192f9ac42c100cc72b85f32938987ca056c37d Mon Sep 17 00:00:00 2001
From: Daniel Chalef <131175+danielchalef@users.noreply.github.com>
Date: Thu, 25 Sep 2025 21:41:41 -0700
Subject: [PATCH] add performance note for deduplication loop in bulk_utils

---
 graphiti_core/utils/bulk_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/graphiti_core/utils/bulk_utils.py b/graphiti_core/utils/bulk_utils.py
index 330e960d..2c5eaf5b 100644
--- a/graphiti_core/utils/bulk_utils.py
+++ b/graphiti_core/utils/bulk_utils.py
@@ -308,6 +308,11 @@ async def dedupe_nodes_bulk(
     canonical_nodes: dict[str, EntityNode] = {}
     for _, resolved_nodes in episode_resolutions:
         for node in resolved_nodes:
+            # NOTE: this loop is O(n^2) in the number of nodes inside the batch because we rebuild
+            # the MinHash index for the accumulated canonical pool each time. The LRU-backed
+            # shingle cache keeps the constant factors low for typical batch sizes (≤ CHUNK_SIZE),
+            # but if batches grow significantly we should switch to an incremental index or chunked
+            # processing.
             if not canonical_nodes:
                 canonical_nodes[node.uuid] = node
                 continue