From a5192f9ac42c100cc72b85f32938987ca056c37d Mon Sep 17 00:00:00 2001
From: Daniel Chalef <131175+danielchalef@users.noreply.github.com>
Date: Thu, 25 Sep 2025 21:41:41 -0700
Subject: [PATCH] add performance note for deduplication loop in bulk_utils

---
 graphiti_core/utils/bulk_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/graphiti_core/utils/bulk_utils.py b/graphiti_core/utils/bulk_utils.py
index 330e960d..2c5eaf5b 100644
--- a/graphiti_core/utils/bulk_utils.py
+++ b/graphiti_core/utils/bulk_utils.py
@@ -308,6 +308,11 @@ async def dedupe_nodes_bulk(
     canonical_nodes: dict[str, EntityNode] = {}
     for _, resolved_nodes in episode_resolutions:
         for node in resolved_nodes:
+            # NOTE: this loop is O(n^2) in the number of nodes inside the batch because we rebuild
+            # the MinHash index for the accumulated canonical pool each time. The LRU-backed
+            # shingle cache keeps the constant factors low for typical batch sizes (≤ CHUNK_SIZE),
+            # but if batches grow significantly we should switch to an incremental index or chunked
+            # processing.
             if not canonical_nodes:
                 canonical_nodes[node.uuid] = node
                 continue