add performance note for deduplication loop in bulk_utils

Daniel Chalef 2025-09-25 21:41:41 -07:00
parent ecab825684
commit a5192f9ac4


@@ -308,6 +308,11 @@ async def dedupe_nodes_bulk(
     canonical_nodes: dict[str, EntityNode] = {}
     for _, resolved_nodes in episode_resolutions:
         for node in resolved_nodes:
+            # NOTE: this loop is O(n^2) in the number of nodes inside the batch because we rebuild
+            # the MinHash index for the accumulated canonical pool each time. The LRU-backed
+            # shingle cache keeps the constant factors low for typical batch sizes (<= CHUNK_SIZE),
+            # but if batches grow significantly we should switch to an incremental index or chunked
+            # processing.
             if not canonical_nodes:
                 canonical_nodes[node.uuid] = node
                 continue
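
For illustration only, a minimal sketch of the "incremental index" alternative the note mentions, using the datasketch library's MinHashLSH. The shingles_for and minhash_for helpers, the similarity threshold, and the node shape (.uuid, .name) are assumptions for the sketch, not the project's actual bulk_utils API; the point is that each canonical node is inserted into the index once, so every new node needs a single query instead of a rebuild of the whole pool.

    from datasketch import MinHash, MinHashLSH


    def shingles_for(name: str, k: int = 3) -> set[str]:
        # Hypothetical helper: character k-shingles of a node name.
        text = name.lower()
        return {text[i : i + k] for i in range(max(len(text) - k + 1, 1))}


    def minhash_for(name: str, num_perm: int = 128) -> MinHash:
        m = MinHash(num_perm=num_perm)
        for shingle in shingles_for(name):
            m.update(shingle.encode("utf-8"))
        return m


    def dedupe_incremental(nodes, threshold: float = 0.8) -> dict:
        # Insert each canonical node into the LSH index exactly once, so the
        # batch costs O(n) queries/inserts rather than O(n^2) index rebuilds.
        lsh = MinHashLSH(threshold=threshold, num_perm=128)
        canonical = {}
        for node in nodes:
            mh = minhash_for(node.name)
            if lsh.query(mh):
                # A sufficiently similar canonical node already exists; in the
                # real code the duplicate would be mapped onto it here.
                continue
            canonical[node.uuid] = node
            lsh.insert(node.uuid, mh)
        return canonical

The same structure also covers the chunked-processing idea: the outer caller can feed the index in CHUNK_SIZE slices without changing the per-node query/insert pattern.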