diff --git a/graphiti_core/utils/maintenance/dedup_helpers.py b/graphiti_core/utils/maintenance/dedup_helpers.py
index c5ee8024..3c08512e 100644
--- a/graphiti_core/utils/maintenance/dedup_helpers.py
+++ b/graphiti_core/utils/maintenance/dedup_helpers.py
@@ -151,6 +151,7 @@ class DedupCandidateIndexes:
     """Precomputed lookup structures that drive entity deduplication heuristics."""
 
     existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
     normalized_existing: defaultdict[str, list[EntityNode]]
     shingles_by_candidate: dict[str, set[str]]
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
@@ -168,12 +169,14 @@ class DedupResolutionState:
 def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
     """Precompute exact and fuzzy lookup structures once per dedupe run."""
     normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
     shingles_by_candidate: dict[str, set[str]] = {}
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
 
     for candidate in existing_nodes:
         normalized = _normalize_name_exact(candidate.name)
         normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
         shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
         shingles_by_candidate[candidate.uuid] = shingles
 
@@ -184,6 +187,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidate
 
     return DedupCandidateIndexes(
         existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
         normalized_existing=normalized_existing,
         shingles_by_candidate=shingles_by_candidate,
         lsh_buckets=lsh_buckets,
@@ -210,6 +214,9 @@ def _resolve_with_similarity(
             state.resolved_nodes[idx] = match
             state.uuid_map[node.uuid] = match.uuid
             continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
 
         shingles = _cached_shingles(normalized_fuzzy)
         signature = _minhash_signature(shingles)
@@ -224,10 +231,7 @@ def _resolve_with_similarity(
             score = _jaccard_similarity(shingles, candidate_shingles)
             if score > best_score:
                 best_score = score
-                best_candidate = next(
-                    (cand for cand in indexes.existing_nodes if cand.uuid == candidate_id),
-                    None,
-                )
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
 
         if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
             state.resolved_nodes[idx] = best_candidate
diff --git a/tests/utils/maintenance/test_node_operations.py b/tests/utils/maintenance/test_node_operations.py
index 58a8f27a..1be44cf9 100644
--- a/tests/utils/maintenance/test_node_operations.py
+++ b/tests/utils/maintenance/test_node_operations.py
@@ -199,6 +199,7 @@ def test_build_candidate_indexes_populates_structures():
 
     normalized_key = candidate.name.lower()
     assert indexes.normalized_existing[normalized_key][0].uuid == candidate.uuid
+    assert indexes.nodes_by_uuid[candidate.uuid] is candidate
     assert candidate.uuid in indexes.shingles_by_candidate
     assert any(candidate.uuid in bucket for bucket in indexes.lsh_buckets.values())
 
@@ -260,7 +261,27 @@ def test_resolve_with_similarity_exact_match_updates_state():
 def test_resolve_with_similarity_low_entropy_defers_resolution():
     extracted = EntityNode(name='Bob', group_id='group', labels=['Entity'])
-    indexes = DedupCandidateIndexes([], defaultdict(list), {}, defaultdict(list))
+    indexes = DedupCandidateIndexes(
+        existing_nodes=[],
+        nodes_by_uuid={},
+        normalized_existing=defaultdict(list),
+        shingles_by_candidate={},
+        lsh_buckets=defaultdict(list),
+    )
     state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
 
     _resolve_with_similarity([extracted], indexes, state)
 
     assert state.resolved_nodes[0] is None
     assert state.unresolved_indices == [0]
 
 
+def test_resolve_with_similarity_multiple_exact_matches_defers_to_llm():
+    candidate1 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    candidate2 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    extracted = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+
+    indexes = _build_candidate_indexes([candidate1, candidate2])
+    state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
+
+    _resolve_with_similarity([extracted], indexes, state)
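
For readers skimming the patch, it makes two behavioral changes to `_resolve_with_similarity`: an extracted name that exactly matches more than one existing candidate is now deferred to the LLM pass via `state.unresolved_indices` instead of proceeding to fuzzy matching, and the fuzzy stage's per-candidate linear scan over `indexes.existing_nodes` (O(n) per lookup) is replaced by an O(1) lookup in the new `nodes_by_uuid` dict built once in `_build_candidate_indexes`. The sketch below illustrates that control flow in isolation; the `Node` type and `resolve` helper are simplified stand-ins for illustration, not graphiti's actual API beyond what the diff itself shows.

```python
from collections import defaultdict
from dataclasses import dataclass


@dataclass
class Node:
    # Hypothetical stand-in for EntityNode; only the fields this sketch needs.
    uuid: str
    name: str


def resolve(extracted: list[Node], existing: list[Node]) -> tuple[list[Node | None], list[int]]:
    # Index pass, built once per run (mirrors _build_candidate_indexes):
    # a uuid -> node dict for O(1) candidate retrieval, and a
    # normalized-name -> nodes map for exact matching.
    nodes_by_uuid = {node.uuid: node for node in existing}
    by_normalized: defaultdict[str, list[Node]] = defaultdict(list)
    for node in existing:
        by_normalized[node.name.lower()].append(node)

    resolved: list[Node | None] = [None] * len(extracted)
    unresolved: list[int] = []
    for idx, node in enumerate(extracted):
        matches = by_normalized.get(node.name.lower(), [])
        if len(matches) == 1:
            resolved[idx] = matches[0]  # unambiguous exact match: resolve immediately
        elif len(matches) > 1:
            unresolved.append(idx)  # ambiguous: defer to the LLM pass, as the patch does
        else:
            # The real code runs MinHash/LSH fuzzy matching here, fetching each
            # bucket candidate via nodes_by_uuid, and defers if nothing clears
            # the Jaccard threshold; this sketch defers directly.
            unresolved.append(idx)
    return resolved, unresolved


# Mirrors the new test: two existing nodes share a name, so the extracted
# node is deferred rather than silently resolved to an arbitrary duplicate.
existing = [Node('u1', 'Johnny Appleseed'), Node('u2', 'Johnny Appleseed')]
resolved, unresolved = resolve([Node('u3', 'Johnny Appleseed')], existing)
assert resolved == [None] and unresolved == [0]
```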