enhance deduplication helpers with UUID indexing and update resolution logic
This commit is contained in:
parent 859d7aee5e
commit 7cd9798977
2 changed files with 30 additions and 5 deletions
@@ -151,6 +151,7 @@ class DedupCandidateIndexes:
     """Precomputed lookup structures that drive entity deduplication heuristics."""
 
     existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
     normalized_existing: defaultdict[str, list[EntityNode]]
     shingles_by_candidate: dict[str, set[str]]
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
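For orientation, a minimal standalone sketch of the container as it reads after this hunk. The @dataclass decorator and the stand-in EntityNode below are assumptions; only the index field names and annotations come from the diff itself.

# Standalone sketch, not the library code: EntityNode here is a stand-in carrying only
# the fields this diff exercises (name, group_id, labels, uuid).
from collections import defaultdict
from dataclasses import dataclass, field
from uuid import uuid4


@dataclass
class EntityNode:  # stand-in for the real node class used by the dedup helpers
    name: str
    group_id: str
    labels: list[str]
    uuid: str = field(default_factory=lambda: uuid4().hex)


@dataclass
class DedupCandidateIndexes:
    """Precomputed lookup structures that drive entity deduplication heuristics."""

    existing_nodes: list[EntityNode]
    nodes_by_uuid: dict[str, EntityNode]  # new in this commit: O(1) node lookup by UUID
    normalized_existing: defaultdict[str, list[EntityNode]]
    shingles_by_candidate: dict[str, set[str]]
    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]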
@@ -168,12 +169,14 @@ class DedupResolutionState:
 def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
     """Precompute exact and fuzzy lookup structures once per dedupe run."""
     normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
     shingles_by_candidate: dict[str, set[str]] = {}
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
 
     for candidate in existing_nodes:
         normalized = _normalize_name_exact(candidate.name)
         normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
 
         shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
         shingles_by_candidate[candidate.uuid] = shingles
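The fuzzy side of the index rests on character shingles and Jaccard similarity. The helpers _normalize_name_for_fuzzy, _cached_shingles, and _jaccard_similarity are referenced but not shown in this diff, so the sketch below is an assumed, simplified reading of what they might do: trigram shingles over a lowercased name, cached with functools.lru_cache.

# Assumed, simplified versions of the fuzzy-matching helpers referenced above; the real
# implementations are not part of this diff.
from functools import lru_cache


def _normalize_name_for_fuzzy(name: str) -> str:
    # Lowercase and collapse whitespace so 'Johnny  Appleseed' matches 'johnny appleseed'.
    return ' '.join(name.lower().split())


@lru_cache(maxsize=4096)
def _cached_shingles(text: str, k: int = 3) -> frozenset[str]:
    # Character k-grams ("shingles"); very short names fall back to the whole string.
    if len(text) < k:
        return frozenset({text})
    return frozenset(text[i:i + k] for i in range(len(text) - k + 1))


def _jaccard_similarity(a: frozenset[str], b: frozenset[str]) -> float:
    # |A ∩ B| / |A ∪ B|, defined as 0.0 when both sets are empty.
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)


print(_jaccard_similarity(_cached_shingles('johnny appleseed'),
                          _cached_shingles('jonny appleseed')))  # ≈ 0.69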
@@ -184,6 +187,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
 
     return DedupCandidateIndexes(
         existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
         normalized_existing=normalized_existing,
         shingles_by_candidate=shingles_by_candidate,
         lsh_buckets=lsh_buckets,
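A hypothetical usage sketch of the builder after this change. It assumes EntityNode and _build_candidate_indexes are imported as in the test module changed by this commit, and the lowercase key mirrors the normalization the tests themselves assume; it is not runnable standalone.

# Hypothetical usage (imports assumed as in the test module changed by this commit).
nodes = [
    EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity']),
    EntityNode(name='Alice Zhang', group_id='group', labels=['Entity']),
]
indexes = _build_candidate_indexes(nodes)

# The new index: resolve a candidate UUID straight to its node, no list scan.
assert indexes.nodes_by_uuid[nodes[0].uuid] is nodes[0]

# The pre-existing structures are still populated in the same single pass.
assert indexes.normalized_existing['johnny appleseed'][0] is nodes[0]
assert nodes[0].uuid in indexes.shingles_by_candidate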
@@ -210,6 +214,9 @@ def _resolve_with_similarity(
             state.resolved_nodes[idx] = match
             state.uuid_map[node.uuid] = match.uuid
             continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
 
         shingles = _cached_shingles(normalized_fuzzy)
         signature = _minhash_signature(shingles)
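The behavioral change in this hunk: a single exact-name match still resolves immediately, while two or more exact matches are now deferred rather than guessed at. The toy function below is a standalone paraphrase (resolve_exact is a made-up name, not part of the module); the deferral target is the LLM pass named by the new test further down.

# Standalone paraphrase of the new deferral rule; resolve_exact is a hypothetical name.
def resolve_exact(idx, existing_matches, resolved_nodes, unresolved_indices):
    if len(existing_matches) == 1:
        resolved_nodes[idx] = existing_matches[0]   # unambiguous: resolve in place
        return True
    if len(existing_matches) > 1:
        unresolved_indices.append(idx)              # ambiguous: defer to the LLM pass
        return True
    return False                                    # no exact hit: try fuzzy similarity


resolved, unresolved = [None], []
assert resolve_exact(0, ['only'], resolved, unresolved) and resolved[0] == 'only'

resolved, unresolved = [None], []
assert resolve_exact(0, ['a', 'b'], resolved, unresolved) and unresolved == [0]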
@@ -224,10 +231,7 @@ def _resolve_with_similarity(
             score = _jaccard_similarity(shingles, candidate_shingles)
             if score > best_score:
                 best_score = score
-                best_candidate = next(
-                    (cand for cand in indexes.existing_nodes if cand.uuid == candidate_id),
-                    None,
-                )
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
 
         if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
             state.resolved_nodes[idx] = best_candidate
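The only functional change in this hunk is the lookup strategy: the removed next(...) call scanned existing_nodes on every fuzzy hit, while nodes_by_uuid.get() is a constant-time dict read with the same None fallback. A standalone toy, with plain dicts standing in for nodes:

# Plain dicts stand in for EntityNode objects; only the lookup pattern is the point here.
existing_nodes = [{'uuid': 'u1', 'name': 'Alice'}, {'uuid': 'u2', 'name': 'Bob'}]
nodes_by_uuid = {node['uuid']: node for node in existing_nodes}

candidate_id = 'u2'

# Removed approach: linear scan with a None default.
old = next((cand for cand in existing_nodes if cand['uuid'] == candidate_id), None)

# New approach: direct dict lookup, same None default for an unknown id.
new = nodes_by_uuid.get(candidate_id)

assert old is new is existing_nodes[1]
assert nodes_by_uuid.get('missing') is None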
@@ -199,6 +199,7 @@ def test_build_candidate_indexes_populates_structures():
 
     normalized_key = candidate.name.lower()
     assert indexes.normalized_existing[normalized_key][0].uuid == candidate.uuid
+    assert indexes.nodes_by_uuid[candidate.uuid] is candidate
     assert candidate.uuid in indexes.shingles_by_candidate
     assert any(candidate.uuid in bucket for bucket in indexes.lsh_buckets.values())
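The new assertion uses `is`, not `==`: the test pins down that the index stores the very EntityNode object it was given, so later resolutions refer to one canonical instance rather than a copy. A standalone illustration of that distinction:

node = {'uuid': 'u1', 'name': 'Alice'}   # stand-in for an EntityNode
index = {node['uuid']: node}

assert index['u1'] is node               # same object: what the new test requires
assert index['u1'] == dict(node)         # an equal copy would pass '=='...
assert index['u1'] is not dict(node)     # ...but not the identity check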
@@ -260,7 +261,27 @@ def test_resolve_with_similarity_exact_match_updates_state():
 
 def test_resolve_with_similarity_low_entropy_defers_resolution():
     extracted = EntityNode(name='Bob', group_id='group', labels=['Entity'])
-    indexes = DedupCandidateIndexes([], defaultdict(list), {}, defaultdict(list))
+    indexes = DedupCandidateIndexes(
+        existing_nodes=[],
+        nodes_by_uuid={},
+        normalized_existing=defaultdict(list),
+        shingles_by_candidate={},
+        lsh_buckets=defaultdict(list),
+    )
+    state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
+
+    _resolve_with_similarity([extracted], indexes, state)
+
+    assert state.resolved_nodes[0] is None
+    assert state.unresolved_indices == [0]
+
+
+def test_resolve_with_similarity_multiple_exact_matches_defers_to_llm():
+    candidate1 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    candidate2 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    extracted = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+
+    indexes = _build_candidate_indexes([candidate1, candidate2])
     state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
 
     _resolve_with_similarity([extracted], indexes, state)
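For completeness, the mutable state container these hunks thread through. This is a reconstruction from usage only: the three field names appear in the diff, while the decorator and the exact annotations are assumptions.

# Reconstructed from usage in this diff; decorator and exact annotations are assumed.
from dataclasses import dataclass


@dataclass
class DedupResolutionState:
    resolved_nodes: list            # per extracted node: the matched node, or None
    uuid_map: dict[str, str]        # extracted node uuid -> canonical/resolved uuid
    unresolved_indices: list[int]   # positions handed off to the LLM-based pass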