enhance deduplication helpers with UUID indexing and update resolution logic

Daniel Chalef 2025-09-24 21:56:28 -07:00
parent 859d7aee5e
commit 7cd9798977
2 changed files with 30 additions and 5 deletions
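In short, the change precomputes a UUID-to-node index so the fuzzy-match winner can be fetched with a dict lookup instead of rescanning existing_nodes, and it now defers ambiguous exact-name matches (two or more existing nodes sharing the same normalized name) to the LLM resolution pass. A standalone sketch of the lookup change, using an illustrative Node stand-in rather than the library's EntityNode:

from dataclasses import dataclass


@dataclass
class Node:
    # Illustrative stand-in for EntityNode; only the fields needed here.
    uuid: str
    name: str


existing_nodes = [Node(uuid=f'uuid-{i}', name=f'entity {i}') for i in range(1000)]
candidate_id = 'uuid-742'

# Before: each fuzzy-match winner triggered a linear scan over existing_nodes.
best_candidate = next((cand for cand in existing_nodes if cand.uuid == candidate_id), None)

# After: candidates are indexed by UUID once per dedupe run, so the same lookup is a dict get.
nodes_by_uuid = {node.uuid: node for node in existing_nodes}
best_candidate = nodes_by_uuid.get(candidate_id)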


@@ -151,6 +151,7 @@ class DedupCandidateIndexes:
     """Precomputed lookup structures that drive entity deduplication heuristics."""
     existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
     normalized_existing: defaultdict[str, list[EntityNode]]
     shingles_by_candidate: dict[str, set[str]]
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
@@ -168,12 +169,14 @@ class DedupResolutionState:
 def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
     """Precompute exact and fuzzy lookup structures once per dedupe run."""
     normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
     shingles_by_candidate: dict[str, set[str]] = {}
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
     for candidate in existing_nodes:
         normalized = _normalize_name_exact(candidate.name)
         normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
         shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
         shingles_by_candidate[candidate.uuid] = shingles
@@ -184,6 +187,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidate
     return DedupCandidateIndexes(
         existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
         normalized_existing=normalized_existing,
         shingles_by_candidate=shingles_by_candidate,
         lsh_buckets=lsh_buckets,
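A hedged usage sketch of the invariant the new field adds: once _build_candidate_indexes has run, every candidate should be reachable both through its normalized name bucket and through its UUID. Constructor arguments mirror the tests below; imports are elided because the module paths are not shown in this diff.

candidates = [
    EntityNode(name='Ada Lovelace', group_id='group', labels=['Entity']),
    EntityNode(name='Alan Turing', group_id='group', labels=['Entity']),
]
indexes = _build_candidate_indexes(candidates)

for candidate in candidates:
    # The exact-name bucket and the new UUID index point at the same object.
    assert candidate in indexes.normalized_existing[candidate.name.lower()]
    assert indexes.nodes_by_uuid[candidate.uuid] is candidate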
@@ -210,6 +214,9 @@ def _resolve_with_similarity(
             state.resolved_nodes[idx] = match
             state.uuid_map[node.uuid] = match.uuid
             continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
         shingles = _cached_shingles(normalized_fuzzy)
         signature = _minhash_signature(shingles)
@@ -224,10 +231,7 @@ def _resolve_with_similarity(
             score = _jaccard_similarity(shingles, candidate_shingles)
             if score > best_score:
                 best_score = score
-                best_candidate = next(
-                    (cand for cand in indexes.existing_nodes if cand.uuid == candidate_id),
-                    None,
-                )
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
         if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
             state.resolved_nodes[idx] = best_candidate
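Read together, the resolution path now applies three rules: a single exact-name match resolves immediately, more than one exact match is deferred to the LLM pass, and otherwise the best Jaccard score across the MinHash/LSH candidates must clear _FUZZY_JACCARD_THRESHOLD. A standalone sketch of that decision flow; the helper names and the threshold value here are illustrative, not the library's:

FUZZY_JACCARD_THRESHOLD = 0.9  # illustrative value; the real constant is not shown in this diff


def jaccard(a: set[str], b: set[str]) -> float:
    return len(a & b) / len(a | b) if (a or b) else 0.0


def resolve_one(
    exact_matches: list[str],
    shingles: set[str],
    candidate_shingles: dict[str, set[str]],
) -> str | None:
    """Return a resolved candidate id, or None to defer to the LLM pass."""
    if len(exact_matches) == 1:
        return exact_matches[0]  # unambiguous exact match resolves immediately
    if len(exact_matches) > 1:
        return None  # ambiguous exact matches are deferred
    best_id, best_score = None, 0.0
    for candidate_id, cand_shingles in candidate_shingles.items():
        score = jaccard(shingles, cand_shingles)
        if score > best_score:
            best_id, best_score = candidate_id, score
    return best_id if best_score >= FUZZY_JACCARD_THRESHOLD else None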


@@ -199,6 +199,7 @@ def test_build_candidate_indexes_populates_structures():
     normalized_key = candidate.name.lower()
     assert indexes.normalized_existing[normalized_key][0].uuid == candidate.uuid
+    assert indexes.nodes_by_uuid[candidate.uuid] is candidate
     assert candidate.uuid in indexes.shingles_by_candidate
     assert any(candidate.uuid in bucket for bucket in indexes.lsh_buckets.values())
@@ -260,7 +261,27 @@ def test_resolve_with_similarity_exact_match_updates_state():
 def test_resolve_with_similarity_low_entropy_defers_resolution():
     extracted = EntityNode(name='Bob', group_id='group', labels=['Entity'])
-    indexes = DedupCandidateIndexes([], defaultdict(list), {}, defaultdict(list))
+    indexes = DedupCandidateIndexes(
+        existing_nodes=[],
+        nodes_by_uuid={},
+        normalized_existing=defaultdict(list),
+        shingles_by_candidate={},
+        lsh_buckets=defaultdict(list),
+    )
     state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
     _resolve_with_similarity([extracted], indexes, state)
     assert state.resolved_nodes[0] is None
     assert state.unresolved_indices == [0]
+
+
+def test_resolve_with_similarity_multiple_exact_matches_defers_to_llm():
+    candidate1 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    candidate2 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    extracted = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    indexes = _build_candidate_indexes([candidate1, candidate2])
+    state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
+    _resolve_with_similarity([extracted], indexes, state)
+    assert state.resolved_nodes[0] is None
+    assert state.unresolved_indices == [0]
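The new test stops at the deferral itself; downstream, a caller would hand state.unresolved_indices to the LLM-based resolver. A hypothetical consumption sketch, where resolve_with_llm is an illustrative stand-in rather than an API from this diff:

async def finish_resolution(extracted_nodes, indexes, state, resolve_with_llm):
    # resolve_with_llm is a hypothetical async callable; the real resolver is not shown here.
    for idx in state.unresolved_indices:
        node = extracted_nodes[idx]
        match = await resolve_with_llm(node, indexes.existing_nodes)
        if match is not None:
            state.resolved_nodes[idx] = match
            state.uuid_map[node.uuid] = match.uuid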