Enhance deduplication helpers with UUID indexing and update resolution logic
This commit is contained in:
parent
859d7aee5e
commit
7cd9798977
2 changed files with 30 additions and 5 deletions
|
|
@@ -151,6 +151,7 @@ class DedupCandidateIndexes:
|
|||
"""Precomputed lookup structures that drive entity deduplication heuristics."""
|
||||
|
||||
existing_nodes: list[EntityNode]
|
||||
nodes_by_uuid: dict[str, EntityNode]
|
||||
normalized_existing: defaultdict[str, list[EntityNode]]
|
||||
shingles_by_candidate: dict[str, set[str]]
|
||||
lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
|
||||
|
|
@@ -168,12 +169,14 @@ class DedupResolutionState:
|
|||
def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
|
||||
"""Precompute exact and fuzzy lookup structures once per dedupe run."""
|
||||
normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
|
||||
nodes_by_uuid: dict[str, EntityNode] = {}
|
||||
shingles_by_candidate: dict[str, set[str]] = {}
|
||||
lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
|
||||
|
||||
for candidate in existing_nodes:
|
||||
normalized = _normalize_name_exact(candidate.name)
|
||||
normalized_existing[normalized].append(candidate)
|
||||
nodes_by_uuid[candidate.uuid] = candidate
|
||||
|
||||
shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
|
||||
shingles_by_candidate[candidate.uuid] = shingles
|
||||
|
|
@@ -184,6 +187,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidate
|
|||
|
||||
return DedupCandidateIndexes(
|
||||
existing_nodes=existing_nodes,
|
||||
nodes_by_uuid=nodes_by_uuid,
|
||||
normalized_existing=normalized_existing,
|
||||
shingles_by_candidate=shingles_by_candidate,
|
||||
lsh_buckets=lsh_buckets,
|
||||
|
|
@@ -210,6 +214,9 @@ def _resolve_with_similarity(
|
|||
state.resolved_nodes[idx] = match
|
||||
state.uuid_map[node.uuid] = match.uuid
|
||||
continue
|
||||
if len(existing_matches) > 1:
|
||||
state.unresolved_indices.append(idx)
|
||||
continue
|
||||
|
||||
shingles = _cached_shingles(normalized_fuzzy)
|
||||
signature = _minhash_signature(shingles)
|
||||
|
|
@@ -224,10 +231,7 @@ def _resolve_with_similarity(
|
|||
score = _jaccard_similarity(shingles, candidate_shingles)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_candidate = next(
|
||||
(cand for cand in indexes.existing_nodes if cand.uuid == candidate_id),
|
||||
None,
|
||||
)
|
||||
best_candidate = indexes.nodes_by_uuid.get(candidate_id)
|
||||
|
||||
if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
|
||||
state.resolved_nodes[idx] = best_candidate
|
||||
|
|
|
|||
|
|
@@ -199,6 +199,7 @@ def test_build_candidate_indexes_populates_structures():
|
|||
|
||||
normalized_key = candidate.name.lower()
|
||||
assert indexes.normalized_existing[normalized_key][0].uuid == candidate.uuid
|
||||
assert indexes.nodes_by_uuid[candidate.uuid] is candidate
|
||||
assert candidate.uuid in indexes.shingles_by_candidate
|
||||
assert any(candidate.uuid in bucket for bucket in indexes.lsh_buckets.values())
|
||||
|
||||
|
|
@@ -260,7 +261,27 @@ def test_resolve_with_similarity_exact_match_updates_state():
|
|||
|
||||
def test_resolve_with_similarity_low_entropy_defers_resolution():
    """A name with no candidates to match against is left unresolved.

    With empty candidate indexes there is no exact or fuzzy match, so the
    similarity pass must defer the node (index 0) to the later LLM-based
    resolution step rather than resolving it to anything.
    """
    extracted = EntityNode(name='Bob', group_id='group', labels=['Entity'])
    # Removed the stale positional-args construction (4 args) that was
    # immediately overwritten — DedupCandidateIndexes now has 5 fields
    # (it gained nodes_by_uuid), so keyword construction is kept.
    indexes = DedupCandidateIndexes(
        existing_nodes=[],
        nodes_by_uuid={},
        normalized_existing=defaultdict(list),
        shingles_by_candidate={},
        lsh_buckets=defaultdict(list),
    )
    state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])

    _resolve_with_similarity([extracted], indexes, state)

    # Nothing resolved; the single extracted node is deferred for the LLM pass.
    assert state.resolved_nodes[0] is None
    assert state.unresolved_indices == [0]
|
||||
|
||||
|
||||
def test_resolve_with_similarity_multiple_exact_matches_defers_to_llm():
|
||||
candidate1 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
|
||||
candidate2 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
|
||||
extracted = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
|
||||
|
||||
indexes = _build_candidate_indexes([candidate1, candidate2])
|
||||
state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
|
||||
|
||||
_resolve_with_similarity([extracted], indexes, state)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue