enhance deduplication helpers with UUID indexing and update resolution logic
This commit is contained in:
parent 859d7aee5e
commit 7cd9798977
2 changed files with 30 additions and 5 deletions
@@ -151,6 +151,7 @@ class DedupCandidateIndexes:
     """Precomputed lookup structures that drive entity deduplication heuristics."""
 
     existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
     normalized_existing: defaultdict[str, list[EntityNode]]
     shingles_by_candidate: dict[str, set[str]]
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
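For orientation, a minimal standalone sketch of the container as it reads after this hunk. The @dataclass decorator and the stand-in EntityNode below are assumptions; only the index field names and annotations come from the diff itself.

# Standalone sketch, not the library code: EntityNode here is a stand-in carrying only
# the fields this diff exercises (name, group_id, labels, uuid).
from collections import defaultdict
from dataclasses import dataclass, field
from uuid import uuid4


@dataclass
class EntityNode:  # stand-in for the real node class used by the dedup helpers
    name: str
    group_id: str
    labels: list[str]
    uuid: str = field(default_factory=lambda: uuid4().hex)


@dataclass
class DedupCandidateIndexes:
    """Precomputed lookup structures that drive entity deduplication heuristics."""

    existing_nodes: list[EntityNode]
    nodes_by_uuid: dict[str, EntityNode]  # new in this commit: O(1) node lookup by UUID
    normalized_existing: defaultdict[str, list[EntityNode]]
    shingles_by_candidate: dict[str, set[str]]
    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]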
@@ -168,12 +169,14 @@ class DedupResolutionState:
 def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
     """Precompute exact and fuzzy lookup structures once per dedupe run."""
     normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
     shingles_by_candidate: dict[str, set[str]] = {}
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
 
     for candidate in existing_nodes:
         normalized = _normalize_name_exact(candidate.name)
         normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
 
         shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
         shingles_by_candidate[candidate.uuid] = shingles
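The fuzzy side of the index rests on character shingles and Jaccard similarity. The helpers _normalize_name_for_fuzzy, _cached_shingles, and _jaccard_similarity are referenced but not shown in this diff, so the sketch below is an assumed, simplified reading of what they might do: trigram shingles over a lowercased name, cached with functools.lru_cache.

# Assumed, simplified versions of the fuzzy-matching helpers referenced above; the real
# implementations are not part of this diff.
from functools import lru_cache


def _normalize_name_for_fuzzy(name: str) -> str:
    # Lowercase and collapse whitespace so 'Johnny  Appleseed' matches 'johnny appleseed'.
    return ' '.join(name.lower().split())


@lru_cache(maxsize=4096)
def _cached_shingles(text: str, k: int = 3) -> frozenset[str]:
    # Character k-grams ("shingles"); very short names fall back to the whole string.
    if len(text) < k:
        return frozenset({text})
    return frozenset(text[i:i + k] for i in range(len(text) - k + 1))


def _jaccard_similarity(a: frozenset[str], b: frozenset[str]) -> float:
    # |A ∩ B| / |A ∪ B|, defined as 0.0 when both sets are empty.
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)


print(_jaccard_similarity(_cached_shingles('johnny appleseed'),
                          _cached_shingles('jonny appleseed')))  # ≈ 0.69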
@@ -184,6 +187,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
 
     return DedupCandidateIndexes(
         existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
         normalized_existing=normalized_existing,
         shingles_by_candidate=shingles_by_candidate,
         lsh_buckets=lsh_buckets,
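A hypothetical usage sketch of the builder after this change. It assumes EntityNode and _build_candidate_indexes are imported as in the test module changed by this commit, and the lowercase key mirrors the normalization the tests themselves assume; it is not runnable standalone.

# Hypothetical usage (imports assumed as in the test module changed by this commit).
nodes = [
    EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity']),
    EntityNode(name='Alice Zhang', group_id='group', labels=['Entity']),
]
indexes = _build_candidate_indexes(nodes)

# The new index: resolve a candidate UUID straight to its node, no list scan.
assert indexes.nodes_by_uuid[nodes[0].uuid] is nodes[0]

# The pre-existing structures are still populated in the same single pass.
assert indexes.normalized_existing['johnny appleseed'][0] is nodes[0]
assert nodes[0].uuid in indexes.shingles_by_candidate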
@@ -210,6 +214,9 @@ def _resolve_with_similarity(
             state.resolved_nodes[idx] = match
             state.uuid_map[node.uuid] = match.uuid
             continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
 
         shingles = _cached_shingles(normalized_fuzzy)
         signature = _minhash_signature(shingles)
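The behavioral change in this hunk: a single exact-name match still resolves immediately, while two or more exact matches are now deferred rather than guessed at. The toy function below is a standalone paraphrase (resolve_exact is a made-up name, not part of the module); the deferral target is the LLM pass named by the new test further down.

# Standalone paraphrase of the new deferral rule; resolve_exact is a hypothetical name.
def resolve_exact(idx, existing_matches, resolved_nodes, unresolved_indices):
    if len(existing_matches) == 1:
        resolved_nodes[idx] = existing_matches[0]   # unambiguous: resolve in place
        return True
    if len(existing_matches) > 1:
        unresolved_indices.append(idx)              # ambiguous: defer to the LLM pass
        return True
    return False                                    # no exact hit: try fuzzy similarity


resolved, unresolved = [None], []
assert resolve_exact(0, ['only'], resolved, unresolved) and resolved[0] == 'only'

resolved, unresolved = [None], []
assert resolve_exact(0, ['a', 'b'], resolved, unresolved) and unresolved == [0]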
@@ -224,10 +231,7 @@ def _resolve_with_similarity(
             score = _jaccard_similarity(shingles, candidate_shingles)
             if score > best_score:
                 best_score = score
-                best_candidate = next(
-                    (cand for cand in indexes.existing_nodes if cand.uuid == candidate_id),
-                    None,
-                )
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
 
         if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
             state.resolved_nodes[idx] = best_candidate
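The only functional change in this hunk is the lookup strategy: the removed next(...) call scanned existing_nodes on every fuzzy hit, while nodes_by_uuid.get() is a constant-time dict read with the same None fallback. A standalone toy, with plain dicts standing in for nodes:

# Plain dicts stand in for EntityNode objects; only the lookup pattern is the point here.
existing_nodes = [{'uuid': 'u1', 'name': 'Alice'}, {'uuid': 'u2', 'name': 'Bob'}]
nodes_by_uuid = {node['uuid']: node for node in existing_nodes}

candidate_id = 'u2'

# Removed approach: linear scan with a None default.
old = next((cand for cand in existing_nodes if cand['uuid'] == candidate_id), None)

# New approach: direct dict lookup, same None default for an unknown id.
new = nodes_by_uuid.get(candidate_id)

assert old is new is existing_nodes[1]
assert nodes_by_uuid.get('missing') is None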
@@ -199,6 +199,7 @@ def test_build_candidate_indexes_populates_structures():
 
     normalized_key = candidate.name.lower()
     assert indexes.normalized_existing[normalized_key][0].uuid == candidate.uuid
+    assert indexes.nodes_by_uuid[candidate.uuid] is candidate
     assert candidate.uuid in indexes.shingles_by_candidate
     assert any(candidate.uuid in bucket for bucket in indexes.lsh_buckets.values())
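The new assertion uses `is`, not `==`: the test pins down that the index stores the very EntityNode object it was given, so later resolutions refer to one canonical instance rather than a copy. A standalone illustration of that distinction:

node = {'uuid': 'u1', 'name': 'Alice'}   # stand-in for an EntityNode
index = {node['uuid']: node}

assert index['u1'] is node               # same object: what the new test requires
assert index['u1'] == dict(node)         # an equal copy would pass '=='...
assert index['u1'] is not dict(node)     # ...but not the identity check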
@@ -260,7 +261,27 @@ def test_resolve_with_similarity_exact_match_updates_state():
 
 def test_resolve_with_similarity_low_entropy_defers_resolution():
     extracted = EntityNode(name='Bob', group_id='group', labels=['Entity'])
-    indexes = DedupCandidateIndexes([], defaultdict(list), {}, defaultdict(list))
+    indexes = DedupCandidateIndexes(
+        existing_nodes=[],
+        nodes_by_uuid={},
+        normalized_existing=defaultdict(list),
+        shingles_by_candidate={},
+        lsh_buckets=defaultdict(list),
+    )
+    state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
+
+    _resolve_with_similarity([extracted], indexes, state)
+
+    assert state.resolved_nodes[0] is None
+    assert state.unresolved_indices == [0]
+
+
+def test_resolve_with_similarity_multiple_exact_matches_defers_to_llm():
+    candidate1 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    candidate2 = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+    extracted = EntityNode(name='Johnny Appleseed', group_id='group', labels=['Entity'])
+
+    indexes = _build_candidate_indexes([candidate1, candidate2])
     state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
 
     _resolve_with_similarity([extracted], indexes, state)
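For completeness, the mutable state container these hunks thread through. This is a reconstruction from usage only: the three field names appear in the diff, while the decorator and the exact annotations are assumptions.

# Reconstructed from usage in this diff; decorator and exact annotations are assumed.
from dataclasses import dataclass


@dataclass
class DedupResolutionState:
    resolved_nodes: list            # per extracted node: the matched node, or None
    uuid_map: dict[str, str]        # extracted node uuid -> canonical/resolved uuid
    unresolved_indices: list[int]   # positions handed off to the LLM-based pass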