implement exact fact matching

This commit is contained in:
Daniel Chalef 2025-09-24 22:47:42 -07:00
parent 7cd9798977
commit ced05d089c
4 changed files with 71 additions and 7 deletions

View file

@ -36,7 +36,7 @@ _MINHASH_PERMUTATIONS = 32
_MINHASH_BAND_SIZE = 4
def _normalize_name_exact(name: str) -> str:
def _normalize_string_exact(name: str) -> str:
"""Lowercase text and collapse whitespace so equal names map to the same key."""
normalized = re.sub(r'[\s]+', ' ', name.lower())
return normalized.strip()
@ -44,7 +44,7 @@ def _normalize_name_exact(name: str) -> str:
def _normalize_name_for_fuzzy(name: str) -> str:
    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles.

    Args:
        name: Raw name to normalize.

    Returns:
        The exact-normalized name with every character other than ``a-z``,
        ``0-9``, apostrophe, or space replaced by a space, then trimmed and
        with whitespace runs collapsed to a single space.
    """
    # Start from the lowercase, single-spaced form so the character class
    # below only has to deal with lowercase ASCII letters.
    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
    normalized = normalized.strip()
    # Replacing punctuation with spaces can create new runs of spaces
    # (e.g. "a - b"), so collapse whitespace once more.
    return re.sub(r'[\s]+', ' ', normalized)
@ -174,7 +174,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidate
lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
for candidate in existing_nodes:
normalized = _normalize_name_exact(candidate.name)
normalized = _normalize_string_exact(candidate.name)
normalized_existing[normalized].append(candidate)
nodes_by_uuid[candidate.uuid] = candidate
@ -201,7 +201,7 @@ def _resolve_with_similarity(
) -> None:
"""Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
for idx, node in enumerate(extracted_nodes):
normalized_exact = _normalize_name_exact(node.name)
normalized_exact = _normalize_string_exact(node.name)
normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
if not _has_high_entropy(normalized_fuzzy):
@ -244,7 +244,7 @@ def _resolve_with_similarity(
__all__ = [
'DedupCandidateIndexes',
'DedupResolutionState',
'_normalize_name_exact',
'_normalize_string_exact',
'_normalize_name_for_fuzzy',
'_has_high_entropy',
'_minhash_signature',

View file

@ -41,6 +41,7 @@ from graphiti_core.search.search_config import SearchResults
from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
from graphiti_core.search.search_filters import SearchFilters
from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
logger = logging.getLogger(__name__)
@ -397,6 +398,19 @@ async def resolve_extracted_edge(
if len(related_edges) == 0 and len(existing_edges) == 0:
return extracted_edge, [], []
# Fast path: if the fact text and endpoints already exist verbatim, reuse the matching edge.
normalized_fact = _normalize_string_exact(extracted_edge.fact)
for edge in related_edges:
if (
edge.source_node_uuid == extracted_edge.source_node_uuid
and edge.target_node_uuid == extracted_edge.target_node_uuid
and _normalize_string_exact(edge.fact) == normalized_fact
):
resolved = edge
if episode is not None and episode.uuid not in resolved.episodes:
resolved.episodes.append(episode.uuid)
return resolved, [], []
start = time()
# Prepare context for LLM

View file

@ -5,6 +5,7 @@ import pytest
from graphiti_core.edges import EntityEdge
from graphiti_core.nodes import EpisodicNode
from graphiti_core.utils.maintenance.edge_operations import resolve_extracted_edge
@pytest.fixture
@ -92,3 +93,52 @@ def mock_previous_episodes():
# Run the tests
if __name__ == '__main__':
pytest.main([__file__])
@pytest.mark.asyncio
async def test_resolve_extracted_edge_exact_fact_short_circuit(
    mock_llm_client,
    mock_existing_edges,
    mock_current_episode,
):
    """An edge whose endpoints and normalized fact already exist is reused as-is.

    The related edge's fact differs from the extracted one only in letter case
    and surrounding whitespace, so resolution should return that existing edge,
    record the current episode on it exactly once, report no duplicates or
    invalidations, and never call the LLM client.
    """
    incoming_edge = EntityEdge(
        source_node_uuid='source_uuid',
        target_node_uuid='target_uuid',
        name='test_edge',
        group_id='group_1',
        fact='Related fact',
        episodes=['episode_1'],
        created_at=datetime.now(timezone.utc),
        valid_at=None,
        invalid_at=None,
    )

    # Same endpoints as the incoming edge; the fact matches only after
    # lowercasing and whitespace normalization.
    matching_edge = EntityEdge(
        source_node_uuid='source_uuid',
        target_node_uuid='target_uuid',
        name='related_edge',
        group_id='group_1',
        fact=' related FACT ',
        episodes=['episode_2'],
        created_at=datetime.now(timezone.utc) - timedelta(days=1),
        valid_at=None,
        invalid_at=None,
    )
    candidates = [matching_edge]

    outcome = await resolve_extracted_edge(
        mock_llm_client,
        incoming_edge,
        candidates,
        mock_existing_edges,
        mock_current_episode,
        edge_types=None,
        ensure_ascii=True,
    )
    resolved_edge, duplicate_edges, invalidated = outcome

    # Identity, not equality: the existing edge object itself must come back.
    assert resolved_edge is candidates[0]
    assert resolved_edge.episodes.count(mock_current_episode.uuid) == 1
    assert duplicate_edges == []
    assert invalidated == []
    mock_llm_client.generate_response.assert_not_called()

View file

@ -18,8 +18,8 @@ from graphiti_core.utils.maintenance.dedup_helpers import (
_lsh_bands,
_minhash_signature,
_name_entropy,
_normalize_name_exact,
_normalize_name_for_fuzzy,
_normalize_string_exact,
_resolve_with_similarity,
_shingles,
)
@ -205,7 +205,7 @@ def test_build_candidate_indexes_populates_structures():
def test_normalize_helpers():
    """Check the exact and fuzzy normalizers on representative inputs."""
    # Exact normalization: lowercase, with surrounding whitespace trimmed.
    assert _normalize_string_exact(' Alice Smith ') == 'alice smith'
    # Fuzzy normalization: punctuation is replaced by spaces before collapsing.
    assert _normalize_name_for_fuzzy('Alice-Smith!') == 'alice smith'