implement exact fact matching

2025-09-24 22:47:42 -07:00 · 2025-09-24 22:47:42 -07:00 · ced05d089c
commit ced05d089c
parent 7cd9798977
4 changed files with 71 additions and 7 deletions
--- a/graphiti_core/utils/maintenance/dedup_helpers.py
+++ b/graphiti_core/utils/maintenance/dedup_helpers.py
@ -36,7 +36,7 @@ _MINHASH_PERMUTATIONS = 32
 _MINHASH_BAND_SIZE = 4
-def _normalize_name_exact(name: str) -> str:
+def _normalize_string_exact(name: str) -> str:
    """Lowercase text and collapse whitespace so equal strings (e.g. names or facts) map to the same key."""
    normalized = re.sub(r'[\s]+', ' ', name.lower())
    return normalized.strip()
@ -44,7 +44,7 @@ def _normalize_name_exact(name: str) -> str:
 def _normalize_name_for_fuzzy(name: str) -> str:
    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
-    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_name_exact(name))
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
    normalized = normalized.strip()
    return re.sub(r'[\s]+', ' ', normalized)
@ -174,7 +174,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidate
    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
    for candidate in existing_nodes:
-        normalized = _normalize_name_exact(candidate.name)
+        normalized = _normalize_string_exact(candidate.name)
        normalized_existing[normalized].append(candidate)
        nodes_by_uuid[candidate.uuid] = candidate
@ -201,7 +201,7 @@ def _resolve_with_similarity(
 ) -> None:
    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
    for idx, node in enumerate(extracted_nodes):
-        normalized_exact = _normalize_name_exact(node.name)
+        normalized_exact = _normalize_string_exact(node.name)
        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
        if not _has_high_entropy(normalized_fuzzy):
@ -244,7 +244,7 @@ def _resolve_with_similarity(
 __all__ = [
    'DedupCandidateIndexes',
    'DedupResolutionState',
-    '_normalize_name_exact',
+    '_normalize_string_exact',
    '_normalize_name_for_fuzzy',
    '_has_high_entropy',
    '_minhash_signature',
--- a/graphiti_core/utils/maintenance/edge_operations.py
+++ b/graphiti_core/utils/maintenance/edge_operations.py
@ -41,6 +41,7 @@ from graphiti_core.search.search_config import SearchResults
 from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
 from graphiti_core.search.search_filters import SearchFilters
 from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
 from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
 logger = logging.getLogger(__name__)
@ -397,6 +398,19 @@ async def resolve_extracted_edge(
    if len(related_edges) == 0 and len(existing_edges) == 0:
        return extracted_edge, [], []
    # Fast path: if a related edge has the same endpoints and the same fact text after
    # exact normalization (lowercased, whitespace collapsed and trimmed), reuse that edge.
    normalized_fact = _normalize_string_exact(extracted_edge.fact)
    for edge in related_edges:
        if (
            edge.source_node_uuid == extracted_edge.source_node_uuid
            and edge.target_node_uuid == extracted_edge.target_node_uuid
            and _normalize_string_exact(edge.fact) == normalized_fact
        ):
            resolved = edge
            if episode is not None and episode.uuid not in resolved.episodes:
                resolved.episodes.append(episode.uuid)
            return resolved, [], []
    start = time()
    # Prepare context for LLM
--- a/tests/utils/maintenance/test_edge_operations.py
+++ b/tests/utils/maintenance/test_edge_operations.py
@ -5,6 +5,7 @@ import pytest
 from graphiti_core.edges import EntityEdge
 from graphiti_core.nodes import EpisodicNode
 from graphiti_core.utils.maintenance.edge_operations import resolve_extracted_edge
@pytest.fixture
@ -92,3 +93,52 @@ def mock_previous_episodes():
 # Run the tests
 if __name__ == '__main__':
    pytest.main([__file__])
@pytest.mark.asyncio
 async def test_resolve_extracted_edge_exact_fact_short_circuit(
    mock_llm_client,
    mock_existing_edges,
    mock_current_episode,
 ):
    extracted = EntityEdge(
        source_node_uuid='source_uuid',
        target_node_uuid='target_uuid',
        name='test_edge',
        group_id='group_1',
        fact='Related fact',
        episodes=['episode_1'],
        created_at=datetime.now(timezone.utc),
        valid_at=None,
        invalid_at=None,
    )
    related_edges = [
        EntityEdge(
            source_node_uuid='source_uuid',
            target_node_uuid='target_uuid',
            name='related_edge',
            group_id='group_1',
            fact=' related FACT  ',
            episodes=['episode_2'],
            created_at=datetime.now(timezone.utc) - timedelta(days=1),
            valid_at=None,
            invalid_at=None,
        )
    ]
    resolved_edge, duplicate_edges, invalidated = await resolve_extracted_edge(
        mock_llm_client,
        extracted,
        related_edges,
        mock_existing_edges,
        mock_current_episode,
        edge_types=None,
        ensure_ascii=True,
    )
    assert resolved_edge is related_edges[0]
    assert resolved_edge.episodes.count(mock_current_episode.uuid) == 1
    assert duplicate_edges == []
    assert invalidated == []
    mock_llm_client.generate_response.assert_not_called()
--- a/tests/utils/maintenance/test_node_operations.py
+++ b/tests/utils/maintenance/test_node_operations.py
@ -18,8 +18,8 @@ from graphiti_core.utils.maintenance.dedup_helpers import (
    _lsh_bands,
    _minhash_signature,
    _name_entropy,
    _normalize_name_exact,
    _normalize_name_for_fuzzy,
    _normalize_string_exact,
    _resolve_with_similarity,
    _shingles,
 )
@ -205,7 +205,7 @@ def test_build_candidate_indexes_populates_structures():
 def test_normalize_helpers():
-    assert _normalize_name_exact('  Alice   Smith ') == 'alice smith'
+    assert _normalize_string_exact('  Alice   Smith ') == 'alice smith'
    assert _normalize_name_for_fuzzy('Alice-Smith!') == 'alice smith'