implement exact fact matching

2025-09-24 22:47:42 -07:00 · 2025-09-24 22:47:42 -07:00 · ced05d089c
commit ced05d089c
parent 7cd9798977
4 changed files with 71 additions and 7 deletions
--- a/graphiti_core/utils/maintenance/dedup_helpers.py
+++ b/graphiti_core/utils/maintenance/dedup_helpers.py
@@ -36,7 +36,7 @@ _MINHASH_PERMUTATIONS = 32
 _MINHASH_BAND_SIZE = 4


-def _normalize_name_exact(name: str) -> str:
+def _normalize_string_exact(name: str) -> str:
    """Lowercase text and collapse whitespace so equal names map to the same key."""
    normalized = re.sub(r'[\s]+', ' ', name.lower())
    return normalized.strip()
@@ -44,7 +44,7 @@ def _normalize_name_exact(name: str) -> str:

 def _normalize_name_for_fuzzy(name: str) -> str:
    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
-    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_name_exact(name))
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
    normalized = normalized.strip()
    return re.sub(r'[\s]+', ' ', normalized)

@@ -174,7 +174,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidate
    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)

    for candidate in existing_nodes:
-        normalized = _normalize_name_exact(candidate.name)
+        normalized = _normalize_string_exact(candidate.name)
        normalized_existing[normalized].append(candidate)
        nodes_by_uuid[candidate.uuid] = candidate

@@ -201,7 +201,7 @@ def _resolve_with_similarity(
 ) -> None:
    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
    for idx, node in enumerate(extracted_nodes):
-        normalized_exact = _normalize_name_exact(node.name)
+        normalized_exact = _normalize_string_exact(node.name)
        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)

        if not _has_high_entropy(normalized_fuzzy):
@@ -244,7 +244,7 @@ def _resolve_with_similarity(
 __all__ = [
    'DedupCandidateIndexes',
    'DedupResolutionState',
-    '_normalize_name_exact',
+    '_normalize_string_exact',
    '_normalize_name_for_fuzzy',
    '_has_high_entropy',
    '_minhash_signature',
--- a/graphiti_core/utils/maintenance/edge_operations.py
+++ b/graphiti_core/utils/maintenance/edge_operations.py
@@ -41,6 +41,7 @@ from graphiti_core.search.search_config import SearchResults
 from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
 from graphiti_core.search.search_filters import SearchFilters
 from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
+from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact

 logger = logging.getLogger(__name__)

@@ -397,6 +398,19 @@ async def resolve_extracted_edge(
    if len(related_edges) == 0 and len(existing_edges) == 0:
        return extracted_edge, [], []

+    # Fast path: if the fact text and endpoints already exist verbatim, reuse the matching edge.
+    normalized_fact = _normalize_string_exact(extracted_edge.fact)
+    for edge in related_edges:
+        if (
+            edge.source_node_uuid == extracted_edge.source_node_uuid
+            and edge.target_node_uuid == extracted_edge.target_node_uuid
+            and _normalize_string_exact(edge.fact) == normalized_fact
+        ):
+            resolved = edge
+            if episode is not None and episode.uuid not in resolved.episodes:
+                resolved.episodes.append(episode.uuid)
+            return resolved, [], []
+
    start = time()

    # Prepare context for LLM
--- a/tests/utils/maintenance/test_edge_operations.py
+++ b/tests/utils/maintenance/test_edge_operations.py
@@ -5,6 +5,7 @@ import pytest

 from graphiti_core.edges import EntityEdge
 from graphiti_core.nodes import EpisodicNode
+from graphiti_core.utils.maintenance.edge_operations import resolve_extracted_edge


@pytest.fixture
@@ -92,3 +93,52 @@ def mock_previous_episodes():
 # Run the tests
 if __name__ == '__main__':
    pytest.main([__file__])
+
+
+@pytest.mark.asyncio
+async def test_resolve_extracted_edge_exact_fact_short_circuit(
+    mock_llm_client,
+    mock_existing_edges,
+    mock_current_episode,
+):
+    extracted = EntityEdge(
+        source_node_uuid='source_uuid',
+        target_node_uuid='target_uuid',
+        name='test_edge',
+        group_id='group_1',
+        fact='Related fact',
+        episodes=['episode_1'],
+        created_at=datetime.now(timezone.utc),
+        valid_at=None,
+        invalid_at=None,
+    )
+
+    related_edges = [
+        EntityEdge(
+            source_node_uuid='source_uuid',
+            target_node_uuid='target_uuid',
+            name='related_edge',
+            group_id='group_1',
+            fact=' related FACT  ',
+            episodes=['episode_2'],
+            created_at=datetime.now(timezone.utc) - timedelta(days=1),
+            valid_at=None,
+            invalid_at=None,
+        )
+    ]
+
+    resolved_edge, duplicate_edges, invalidated = await resolve_extracted_edge(
+        mock_llm_client,
+        extracted,
+        related_edges,
+        mock_existing_edges,
+        mock_current_episode,
+        edge_types=None,
+        ensure_ascii=True,
+    )
+
+    assert resolved_edge is related_edges[0]
+    assert resolved_edge.episodes.count(mock_current_episode.uuid) == 1
+    assert duplicate_edges == []
+    assert invalidated == []
+    mock_llm_client.generate_response.assert_not_called()
--- a/tests/utils/maintenance/test_node_operations.py
+++ b/tests/utils/maintenance/test_node_operations.py
@@ -18,8 +18,8 @@ from graphiti_core.utils.maintenance.dedup_helpers import (
    _lsh_bands,
    _minhash_signature,
    _name_entropy,
-    _normalize_name_exact,
    _normalize_name_for_fuzzy,
+    _normalize_string_exact,
    _resolve_with_similarity,
    _shingles,
 )
@@ -205,7 +205,7 @@ def test_build_candidate_indexes_populates_structures():


 def test_normalize_helpers():
-    assert _normalize_name_exact('  Alice   Smith ') == 'alice smith'
+    assert _normalize_string_exact('  Alice   Smith ') == 'alice smith'
    assert _normalize_name_for_fuzzy('Alice-Smith!') == 'alice smith'