diff --git a/graphiti_core/utils/maintenance/dedup_helpers.py b/graphiti_core/utils/maintenance/dedup_helpers.py index 3c08512e..4916331e 100644 --- a/graphiti_core/utils/maintenance/dedup_helpers.py +++ b/graphiti_core/utils/maintenance/dedup_helpers.py @@ -36,7 +36,7 @@ _MINHASH_PERMUTATIONS = 32 _MINHASH_BAND_SIZE = 4 -def _normalize_name_exact(name: str) -> str: +def _normalize_string_exact(name: str) -> str: """Lowercase text and collapse whitespace so equal names map to the same key.""" normalized = re.sub(r'[\s]+', ' ', name.lower()) return normalized.strip() @@ -44,7 +44,7 @@ def _normalize_name_exact(name: str) -> str: def _normalize_name_for_fuzzy(name: str) -> str: """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles.""" - normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_name_exact(name)) + normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name)) normalized = normalized.strip() return re.sub(r'[\s]+', ' ', normalized) @@ -174,7 +174,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidate lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list) for candidate in existing_nodes: - normalized = _normalize_name_exact(candidate.name) + normalized = _normalize_string_exact(candidate.name) normalized_existing[normalized].append(candidate) nodes_by_uuid[candidate.uuid] = candidate @@ -201,7 +201,7 @@ def _resolve_with_similarity( ) -> None: """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons.""" for idx, node in enumerate(extracted_nodes): - normalized_exact = _normalize_name_exact(node.name) + normalized_exact = _normalize_string_exact(node.name) normalized_fuzzy = _normalize_name_for_fuzzy(node.name) if not _has_high_entropy(normalized_fuzzy): @@ -244,7 +244,7 @@ def _resolve_with_similarity( __all__ = [ 'DedupCandidateIndexes', 'DedupResolutionState', - '_normalize_name_exact', + '_normalize_string_exact', '_normalize_name_for_fuzzy', '_has_high_entropy', '_minhash_signature', diff --git a/graphiti_core/utils/maintenance/edge_operations.py b/graphiti_core/utils/maintenance/edge_operations.py index 259c1db3..4069a0bd 100644 --- a/graphiti_core/utils/maintenance/edge_operations.py +++ b/graphiti_core/utils/maintenance/edge_operations.py @@ -41,6 +41,7 @@ from graphiti_core.search.search_config import SearchResults from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF from graphiti_core.search.search_filters import SearchFilters from graphiti_core.utils.datetime_utils import ensure_utc, utc_now +from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact logger = logging.getLogger(__name__) @@ -397,6 +398,19 @@ async def resolve_extracted_edge( if len(related_edges) == 0 and len(existing_edges) == 0: return extracted_edge, [], [] + # Fast path: if the fact text and endpoints already exist verbatim, reuse the matching edge. + normalized_fact = _normalize_string_exact(extracted_edge.fact) + for edge in related_edges: + if ( + edge.source_node_uuid == extracted_edge.source_node_uuid + and edge.target_node_uuid == extracted_edge.target_node_uuid + and _normalize_string_exact(edge.fact) == normalized_fact + ): + resolved = edge + if episode is not None and episode.uuid not in resolved.episodes: + resolved.episodes.append(episode.uuid) + return resolved, [], [] + start = time() # Prepare context for LLM diff --git a/tests/utils/maintenance/test_edge_operations.py b/tests/utils/maintenance/test_edge_operations.py index cdb1de9f..3d5e0433 100644 --- a/tests/utils/maintenance/test_edge_operations.py +++ b/tests/utils/maintenance/test_edge_operations.py @@ -5,6 +5,7 @@ import pytest from graphiti_core.edges import EntityEdge from graphiti_core.nodes import EpisodicNode +from graphiti_core.utils.maintenance.edge_operations import resolve_extracted_edge @pytest.fixture @@ -92,3 +93,52 @@ def mock_previous_episodes(): # Run the tests if __name__ == '__main__': pytest.main([__file__]) + + +@pytest.mark.asyncio +async def test_resolve_extracted_edge_exact_fact_short_circuit( + mock_llm_client, + mock_existing_edges, + mock_current_episode, +): + extracted = EntityEdge( + source_node_uuid='source_uuid', + target_node_uuid='target_uuid', + name='test_edge', + group_id='group_1', + fact='Related fact', + episodes=['episode_1'], + created_at=datetime.now(timezone.utc), + valid_at=None, + invalid_at=None, + ) + + related_edges = [ + EntityEdge( + source_node_uuid='source_uuid', + target_node_uuid='target_uuid', + name='related_edge', + group_id='group_1', + fact=' related FACT ', + episodes=['episode_2'], + created_at=datetime.now(timezone.utc) - timedelta(days=1), + valid_at=None, + invalid_at=None, + ) + ] + + resolved_edge, duplicate_edges, invalidated = await resolve_extracted_edge( + mock_llm_client, + extracted, + related_edges, + mock_existing_edges, + mock_current_episode, + edge_types=None, + ensure_ascii=True, + ) + + assert resolved_edge is related_edges[0] + assert resolved_edge.episodes.count(mock_current_episode.uuid) == 1 + assert duplicate_edges == [] + assert invalidated == [] + mock_llm_client.generate_response.assert_not_called() diff --git a/tests/utils/maintenance/test_node_operations.py b/tests/utils/maintenance/test_node_operations.py index 1be44cf9..a7250559 100644 --- a/tests/utils/maintenance/test_node_operations.py +++ b/tests/utils/maintenance/test_node_operations.py @@ -18,8 +18,8 @@ from graphiti_core.utils.maintenance.dedup_helpers import ( _lsh_bands, _minhash_signature, _name_entropy, - _normalize_name_exact, _normalize_name_for_fuzzy, + _normalize_string_exact, _resolve_with_similarity, _shingles, ) @@ -205,7 +205,7 @@ def test_build_candidate_indexes_populates_structures(): def test_normalize_helpers(): - assert _normalize_name_exact(' Alice Smith ') == 'alice smith' + assert _normalize_string_exact(' Alice Smith ') == 'alice smith' assert _normalize_name_for_fuzzy('Alice-Smith!') == 'alice smith'