implement exact fact matching
This commit is contained in:
parent 7cd9798977
commit ced05d089c

4 changed files with 71 additions and 7 deletions
graphiti_core/utils/maintenance/dedup_helpers.py

@@ -36,7 +36,7 @@ _MINHASH_PERMUTATIONS = 32
 _MINHASH_BAND_SIZE = 4
 
 
-def _normalize_name_exact(name: str) -> str:
+def _normalize_string_exact(name: str) -> str:
     """Lowercase text and collapse whitespace so equal names map to the same key."""
     normalized = re.sub(r'[\s]+', ' ', name.lower())
     return normalized.strip()
@@ -44,7 +44,7 @@ def _normalize_name_exact(name: str) -> str:
 
 def _normalize_name_for_fuzzy(name: str) -> str:
     """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
-    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_name_exact(name))
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
     normalized = normalized.strip()
     return re.sub(r'[\s]+', ' ', normalized)
 
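For context (not part of the diff): the rename is mechanical, so both helpers still normalize the same way. A minimal doctest-style sketch of the expected behavior, assuming the two functions above and grounded in the test assertions further down:

    >>> _normalize_string_exact('  Alice   Smith ')
    'alice smith'
    >>> _normalize_name_for_fuzzy('Alice-Smith!')
    'alice smith'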
@@ -174,7 +174,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
 
     for candidate in existing_nodes:
-        normalized = _normalize_name_exact(candidate.name)
+        normalized = _normalize_string_exact(candidate.name)
         normalized_existing[normalized].append(candidate)
         nodes_by_uuid[candidate.uuid] = candidate
 
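The exact-match index this loop builds keys candidates by their normalized name, so case and spacing variants of one name share a bucket. A minimal sketch of that step in isolation (illustrative; `existing_nodes` stands in for any list of named nodes):

    from collections import defaultdict

    normalized_existing = defaultdict(list)
    for candidate in existing_nodes:
        # ' Alice  Smith ' and 'alice smith' both land under the key 'alice smith'
        normalized_existing[_normalize_string_exact(candidate.name)].append(candidate)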
@@ -201,7 +201,7 @@ def _resolve_with_similarity(
 ) -> None:
     """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
     for idx, node in enumerate(extracted_nodes):
-        normalized_exact = _normalize_name_exact(node.name)
+        normalized_exact = _normalize_string_exact(node.name)
         normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
 
         if not _has_high_entropy(normalized_fuzzy):
@@ -244,7 +244,7 @@ def _resolve_with_similarity(
 __all__ = [
     'DedupCandidateIndexes',
     'DedupResolutionState',
-    '_normalize_name_exact',
+    '_normalize_string_exact',
     '_normalize_name_for_fuzzy',
     '_has_high_entropy',
     '_minhash_signature',
graphiti_core/utils/maintenance/edge_operations.py

@@ -41,6 +41,7 @@ from graphiti_core.search.search_config import SearchResults
 from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
 from graphiti_core.search.search_filters import SearchFilters
 from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
+from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
 
 logger = logging.getLogger(__name__)
 
@@ -397,6 +398,19 @@ async def resolve_extracted_edge(
     if len(related_edges) == 0 and len(existing_edges) == 0:
         return extracted_edge, [], []
 
+    # Fast path: if the fact text and endpoints already exist verbatim, reuse the matching edge.
+    normalized_fact = _normalize_string_exact(extracted_edge.fact)
+    for edge in related_edges:
+        if (
+            edge.source_node_uuid == extracted_edge.source_node_uuid
+            and edge.target_node_uuid == extracted_edge.target_node_uuid
+            and _normalize_string_exact(edge.fact) == normalized_fact
+        ):
+            resolved = edge
+            if episode is not None and episode.uuid not in resolved.episodes:
+                resolved.episodes.append(episode.uuid)
+            return resolved, [], []
+
     start = time()
 
     # Prepare context for LLM
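The block added above is the whole feature: before any LLM call, an extracted edge is resolved to an existing one when the endpoints match and the fact text is identical after normalization. A standalone sketch of that predicate (`facts_match` is a hypothetical helper for illustration, not part of the commit):

    def facts_match(extracted, candidate) -> bool:
        """True when two edges assert the same fact between the same endpoints."""
        return (
            candidate.source_node_uuid == extracted.source_node_uuid
            and candidate.target_node_uuid == extracted.target_node_uuid
            and _normalize_string_exact(candidate.fact)
            == _normalize_string_exact(extracted.fact)
        )

    # e.g. 'Related fact' and ' related FACT ' both normalize to 'related fact'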
tests/utils/maintenance/test_edge_operations.py

@@ -5,6 +5,7 @@ import pytest
 
 from graphiti_core.edges import EntityEdge
 from graphiti_core.nodes import EpisodicNode
+from graphiti_core.utils.maintenance.edge_operations import resolve_extracted_edge
 
 
 @pytest.fixture
@@ -92,3 +93,52 @@ def mock_previous_episodes():
 # Run the tests
 if __name__ == '__main__':
     pytest.main([__file__])
+
+
+@pytest.mark.asyncio
+async def test_resolve_extracted_edge_exact_fact_short_circuit(
+    mock_llm_client,
+    mock_existing_edges,
+    mock_current_episode,
+):
+    extracted = EntityEdge(
+        source_node_uuid='source_uuid',
+        target_node_uuid='target_uuid',
+        name='test_edge',
+        group_id='group_1',
+        fact='Related fact',
+        episodes=['episode_1'],
+        created_at=datetime.now(timezone.utc),
+        valid_at=None,
+        invalid_at=None,
+    )
+
+    related_edges = [
+        EntityEdge(
+            source_node_uuid='source_uuid',
+            target_node_uuid='target_uuid',
+            name='related_edge',
+            group_id='group_1',
+            fact=' related FACT ',
+            episodes=['episode_2'],
+            created_at=datetime.now(timezone.utc) - timedelta(days=1),
+            valid_at=None,
+            invalid_at=None,
+        )
+    ]
+
+    resolved_edge, duplicate_edges, invalidated = await resolve_extracted_edge(
+        mock_llm_client,
+        extracted,
+        related_edges,
+        mock_existing_edges,
+        mock_current_episode,
+        edge_types=None,
+        ensure_ascii=True,
+    )
+
+    assert resolved_edge is related_edges[0]
+    assert resolved_edge.episodes.count(mock_current_episode.uuid) == 1
+    assert duplicate_edges == []
+    assert invalidated == []
+    mock_llm_client.generate_response.assert_not_called()
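The test pins down the fast path's contract: the resolved edge is the existing object itself (`is`, not equality), the current episode's uuid is appended exactly once even though the fact differs only in case and whitespace, and `assert_not_called` proves the LLM was never consulted. Assuming the test file path shown above, just this test can be run with pytest's `-k` filter:

    pytest tests/utils/maintenance/test_edge_operations.py -k exact_fact_short_circuit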
tests/utils/maintenance/test_dedup_helpers.py

@@ -18,8 +18,8 @@ from graphiti_core.utils.maintenance.dedup_helpers import (
     _lsh_bands,
     _minhash_signature,
     _name_entropy,
-    _normalize_name_exact,
     _normalize_name_for_fuzzy,
+    _normalize_string_exact,
     _resolve_with_similarity,
     _shingles,
 )
@@ -205,7 +205,7 @@ def test_build_candidate_indexes_populates_structures():
 
 
 def test_normalize_helpers():
-    assert _normalize_name_exact(' Alice Smith ') == 'alice smith'
+    assert _normalize_string_exact(' Alice Smith ') == 'alice smith'
     assert _normalize_name_for_fuzzy('Alice-Smith!') == 'alice smith'
 
 