implement exact fact matching
This commit is contained in:
parent
7cd9798977
commit
ced05d089c
4 changed files with 71 additions and 7 deletions
|
|
@ -36,7 +36,7 @@ _MINHASH_PERMUTATIONS = 32
|
||||||
_MINHASH_BAND_SIZE = 4
|
_MINHASH_BAND_SIZE = 4
|
||||||
|
|
||||||
|
|
||||||
def _normalize_name_exact(name: str) -> str:
|
def _normalize_string_exact(name: str) -> str:
|
||||||
"""Lowercase text and collapse whitespace so equal names map to the same key."""
|
"""Lowercase text and collapse whitespace so equivalent strings map to the same key."""
|
||||||
normalized = re.sub(r'[\s]+', ' ', name.lower())
|
normalized = re.sub(r'[\s]+', ' ', name.lower())
|
||||||
return normalized.strip()
|
return normalized.strip()
|
||||||
|
|
@ -44,7 +44,7 @@ def _normalize_name_exact(name: str) -> str:
|
||||||
|
|
||||||
def _normalize_name_for_fuzzy(name: str) -> str:
|
def _normalize_name_for_fuzzy(name: str) -> str:
|
||||||
"""Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
|
"""Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
|
||||||
normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_name_exact(name))
|
normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
|
||||||
normalized = normalized.strip()
|
normalized = normalized.strip()
|
||||||
return re.sub(r'[\s]+', ' ', normalized)
|
return re.sub(r'[\s]+', ' ', normalized)
|
||||||
|
|
||||||
|
|
@ -174,7 +174,7 @@ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidate
|
||||||
lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
|
lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
|
||||||
|
|
||||||
for candidate in existing_nodes:
|
for candidate in existing_nodes:
|
||||||
normalized = _normalize_name_exact(candidate.name)
|
normalized = _normalize_string_exact(candidate.name)
|
||||||
normalized_existing[normalized].append(candidate)
|
normalized_existing[normalized].append(candidate)
|
||||||
nodes_by_uuid[candidate.uuid] = candidate
|
nodes_by_uuid[candidate.uuid] = candidate
|
||||||
|
|
||||||
|
|
@ -201,7 +201,7 @@ def _resolve_with_similarity(
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
|
"""Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
|
||||||
for idx, node in enumerate(extracted_nodes):
|
for idx, node in enumerate(extracted_nodes):
|
||||||
normalized_exact = _normalize_name_exact(node.name)
|
normalized_exact = _normalize_string_exact(node.name)
|
||||||
normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
|
normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
|
||||||
|
|
||||||
if not _has_high_entropy(normalized_fuzzy):
|
if not _has_high_entropy(normalized_fuzzy):
|
||||||
|
|
@ -244,7 +244,7 @@ def _resolve_with_similarity(
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'DedupCandidateIndexes',
|
'DedupCandidateIndexes',
|
||||||
'DedupResolutionState',
|
'DedupResolutionState',
|
||||||
'_normalize_name_exact',
|
'_normalize_string_exact',
|
||||||
'_normalize_name_for_fuzzy',
|
'_normalize_name_for_fuzzy',
|
||||||
'_has_high_entropy',
|
'_has_high_entropy',
|
||||||
'_minhash_signature',
|
'_minhash_signature',
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,7 @@ from graphiti_core.search.search_config import SearchResults
|
||||||
from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
|
from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
|
||||||
from graphiti_core.search.search_filters import SearchFilters
|
from graphiti_core.search.search_filters import SearchFilters
|
||||||
from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
|
from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
|
||||||
|
from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -397,6 +398,19 @@ async def resolve_extracted_edge(
|
||||||
if len(related_edges) == 0 and len(existing_edges) == 0:
|
if len(related_edges) == 0 and len(existing_edges) == 0:
|
||||||
return extracted_edge, [], []
|
return extracted_edge, [], []
|
||||||
|
|
||||||
|
# Fast path: if a related edge has the same endpoints and the same fact text (ignoring case and extra whitespace), reuse it.
|
||||||
|
normalized_fact = _normalize_string_exact(extracted_edge.fact)
|
||||||
|
for edge in related_edges:
|
||||||
|
if (
|
||||||
|
edge.source_node_uuid == extracted_edge.source_node_uuid
|
||||||
|
and edge.target_node_uuid == extracted_edge.target_node_uuid
|
||||||
|
and _normalize_string_exact(edge.fact) == normalized_fact
|
||||||
|
):
|
||||||
|
resolved = edge
|
||||||
|
if episode is not None and episode.uuid not in resolved.episodes:
|
||||||
|
resolved.episodes.append(episode.uuid)
|
||||||
|
return resolved, [], []
|
||||||
|
|
||||||
start = time()
|
start = time()
|
||||||
|
|
||||||
# Prepare context for LLM
|
# Prepare context for LLM
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ import pytest
|
||||||
|
|
||||||
from graphiti_core.edges import EntityEdge
|
from graphiti_core.edges import EntityEdge
|
||||||
from graphiti_core.nodes import EpisodicNode
|
from graphiti_core.nodes import EpisodicNode
|
||||||
|
from graphiti_core.utils.maintenance.edge_operations import resolve_extracted_edge
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
@ -92,3 +93,52 @@ def mock_previous_episodes():
|
||||||
# Run the tests
|
# Run the tests
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pytest.main([__file__])
|
pytest.main([__file__])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_resolve_extracted_edge_exact_fact_short_circuit(
|
||||||
|
mock_llm_client,
|
||||||
|
mock_existing_edges,
|
||||||
|
mock_current_episode,
|
||||||
|
):
|
||||||
|
extracted = EntityEdge(
|
||||||
|
source_node_uuid='source_uuid',
|
||||||
|
target_node_uuid='target_uuid',
|
||||||
|
name='test_edge',
|
||||||
|
group_id='group_1',
|
||||||
|
fact='Related fact',
|
||||||
|
episodes=['episode_1'],
|
||||||
|
created_at=datetime.now(timezone.utc),
|
||||||
|
valid_at=None,
|
||||||
|
invalid_at=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
related_edges = [
|
||||||
|
EntityEdge(
|
||||||
|
source_node_uuid='source_uuid',
|
||||||
|
target_node_uuid='target_uuid',
|
||||||
|
name='related_edge',
|
||||||
|
group_id='group_1',
|
||||||
|
fact=' related FACT ',
|
||||||
|
episodes=['episode_2'],
|
||||||
|
created_at=datetime.now(timezone.utc) - timedelta(days=1),
|
||||||
|
valid_at=None,
|
||||||
|
invalid_at=None,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
resolved_edge, duplicate_edges, invalidated = await resolve_extracted_edge(
|
||||||
|
mock_llm_client,
|
||||||
|
extracted,
|
||||||
|
related_edges,
|
||||||
|
mock_existing_edges,
|
||||||
|
mock_current_episode,
|
||||||
|
edge_types=None,
|
||||||
|
ensure_ascii=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert resolved_edge is related_edges[0]
|
||||||
|
assert resolved_edge.episodes.count(mock_current_episode.uuid) == 1
|
||||||
|
assert duplicate_edges == []
|
||||||
|
assert invalidated == []
|
||||||
|
mock_llm_client.generate_response.assert_not_called()
|
||||||
|
|
|
||||||
|
|
@ -18,8 +18,8 @@ from graphiti_core.utils.maintenance.dedup_helpers import (
|
||||||
_lsh_bands,
|
_lsh_bands,
|
||||||
_minhash_signature,
|
_minhash_signature,
|
||||||
_name_entropy,
|
_name_entropy,
|
||||||
_normalize_name_exact,
|
|
||||||
_normalize_name_for_fuzzy,
|
_normalize_name_for_fuzzy,
|
||||||
|
_normalize_string_exact,
|
||||||
_resolve_with_similarity,
|
_resolve_with_similarity,
|
||||||
_shingles,
|
_shingles,
|
||||||
)
|
)
|
||||||
|
|
@ -205,7 +205,7 @@ def test_build_candidate_indexes_populates_structures():
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_helpers():
|
def test_normalize_helpers():
|
||||||
assert _normalize_name_exact(' Alice Smith ') == 'alice smith'
|
assert _normalize_string_exact(' Alice Smith ') == 'alice smith'
|
||||||
assert _normalize_name_for_fuzzy('Alice-Smith!') == 'alice smith'
|
assert _normalize_name_for_fuzzy('Alice-Smith!') == 'alice smith'
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue