diff --git a/graphiti_core/prompts/dedupe_edges.py b/graphiti_core/prompts/dedupe_edges.py index 84d25f4e..b60dd1b9 100644 --- a/graphiti_core/prompts/dedupe_edges.py +++ b/graphiti_core/prompts/dedupe_edges.py @@ -124,37 +124,48 @@ def resolve_edge(context: dict[str, Any]) -> list[Message]: Message( role='user', content=f""" - - {context['new_edge']} - - - - {context['existing_edges']} - - - {context['edge_invalidation_candidates']} - - - - {context['edge_types']} - - - Task: - If the NEW FACT represents identical factual information of one or more in EXISTING FACTS, return the idx of the duplicate facts. - Facts with similar information that contain key differences should not be marked as duplicates. - If the NEW FACT is not a duplicate of any of the EXISTING FACTS, return an empty list. - - Given the predefined FACT TYPES, determine if the NEW FACT should be classified as one of these types. - Return the fact type as fact_type or DEFAULT if NEW FACT is not one of the FACT TYPES. - - Based on the provided FACT INVALIDATION CANDIDATES and NEW FACT, determine which existing facts the new fact contradicts. - Return a list containing all idx's of the facts that are contradicted by the NEW FACT. - If there are no contradicted facts, return an empty list. + You will receive TWO separate lists of facts. Each list uses 'idx' as its index field, starting from 0. + + 1. DUPLICATE DETECTION: + - If the NEW FACT represents identical factual information as any fact in EXISTING FACTS, return those idx values in duplicate_facts. + - Facts with similar information that contain key differences should NOT be marked as duplicates. + - Return idx values from EXISTING FACTS. + - If no duplicates, return an empty list for duplicate_facts. + + 2. FACT TYPE CLASSIFICATION: + - Given the predefined FACT TYPES, determine if the NEW FACT should be classified as one of these types. + - Return the fact type as fact_type or DEFAULT if NEW FACT is not one of the FACT TYPES. + + 3. CONTRADICTION DETECTION: + - Based on FACT INVALIDATION CANDIDATES and NEW FACT, determine which facts the new fact contradicts. + - Return idx values from FACT INVALIDATION CANDIDATES. + - If no contradictions, return an empty list for contradicted_facts. + + IMPORTANT: + - duplicate_facts: Use ONLY 'idx' values from EXISTING FACTS + - contradicted_facts: Use ONLY 'idx' values from FACT INVALIDATION CANDIDATES + - These are two separate lists with independent idx ranges starting from 0 Guidelines: 1. Some facts may be very similar but will have key differences, particularly around numeric values in the facts. Do not mark these facts as duplicates. + + + {context['edge_types']} + + + + {context['existing_edges']} + + + + {context['edge_invalidation_candidates']} + + + + {context['new_edge']} + """, ), ] diff --git a/graphiti_core/utils/maintenance/edge_operations.py b/graphiti_core/utils/maintenance/edge_operations.py index 60a35357..5637e1c3 100644 --- a/graphiti_core/utils/maintenance/edge_operations.py +++ b/graphiti_core/utils/maintenance/edge_operations.py @@ -480,20 +480,19 @@ async def resolve_extracted_edge( start = time() # Prepare context for LLM - related_edges_context = [{'id': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)] + related_edges_context = [{'idx': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)] invalidation_edge_candidates_context = [ - {'id': i, 'fact': existing_edge.fact} for i, existing_edge in enumerate(existing_edges) + {'idx': i, 'fact': existing_edge.fact} for i, existing_edge in enumerate(existing_edges) ] edge_types_context = ( [ { - 'fact_type_id': i, 'fact_type_name': type_name, 'fact_type_description': type_model.__doc__, } - for i, (type_name, type_model) in enumerate(edge_type_candidates.items()) + for type_name, type_model in edge_type_candidates.items() ] if edge_type_candidates is not None else [] @@ -507,6 +506,14 @@ async def resolve_extracted_edge( 'ensure_ascii': ensure_ascii, } + logger.debug( + 'Resolving edge: sent %d EXISTING FACTS (idx 0-%d) and %d INVALIDATION CANDIDATES (idx 0-%d)', + len(related_edges), + len(related_edges) - 1, + len(existing_edges), + len(existing_edges) - 1, + ) + llm_response = await llm_client.generate_response( prompt_library.dedupe_edges.resolve_edge(context), response_model=EdgeDuplicate, @@ -515,6 +522,15 @@ async def resolve_extracted_edge( response_object = EdgeDuplicate(**llm_response) duplicate_facts = response_object.duplicate_facts + # Validate duplicate_facts are in valid range for EXISTING FACTS + invalid_duplicates = [i for i in duplicate_facts if i < 0 or i >= len(related_edges)] + if invalid_duplicates: + logger.warning( + 'LLM returned invalid duplicate_facts idx values %s (valid range: 0-%d for EXISTING FACTS)', + invalid_duplicates, + len(related_edges) - 1, + ) + duplicate_fact_ids: list[int] = [i for i in duplicate_facts if 0 <= i < len(related_edges)] resolved_edge = extracted_edge @@ -527,6 +543,15 @@ async def resolve_extracted_edge( contradicted_facts: list[int] = response_object.contradicted_facts + # Validate contradicted_facts are in valid range for INVALIDATION CANDIDATES + invalid_contradictions = [i for i in contradicted_facts if i < 0 or i >= len(existing_edges)] + if invalid_contradictions: + logger.warning( + 'LLM returned invalid contradicted_facts idx values %s (valid range: 0-%d for INVALIDATION CANDIDATES)', + invalid_contradictions, + len(existing_edges) - 1, + ) + invalidation_candidates: list[EntityEdge] = [ existing_edges[i] for i in contradicted_facts if 0 <= i < len(existing_edges) ]