Optimize edge deduplication prompt for caching and clarity (#970)

* Optimize edge deduplication prompt for caching and clarity

  - Restructure prompt to place invariant instructions at top and dynamic context at bottom for better LLM caching
  - Change 'id' to 'idx' in edge context lists to avoid confusion with other identifiers
  - Remove 'fact_type_id' from edge types context as LLM only needs fact_type_name
  - Remove dynamic range values from prompt instructions (e.g., "range 0-N")
  - Add debug logging before LLM call to track input sizes
  - Add validation logging after LLM response to catch invalid idx values
  - Clarify that duplicate_facts uses EXISTING FACTS idx and contradicted_facts uses INVALIDATION CANDIDATES idx

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

  Co-Authored-By: Claude <noreply@anthropic.com>

* Address terminology consistency and edge case logging

  - Update Pydantic field descriptions to use 'idx' instead of 'ids' for consistency
  - Fix debug logging to handle empty list edge case (avoid 'idx 0--1' display)

  Note on review feedback:
  - Validation is intentionally non-redundant: warnings provide visibility, list comprehensions ensure robustness
  - WARNING level is appropriate for LLM output issues (not system errors)
  - Existing test coverage is sufficient for this defensive logging addition

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

  Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-10-02 17:07:43 -07:00 · 2025-10-02 17:07:43 -07:00 · 4a307dbf10
commit 4a307dbf10
parent b28bd92c16
2 changed files with 69 additions and 32 deletions
--- a/graphiti_core/prompts/dedupe_edges.py
+++ b/graphiti_core/prompts/dedupe_edges.py
@@ -25,11 +25,11 @@ from .prompt_helpers import to_prompt_json
 class EdgeDuplicate(BaseModel):
    duplicate_facts: list[int] = Field(
        ...,
-        description='List of ids of any duplicate facts. If no duplicate facts are found, default to empty list.',
+        description='List of idx values of any duplicate facts. If no duplicate facts are found, default to empty list.',
    )
    contradicted_facts: list[int] = Field(
        ...,
-        description='List of ids of facts that should be invalidated. If no facts should be invalidated, the list should be empty.',
+        description='List of idx values of facts that should be invalidated. If no facts should be invalidated, the list should be empty.',
    )
    fact_type: str = Field(..., description='One of the provided fact types or DEFAULT')

@@ -124,37 +124,48 @@ def resolve_edge(context: dict[str, Any]) -> list[Message]:
        Message(
            role='user',
            content=f"""
-        <NEW FACT>
-        {context['new_edge']}
-        </NEW FACT>
-        
-        <EXISTING FACTS>
-        {context['existing_edges']}
-        </EXISTING FACTS>
-        <FACT INVALIDATION CANDIDATES>
-        {context['edge_invalidation_candidates']}
-        </FACT INVALIDATION CANDIDATES>
-        
-        <FACT TYPES>
-        {context['edge_types']}
-        </FACT TYPES>
-        
-
        Task:
-        If the NEW FACT represents identical factual information of one or more in EXISTING FACTS, return the idx of the duplicate facts.
-        Facts with similar information that contain key differences should not be marked as duplicates.
-        If the NEW FACT is not a duplicate of any of the EXISTING FACTS, return an empty list.
-        
-        Given the predefined FACT TYPES, determine if the NEW FACT should be classified as one of these types.
-        Return the fact type as fact_type or DEFAULT if NEW FACT is not one of the FACT TYPES.
-        
-        Based on the provided FACT INVALIDATION CANDIDATES and NEW FACT, determine which existing facts the new fact contradicts.
-        Return a list containing all idx's of the facts that are contradicted by the NEW FACT.
-        If there are no contradicted facts, return an empty list.
+        You will receive TWO separate lists of facts. Each list uses 'idx' as its index field, starting from 0.
+
+        1. DUPLICATE DETECTION:
+           - If the NEW FACT represents identical factual information as any fact in EXISTING FACTS, return those idx values in duplicate_facts.
+           - Facts with similar information that contain key differences should NOT be marked as duplicates.
+           - Return idx values from EXISTING FACTS.
+           - If no duplicates, return an empty list for duplicate_facts.
+
+        2. FACT TYPE CLASSIFICATION:
+           - Given the predefined FACT TYPES, determine if the NEW FACT should be classified as one of these types.
+           - Return the fact type as fact_type or DEFAULT if NEW FACT is not one of the FACT TYPES.
+
+        3. CONTRADICTION DETECTION:
+           - Based on FACT INVALIDATION CANDIDATES and NEW FACT, determine which facts the new fact contradicts.
+           - Return idx values from FACT INVALIDATION CANDIDATES.
+           - If no contradictions, return an empty list for contradicted_facts.
+
+        IMPORTANT:
+        - duplicate_facts: Use ONLY 'idx' values from EXISTING FACTS
+        - contradicted_facts: Use ONLY 'idx' values from FACT INVALIDATION CANDIDATES
+        - These are two separate lists with independent idx ranges starting from 0

        Guidelines:
        1. Some facts may be very similar but will have key differences, particularly around numeric values in the facts.
            Do not mark these facts as duplicates.
+
+        <FACT TYPES>
+        {context['edge_types']}
+        </FACT TYPES>
+
+        <EXISTING FACTS>
+        {context['existing_edges']}
+        </EXISTING FACTS>
+
+        <FACT INVALIDATION CANDIDATES>
+        {context['edge_invalidation_candidates']}
+        </FACT INVALIDATION CANDIDATES>
+
+        <NEW FACT>
+        {context['new_edge']}
+        </NEW FACT>
        """,
        ),
    ]
--- a/graphiti_core/utils/maintenance/edge_operations.py
+++ b/graphiti_core/utils/maintenance/edge_operations.py
@@ -475,20 +475,19 @@ async def resolve_extracted_edge(
    start = time()

    # Prepare context for LLM
-    related_edges_context = [{'id': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)]
+    related_edges_context = [{'idx': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)]

    invalidation_edge_candidates_context = [
-        {'id': i, 'fact': existing_edge.fact} for i, existing_edge in enumerate(existing_edges)
+        {'idx': i, 'fact': existing_edge.fact} for i, existing_edge in enumerate(existing_edges)
    ]

    edge_types_context = (
        [
            {
-                'fact_type_id': i,
                'fact_type_name': type_name,
                'fact_type_description': type_model.__doc__,
            }
-            for i, (type_name, type_model) in enumerate(edge_type_candidates.items())
+            for type_name, type_model in edge_type_candidates.items()
        ]
        if edge_type_candidates is not None
        else []
@@ -501,6 +500,15 @@ async def resolve_extracted_edge(
        'edge_types': edge_types_context,
    }

+    if related_edges or existing_edges:
+        logger.debug(
+            'Resolving edge: sent %d EXISTING FACTS%s and %d INVALIDATION CANDIDATES%s',
+            len(related_edges),
+            f' (idx 0-{len(related_edges) - 1})' if related_edges else '',
+            len(existing_edges),
+            f' (idx 0-{len(existing_edges) - 1})' if existing_edges else '',
+        )
+
    llm_response = await llm_client.generate_response(
        prompt_library.dedupe_edges.resolve_edge(context),
        response_model=EdgeDuplicate,
@@ -509,6 +517,15 @@ async def resolve_extracted_edge(
    response_object = EdgeDuplicate(**llm_response)
    duplicate_facts = response_object.duplicate_facts

+    # Validate duplicate_facts are in valid range for EXISTING FACTS
+    invalid_duplicates = [i for i in duplicate_facts if i < 0 or i >= len(related_edges)]
+    if invalid_duplicates:
+        logger.warning(
+            'LLM returned invalid duplicate_facts idx values %s (valid range: 0-%d for EXISTING FACTS)',
+            invalid_duplicates,
+            len(related_edges) - 1,
+        )
+
    duplicate_fact_ids: list[int] = [i for i in duplicate_facts if 0 <= i < len(related_edges)]

    resolved_edge = extracted_edge
@@ -521,6 +538,15 @@ async def resolve_extracted_edge(

    contradicted_facts: list[int] = response_object.contradicted_facts

+    # Validate contradicted_facts are in valid range for INVALIDATION CANDIDATES
+    invalid_contradictions = [i for i in contradicted_facts if i < 0 or i >= len(existing_edges)]
+    if invalid_contradictions:
+        logger.warning(
+            'LLM returned invalid contradicted_facts idx values %s (valid range: 0-%d for INVALIDATION CANDIDATES)',
+            invalid_contradictions,
+            len(existing_edges) - 1,
+        )
+
    invalidation_candidates: list[EntityEdge] = [
        existing_edges[i] for i in contradicted_facts if 0 <= i < len(existing_edges)
    ]