From 22bfdc532ace53e9039359fb818b9dc1390254e3 Mon Sep 17 00:00:00 2001
From: Daniel Chalef <131175+danielchalef@users.noreply.github.com>
Date: Thu, 2 Oct 2025 11:36:45 -0700
Subject: [PATCH] fix: Improve deduplication ID validation and logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add comprehensive logging to verify IDs sent to LLM (sent vs received)
- Enhance prompt with explicit ID bounds (0 through N-1)
- Add validation warnings for missing and extra IDs from LLM responses
- Improve error message clarity for invalid dedupe IDs
- Log actual IDs sent to LLM to confirm no index leakage

This helps diagnose cases where the LLM returns IDs outside the valid
range (e.g., ID 19 when only 0-18 were sent).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 graphiti_core/prompts/dedupe_nodes.py         |  4 +-
 .../utils/maintenance/node_operations.py      | 49 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/graphiti_core/prompts/dedupe_nodes.py b/graphiti_core/prompts/dedupe_nodes.py
index 31611473..08e44fee 100644
--- a/graphiti_core/prompts/dedupe_nodes.py
+++ b/graphiti_core/prompts/dedupe_nodes.py
@@ -166,7 +166,9 @@ def nodes(context: dict[str, Any]) -> list[Message]:
         - They have similar names or purposes but refer to separate instances or concepts.
 
         Task:
-        Respond with a JSON object that contains an "entity_resolutions" array with one entry for each entity in ENTITIES, ordered by the entity id.
+        ENTITIES contains {len(context['extracted_nodes'])} entities with IDs 0 through {len(context['extracted_nodes']) - 1}.
+        Respond with a JSON object that contains an "entity_resolutions" array with EXACTLY {len(context['extracted_nodes'])} entries - one for each entity in ENTITIES.
+        Your response MUST use only the IDs 0 through {len(context['extracted_nodes']) - 1}. Do not skip any IDs or use IDs outside this range.
 
         For every entity, return an object with the following keys:
         {{
diff --git a/graphiti_core/utils/maintenance/node_operations.py b/graphiti_core/utils/maintenance/node_operations.py
index 24cdc583..9a7700ee 100644
--- a/graphiti_core/utils/maintenance/node_operations.py
+++ b/graphiti_core/utils/maintenance/node_operations.py
@@ -269,6 +269,27 @@ async def _resolve_with_llm(
         for i, node in enumerate(llm_extracted_nodes)
     ]
 
+    sent_ids = [ctx['id'] for ctx in extracted_nodes_context]
+    logger.debug(
+        'Sending %d entities to LLM for deduplication with IDs 0-%d (actual IDs sent: %s)',
+        len(llm_extracted_nodes),
+        len(llm_extracted_nodes) - 1,
+        sent_ids if len(sent_ids) <= 20 else f'{sent_ids[:10]}...{sent_ids[-10:]}',
+    )
+    if llm_extracted_nodes:
+        sample_size = min(3, len(extracted_nodes_context))
+        logger.debug(
+            'First %d entities: %s',
+            sample_size,
+            [(ctx['id'], ctx['name']) for ctx in extracted_nodes_context[:sample_size]],
+        )
+        if len(extracted_nodes_context) > 3:
+            logger.debug(
+                'Last %d entities: %s',
+                sample_size,
+                [(ctx['id'], ctx['name']) for ctx in extracted_nodes_context[-sample_size:]],
+            )
+
     existing_nodes_context = [
         {
             **{
@@ -301,15 +322,39 @@ async def _resolve_with_llm(
     valid_relative_range = range(len(state.unresolved_indices))
     processed_relative_ids: set[int] = set()
 
+    received_ids = {r.id for r in node_resolutions}
+    expected_ids = set(valid_relative_range)
+    missing_ids = expected_ids - received_ids
+    extra_ids = received_ids - expected_ids
+
+    logger.debug(
+        'Received %d resolutions for %d entities (expected %d)',
+        len(node_resolutions),
+        len(state.unresolved_indices),
+        len(state.unresolved_indices),
+    )
+
+    if missing_ids:
+        logger.warning('LLM did not return resolutions for IDs: %s', sorted(missing_ids))
+
+    if extra_ids:
+        logger.warning(
+            'LLM returned invalid IDs outside valid range 0-%d: %s (all returned IDs: %s)',
+            len(state.unresolved_indices) - 1,
+            sorted(extra_ids),
+            sorted(received_ids),
+        )
+
     for resolution in node_resolutions:
         relative_id: int = resolution.id
         duplicate_idx: int = resolution.duplicate_idx
 
         if relative_id not in valid_relative_range:
             logger.warning(
-                'Skipping invalid LLM dedupe id %s (unresolved indices: %s)',
+                'Skipping invalid LLM dedupe id %d (valid range: 0-%d, received %d resolutions)',
                 relative_id,
-                state.unresolved_indices,
+                len(state.unresolved_indices) - 1,
+                len(node_resolutions),
             )
             continue