From 22bfdc532ace53e9039359fb818b9dc1390254e3 Mon Sep 17 00:00:00 2001 From: Daniel Chalef <131175+danielchalef@users.noreply.github.com> Date: Thu, 2 Oct 2025 11:36:45 -0700 Subject: [PATCH] fix: Improve deduplication ID validation and logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add comprehensive logging to verify IDs sent to LLM (sent vs received) - Enhance prompt with explicit ID bounds (0 through N-1) - Add validation warnings for missing and extra IDs from LLM responses - Improve error message clarity for invalid dedupe IDs - Log actual IDs sent to LLM to confirm no index leakage This helps diagnose cases where the LLM returns IDs outside the valid range (e.g., ID 19 when only 0-18 were sent). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- graphiti_core/prompts/dedupe_nodes.py | 4 +- .../utils/maintenance/node_operations.py | 49 ++++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/graphiti_core/prompts/dedupe_nodes.py b/graphiti_core/prompts/dedupe_nodes.py index 31611473..08e44fee 100644 --- a/graphiti_core/prompts/dedupe_nodes.py +++ b/graphiti_core/prompts/dedupe_nodes.py @@ -166,7 +166,9 @@ def nodes(context: dict[str, Any]) -> list[Message]: - They have similar names or purposes but refer to separate instances or concepts. Task: - Respond with a JSON object that contains an "entity_resolutions" array with one entry for each entity in ENTITIES, ordered by the entity id. + ENTITIES contains {len(context['extracted_nodes'])} entities with IDs 0 through {len(context['extracted_nodes']) - 1}. + Respond with a JSON object that contains an "entity_resolutions" array with EXACTLY {len(context['extracted_nodes'])} entries - one for each entity in ENTITIES. + Your response MUST use only the IDs 0 through {len(context['extracted_nodes']) - 1}. Do not skip any IDs or use IDs outside this range. For every entity, return an object with the following keys: {{ diff --git a/graphiti_core/utils/maintenance/node_operations.py b/graphiti_core/utils/maintenance/node_operations.py index 24cdc583..9a7700ee 100644 --- a/graphiti_core/utils/maintenance/node_operations.py +++ b/graphiti_core/utils/maintenance/node_operations.py @@ -269,6 +269,27 @@ async def _resolve_with_llm( for i, node in enumerate(llm_extracted_nodes) ] + sent_ids = [ctx['id'] for ctx in extracted_nodes_context] + logger.debug( + 'Sending %d entities to LLM for deduplication with IDs 0-%d (actual IDs sent: %s)', + len(llm_extracted_nodes), + len(llm_extracted_nodes) - 1, + sent_ids if len(sent_ids) <= 20 else f'{sent_ids[:10]}...{sent_ids[-10:]}', + ) + if llm_extracted_nodes: + sample_size = min(3, len(extracted_nodes_context)) + logger.debug( + 'First %d entities: %s', + sample_size, + [(ctx['id'], ctx['name']) for ctx in extracted_nodes_context[:sample_size]], + ) + if len(extracted_nodes_context) > 3: + logger.debug( + 'Last %d entities: %s', + sample_size, + [(ctx['id'], ctx['name']) for ctx in extracted_nodes_context[-sample_size:]], + ) + existing_nodes_context = [ { **{ @@ -301,15 +322,39 @@ async def _resolve_with_llm( valid_relative_range = range(len(state.unresolved_indices)) processed_relative_ids: set[int] = set() + received_ids = {r.id for r in node_resolutions} + expected_ids = set(valid_relative_range) + missing_ids = expected_ids - received_ids + extra_ids = received_ids - expected_ids + + logger.debug( + 'Received %d resolutions for %d entities (expected %d)', + len(node_resolutions), + len(state.unresolved_indices), + len(state.unresolved_indices), + ) + + if missing_ids: + logger.warning('LLM did not return resolutions for IDs: %s', sorted(missing_ids)) + + if extra_ids: + logger.warning( + 'LLM returned invalid IDs outside valid range 0-%d: %s (all returned IDs: %s)', + len(state.unresolved_indices) - 1, + sorted(extra_ids), + sorted(received_ids), + ) + for resolution in node_resolutions: relative_id: int = resolution.id duplicate_idx: int = resolution.duplicate_idx if relative_id not in valid_relative_range: logger.warning( - 'Skipping invalid LLM dedupe id %s (unresolved indices: %s)', + 'Skipping invalid LLM dedupe id %d (valid range: 0-%d, received %d resolutions)', relative_id, - state.unresolved_indices, + len(state.unresolved_indices) - 1, + len(node_resolutions), ) continue