From 5ca8b9565cd4a3f93be679b1018f517c0f719a82 Mon Sep 17 00:00:00 2001 From: Daniel Chalef <131175+danielchalef@users.noreply.github.com> Date: Thu, 2 Oct 2025 12:22:07 -0700 Subject: [PATCH] fix: Improve deduplication ID validation and logging (#965) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: Improve deduplication ID validation and logging - Add comprehensive logging to verify IDs sent to LLM (sent vs received) - Enhance prompt with explicit ID bounds (0 through N-1) - Add validation warnings for missing and extra IDs from LLM responses - Improve error message clarity for invalid dedupe IDs - Log actual IDs sent to LLM to confirm no index leakage This helps diagnose cases where the LLM returns IDs outside the valid range (e.g., ID 19 when only 0-18 were sent). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * fix: Remove redundant logging parameter Address reviewer comment about redundant third parameter in debug log statement. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * fix: Address reviewer comments on list slicing and prompt clarity - Fix list slicing bug: change <= to < to avoid gap when exactly 20 elements (previously would skip element 10 when showing 21 elements) - Consolidate redundant prompt phrasing while maintaining clarity (reduced from 3 sentences to 2, keeping essential constraints) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * fix: Remove redundant prompt text to reduce token usage Consolidate 'using these exact IDs (0 through N-1)' with following sentence to eliminate repetition. Changes: - 'using these exact IDs (0 through {N-1}). Do not skip IDs or use IDs outside this range' - 'with IDs 0 through {N-1}. Do not skip or add IDs' Saves ~15 tokens per deduplication call. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --------- Co-authored-by: Claude --- graphiti_core/prompts/dedupe_nodes.py | 3 +- .../utils/maintenance/node_operations.py | 48 ++++++++++++++++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/graphiti_core/prompts/dedupe_nodes.py b/graphiti_core/prompts/dedupe_nodes.py index 31611473..84ea1317 100644 --- a/graphiti_core/prompts/dedupe_nodes.py +++ b/graphiti_core/prompts/dedupe_nodes.py @@ -166,7 +166,8 @@ def nodes(context: dict[str, Any]) -> list[Message]: - They have similar names or purposes but refer to separate instances or concepts. Task: - Respond with a JSON object that contains an "entity_resolutions" array with one entry for each entity in ENTITIES, ordered by the entity id. + ENTITIES contains {len(context['extracted_nodes'])} entities with IDs 0 through {len(context['extracted_nodes']) - 1}. + Your response MUST include EXACTLY {len(context['extracted_nodes'])} resolutions with IDs 0 through {len(context['extracted_nodes']) - 1}. Do not skip or add IDs. For every entity, return an object with the following keys: {{ diff --git a/graphiti_core/utils/maintenance/node_operations.py b/graphiti_core/utils/maintenance/node_operations.py index 24cdc583..758349c6 100644 --- a/graphiti_core/utils/maintenance/node_operations.py +++ b/graphiti_core/utils/maintenance/node_operations.py @@ -269,6 +269,27 @@ async def _resolve_with_llm( for i, node in enumerate(llm_extracted_nodes) ] + sent_ids = [ctx['id'] for ctx in extracted_nodes_context] + logger.debug( + 'Sending %d entities to LLM for deduplication with IDs 0-%d (actual IDs sent: %s)', + len(llm_extracted_nodes), + len(llm_extracted_nodes) - 1, + sent_ids if len(sent_ids) < 20 else f'{sent_ids[:10]}...{sent_ids[-10:]}', + ) + if llm_extracted_nodes: + sample_size = min(3, len(extracted_nodes_context)) + logger.debug( + 'First %d entities: %s', + sample_size, + [(ctx['id'], ctx['name']) for ctx in extracted_nodes_context[:sample_size]], + ) + if len(extracted_nodes_context) > 3: + logger.debug( + 'Last %d entities: %s', + sample_size, + [(ctx['id'], ctx['name']) for ctx in extracted_nodes_context[-sample_size:]], + ) + existing_nodes_context = [ { **{ @@ -301,15 +322,38 @@ async def _resolve_with_llm( valid_relative_range = range(len(state.unresolved_indices)) processed_relative_ids: set[int] = set() + received_ids = {r.id for r in node_resolutions} + expected_ids = set(valid_relative_range) + missing_ids = expected_ids - received_ids + extra_ids = received_ids - expected_ids + + logger.debug( + 'Received %d resolutions for %d entities', + len(node_resolutions), + len(state.unresolved_indices), + ) + + if missing_ids: + logger.warning('LLM did not return resolutions for IDs: %s', sorted(missing_ids)) + + if extra_ids: + logger.warning( + 'LLM returned invalid IDs outside valid range 0-%d: %s (all returned IDs: %s)', + len(state.unresolved_indices) - 1, + sorted(extra_ids), + sorted(received_ids), + ) + for resolution in node_resolutions: relative_id: int = resolution.id duplicate_idx: int = resolution.duplicate_idx if relative_id not in valid_relative_range: logger.warning( - 'Skipping invalid LLM dedupe id %s (unresolved indices: %s)', + 'Skipping invalid LLM dedupe id %d (valid range: 0-%d, received %d resolutions)', relative_id, - state.unresolved_indices, + len(state.unresolved_indices) - 1, + len(node_resolutions), ) continue