diff --git a/graphiti_core/prompts/dedupe_nodes.py b/graphiti_core/prompts/dedupe_nodes.py index 31611473..84ea1317 100644 --- a/graphiti_core/prompts/dedupe_nodes.py +++ b/graphiti_core/prompts/dedupe_nodes.py @@ -166,7 +166,8 @@ def nodes(context: dict[str, Any]) -> list[Message]: - They have similar names or purposes but refer to separate instances or concepts. Task: - Respond with a JSON object that contains an "entity_resolutions" array with one entry for each entity in ENTITIES, ordered by the entity id. + ENTITIES contains {len(context['extracted_nodes'])} entities with IDs 0 through {len(context['extracted_nodes']) - 1}. + Your response MUST include EXACTLY {len(context['extracted_nodes'])} resolutions with IDs 0 through {len(context['extracted_nodes']) - 1}. Do not skip or add IDs. For every entity, return an object with the following keys: {{ diff --git a/graphiti_core/utils/maintenance/node_operations.py b/graphiti_core/utils/maintenance/node_operations.py index 24cdc583..758349c6 100644 --- a/graphiti_core/utils/maintenance/node_operations.py +++ b/graphiti_core/utils/maintenance/node_operations.py @@ -269,6 +269,27 @@ async def _resolve_with_llm( for i, node in enumerate(llm_extracted_nodes) ] + sent_ids = [ctx['id'] for ctx in extracted_nodes_context] + logger.debug( + 'Sending %d entities to LLM for deduplication with IDs 0-%d (actual IDs sent: %s)', + len(llm_extracted_nodes), + len(llm_extracted_nodes) - 1, + sent_ids if len(sent_ids) < 20 else f'{sent_ids[:10]}...{sent_ids[-10:]}', + ) + if llm_extracted_nodes: + sample_size = min(3, len(extracted_nodes_context)) + logger.debug( + 'First %d entities: %s', + sample_size, + [(ctx['id'], ctx['name']) for ctx in extracted_nodes_context[:sample_size]], + ) + if len(extracted_nodes_context) > 3: + logger.debug( + 'Last %d entities: %s', + sample_size, + [(ctx['id'], ctx['name']) for ctx in extracted_nodes_context[-sample_size:]], + ) + existing_nodes_context = [ { **{ @@ -301,15 +322,38 @@ async def _resolve_with_llm( valid_relative_range = range(len(state.unresolved_indices)) processed_relative_ids: set[int] = set() + received_ids = {r.id for r in node_resolutions} + expected_ids = set(valid_relative_range) + missing_ids = expected_ids - received_ids + extra_ids = received_ids - expected_ids + + logger.debug( + 'Received %d resolutions for %d entities', + len(node_resolutions), + len(state.unresolved_indices), + ) + + if missing_ids: + logger.warning('LLM did not return resolutions for IDs: %s', sorted(missing_ids)) + + if extra_ids: + logger.warning( + 'LLM returned invalid IDs outside valid range 0-%d: %s (all returned IDs: %s)', + len(state.unresolved_indices) - 1, + sorted(extra_ids), + sorted(received_ids), + ) + for resolution in node_resolutions: relative_id: int = resolution.id duplicate_idx: int = resolution.duplicate_idx if relative_id not in valid_relative_range: logger.warning( - 'Skipping invalid LLM dedupe id %s (unresolved indices: %s)', + 'Skipping invalid LLM dedupe id %d (valid range: 0-%d, received %d resolutions)', relative_id, - state.unresolved_indices, + len(state.unresolved_indices) - 1, + len(node_resolutions), ) continue