From fd9969b5a1244a63ef763eeeebb121c348de2042 Mon Sep 17 00:00:00 2001 From: Preston Rasmussen <109292228+prasmussen15@users.noreply.github.com> Date: Wed, 7 May 2025 23:23:31 -0400 Subject: [PATCH] Update dedupe prompt (#457) * improve dedupe logic * cut summary length * update unit tests --- graphiti_core/prompts/dedupe_nodes.py | 15 +++++++++++---- graphiti_core/prompts/extract_nodes.py | 2 +- graphiti_core/prompts/summarize_nodes.py | 8 ++++---- .../utils/maintenance/node_operations.py | 8 ++------ tests/utils/maintenance/test_edge_operations.py | 2 ++ 5 files changed, 20 insertions(+), 15 deletions(-) diff --git a/graphiti_core/prompts/dedupe_nodes.py b/graphiti_core/prompts/dedupe_nodes.py index 72718c40..1cac6b79 100644 --- a/graphiti_core/prompts/dedupe_nodes.py +++ b/graphiti_core/prompts/dedupe_nodes.py @@ -44,7 +44,7 @@ def node(context: dict[str, Any]) -> list[Message]: return [ Message( role='system', - content='You are a helpful assistant that de-duplicates entities from entity lists.', + content='You are a helpful assistant that determines whether or not a NEW ENTITY is a duplicate of any EXISTING ENTITIES.', ), Message( role='user', @@ -69,14 +69,21 @@ def node(context: dict[str, Any]) -> list[Message]: Given the above EXISTING ENTITIES and their attributes, MESSAGE, and PREVIOUS MESSAGES; Determine if the NEW ENTITY extracted from the conversation is a duplicate entity of one of the EXISTING ENTITIES. - The ENTITY TYPE DESCRIPTION gives more insight into what the entity type means for the NEW ENTITY. + Entities should only be considered duplicates if they refer to the *same real-world object or concept*. + + Do NOT mark entities as duplicates if: + - They are related but distinct. + - They have similar names or purposes but refer to separate instances or concepts. Task: If the NEW ENTITY represents a duplicate entity of any entity in EXISTING ENTITIES, set duplicate_entity_id to the - id of the EXISTING ENTITY that is the duplicate. If the NEW ENTITY is not a duplicate of any of the EXISTING ENTITIES, + id of the EXISTING ENTITY that is the duplicate. + + If the NEW ENTITY is not a duplicate of any of the EXISTING ENTITIES, duplicate_entity_id should be set to -1. - Also return the most complete name for the entity. + Also return the name that best describes the NEW ENTITY (whether it is the name of the NEW ENTITY, a node it + is a duplicate of, or a combination of the two). """, ), ] diff --git a/graphiti_core/prompts/extract_nodes.py b/graphiti_core/prompts/extract_nodes.py index 89d18841..7ad652e1 100644 --- a/graphiti_core/prompts/extract_nodes.py +++ b/graphiti_core/prompts/extract_nodes.py @@ -256,7 +256,7 @@ def extract_attributes(context: dict[str, Any]) -> list[Message]: 1. Do not hallucinate entity property values if they cannot be found in the current context. 2. Only use the provided MESSAGES and ENTITY to set attribute values. 3. The summary attribute represents a summary of the ENTITY, and should be updated with new information about the Entity from the MESSAGES. - Summaries must be no longer than 500 words. + Summaries must be no longer than 250 words. {context['node']} diff --git a/graphiti_core/prompts/summarize_nodes.py b/graphiti_core/prompts/summarize_nodes.py index a9475dc3..a15c19ba 100644 --- a/graphiti_core/prompts/summarize_nodes.py +++ b/graphiti_core/prompts/summarize_nodes.py @@ -25,7 +25,7 @@ from .models import Message, PromptFunction, PromptVersion class Summary(BaseModel): summary: str = Field( ..., - description='Summary containing the important information about the entity. Under 500 words', + description='Summary containing the important information about the entity. Under 250 words', ) @@ -56,7 +56,7 @@ def summarize_pair(context: dict[str, Any]) -> list[Message]: content=f""" Synthesize the information from the following two summaries into a single succinct summary. - Summaries must be under 500 words. + Summaries must be under 250 words. Summaries: {json.dumps(context['node_summaries'], indent=2)} @@ -82,7 +82,7 @@ def summarize_context(context: dict[str, Any]) -> list[Message]: Given the above MESSAGES and the following ENTITY name, create a summary for the ENTITY. Your summary must only use information from the provided MESSAGES. Your summary should also only contain information relevant to the - provided ENTITY. Summaries must be under 500 words. + provided ENTITY. Summaries must be under 250 words. In addition, extract any values for the provided entity properties based on their descriptions. If the value of the entity property cannot be found in the current context, set the value of the property to the Python value None. @@ -117,7 +117,7 @@ def summary_description(context: dict[str, Any]) -> list[Message]: role='user', content=f""" Create a short one sentence description of the summary that explains what kind of information is summarized. - Summaries must be under 500 words. + Summaries must be under 250 words. Summary: {json.dumps(context['summary'], indent=2)} diff --git a/graphiti_core/utils/maintenance/node_operations.py b/graphiti_core/utils/maintenance/node_operations.py index 8df4f563..c1c378e7 100644 --- a/graphiti_core/utils/maintenance/node_operations.py +++ b/graphiti_core/utils/maintenance/node_operations.py @@ -290,7 +290,6 @@ async def resolve_extracted_node( 'id': i, 'name': node.name, 'entity_types': node.labels, - 'summary': node.summary, }, **node.attributes, } @@ -384,7 +383,7 @@ async def extract_attributes_from_node( 'summary': ( str, Field( - description='Summary containing the important information about the entity. Under 500 words', + description='Summary containing the important information about the entity. Under 250 words', ), ) } @@ -436,10 +435,7 @@ async def dedupe_node_list( node_map[node.uuid] = node # Prepare context for LLM - nodes_context = [ - {'uuid': node.uuid, 'name': node.name, 'summary': node.summary, **node.attributes} - for node in nodes - ] + nodes_context = [{'uuid': node.uuid, 'name': node.name, **node.attributes} for node in nodes] context = { 'nodes': nodes_context, diff --git a/tests/utils/maintenance/test_edge_operations.py b/tests/utils/maintenance/test_edge_operations.py index bcd3ddd2..3145b74d 100644 --- a/tests/utils/maintenance/test_edge_operations.py +++ b/tests/utils/maintenance/test_edge_operations.py @@ -119,6 +119,7 @@ async def test_resolve_extracted_edge_no_changes( mock_extracted_edge, mock_related_edges, mock_existing_edges, + mock_current_episode, ) assert resolved_edge.uuid == mock_extracted_edge.uuid @@ -170,6 +171,7 @@ async def test_resolve_extracted_edge_with_invalidation( mock_extracted_edge, mock_related_edges, mock_existing_edges, + mock_current_episode, ) assert resolved_edge.uuid == mock_extracted_edge.uuid