Update dedupe prompt (#457)

* improve dedupe logic
* cut summary length
* update unit tests
2025-05-07 23:23:31 -04:00 · 2025-05-07 23:23:31 -04:00 · fd9969b5a1
commit fd9969b5a1
parent 8ce9b1e157
5 changed files with 20 additions and 15 deletions
--- a/graphiti_core/prompts/dedupe_nodes.py
+++ b/graphiti_core/prompts/dedupe_nodes.py
@@ -44,7 +44,7 @@ def node(context: dict[str, Any]) -> list[Message]:
    return [
        Message(
            role='system',
-            content='You are a helpful assistant that de-duplicates entities from entity lists.',
+            content='You are a helpful assistant that determines whether or not a NEW ENTITY is a duplicate of any EXISTING ENTITIES.',
        ),
        Message(
            role='user',
@@ -69,14 +69,21 @@ def node(context: dict[str, Any]) -> list[Message]:
        Given the above EXISTING ENTITIES and their attributes, MESSAGE, and PREVIOUS MESSAGES; Determine if the NEW ENTITY extracted from the conversation
        is a duplicate entity of one of the EXISTING ENTITIES.
        
-        The ENTITY TYPE DESCRIPTION gives more insight into what the entity type means for the NEW ENTITY.
+        Entities should only be considered duplicates if they refer to the *same real-world object or concept*.
+
+        Do NOT mark entities as duplicates if:
+        - They are related but distinct.
+        - They have similar names or purposes but refer to separate instances or concepts.

        Task:
        If the NEW ENTITY represents a duplicate entity of any entity in EXISTING ENTITIES, set duplicate_entity_id to the
-        id of the EXISTING ENTITY that is the duplicate. If the NEW ENTITY is not a duplicate of any of the EXISTING ENTITIES,
+        id of the EXISTING ENTITY that is the duplicate. 
+        
+        If the NEW ENTITY is not a duplicate of any of the EXISTING ENTITIES,
        duplicate_entity_id should be set to -1.
        
-        Also return the most complete name for the entity.
+        Also return the name that best describes the NEW ENTITY (whether it is the name of the NEW ENTITY, a node it
+        is a duplicate of, or a combination of the two).
        """,
        ),
    ]
--- a/graphiti_core/prompts/extract_nodes.py
+++ b/graphiti_core/prompts/extract_nodes.py
@@ -256,7 +256,7 @@ def extract_attributes(context: dict[str, Any]) -> list[Message]:
        1. Do not hallucinate entity property values if they cannot be found in the current context.
        2. Only use the provided MESSAGES and ENTITY to set attribute values.
        3. The summary attribute represents a summary of the ENTITY, and should be updated with new information about the Entity from the MESSAGES. 
-            Summaries must be no longer than 500 words.
+            Summaries must be no longer than 250 words.
        
        <ENTITY>
        {context['node']}
--- a/graphiti_core/prompts/summarize_nodes.py
+++ b/graphiti_core/prompts/summarize_nodes.py
@@ -25,7 +25,7 @@ from .models import Message, PromptFunction, PromptVersion
 class Summary(BaseModel):
    summary: str = Field(
        ...,
-        description='Summary containing the important information about the entity. Under 500 words',
+        description='Summary containing the important information about the entity. Under 250 words',
    )


@@ -56,7 +56,7 @@ def summarize_pair(context: dict[str, Any]) -> list[Message]:
            content=f"""
        Synthesize the information from the following two summaries into a single succinct summary.
        
-        Summaries must be under 500 words.
+        Summaries must be under 250 words.

        Summaries:
        {json.dumps(context['node_summaries'], indent=2)}
@@ -82,7 +82,7 @@ def summarize_context(context: dict[str, Any]) -> list[Message]:
        
        Given the above MESSAGES and the following ENTITY name, create a summary for the ENTITY. Your summary must only use
        information from the provided MESSAGES. Your summary should also only contain information relevant to the
-        provided ENTITY. Summaries must be under 500 words.
+        provided ENTITY. Summaries must be under 250 words.
        
        In addition, extract any values for the provided entity properties based on their descriptions.
        If the value of the entity property cannot be found in the current context, set the value of the property to the Python value None.
@@ -117,7 +117,7 @@ def summary_description(context: dict[str, Any]) -> list[Message]:
            role='user',
            content=f"""
        Create a short one sentence description of the summary that explains what kind of information is summarized.
-        Summaries must be under 500 words.
+        Summaries must be under 250 words.

        Summary:
        {json.dumps(context['summary'], indent=2)}
--- a/graphiti_core/utils/maintenance/node_operations.py
+++ b/graphiti_core/utils/maintenance/node_operations.py
@@ -290,7 +290,6 @@ async def resolve_extracted_node(
                'id': i,
                'name': node.name,
                'entity_types': node.labels,
-                'summary': node.summary,
            },
            **node.attributes,
        }
@@ -384,7 +383,7 @@ async def extract_attributes_from_node(
        'summary': (
            str,
            Field(
-                description='Summary containing the important information about the entity. Under 500 words',
+                description='Summary containing the important information about the entity. Under 250 words',
            ),
        )
    }
@@ -436,10 +435,7 @@ async def dedupe_node_list(
        node_map[node.uuid] = node

    # Prepare context for LLM
-    nodes_context = [
-        {'uuid': node.uuid, 'name': node.name, 'summary': node.summary, **node.attributes}
-        for node in nodes
-    ]
+    nodes_context = [{'uuid': node.uuid, 'name': node.name, **node.attributes} for node in nodes]

    context = {
        'nodes': nodes_context,
--- a/tests/utils/maintenance/test_edge_operations.py
+++ b/tests/utils/maintenance/test_edge_operations.py
@@ -119,6 +119,7 @@ async def test_resolve_extracted_edge_no_changes(
        mock_extracted_edge,
        mock_related_edges,
        mock_existing_edges,
+        mock_current_episode,
    )

    assert resolved_edge.uuid == mock_extracted_edge.uuid
@@ -170,6 +171,7 @@ async def test_resolve_extracted_edge_with_invalidation(
        mock_extracted_edge,
        mock_related_edges,
        mock_existing_edges,
+        mock_current_episode,
    )

    assert resolved_edge.uuid == mock_extracted_edge.uuid