Update dedupe prompt (#457)

* improve dedupe logic

* cut summary length

* update unit tests
This commit is contained in:
Preston Rasmussen 2025-05-07 23:23:31 -04:00 committed by GitHub
parent 8ce9b1e157
commit fd9969b5a1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 20 additions and 15 deletions

View file

@@ -44,7 +44,7 @@ def node(context: dict[str, Any]) -> list[Message]:
return [
Message(
role='system',
content='You are a helpful assistant that de-duplicates entities from entity lists.',
content='You are a helpful assistant that determines whether or not a NEW ENTITY is a duplicate of any EXISTING ENTITIES.',
),
Message(
role='user',
@@ -69,14 +69,21 @@ def node(context: dict[str, Any]) -> list[Message]:
Given the above EXISTING ENTITIES and their attributes, MESSAGE, and PREVIOUS MESSAGES; Determine if the NEW ENTITY extracted from the conversation
is a duplicate entity of one of the EXISTING ENTITIES.
The ENTITY TYPE DESCRIPTION gives more insight into what the entity type means for the NEW ENTITY.
Entities should only be considered duplicates if they refer to the *same real-world object or concept*.
Do NOT mark entities as duplicates if:
- They are related but distinct.
- They have similar names or purposes but refer to separate instances or concepts.
Task:
If the NEW ENTITY represents a duplicate entity of any entity in EXISTING ENTITIES, set duplicate_entity_id to the
id of the EXISTING ENTITY that is the duplicate. If the NEW ENTITY is not a duplicate of any of the EXISTING ENTITIES,
id of the EXISTING ENTITY that is the duplicate.
If the NEW ENTITY is not a duplicate of any of the EXISTING ENTITIES,
duplicate_entity_id should be set to -1.
Also return the most complete name for the entity.
Also return the name that best describes the NEW ENTITY (whether it is the name of the NEW ENTITY, a node it
is a duplicate of, or a combination of the two).
""",
),
]

View file

@@ -256,7 +256,7 @@ def extract_attributes(context: dict[str, Any]) -> list[Message]:
1. Do not hallucinate entity property values if they cannot be found in the current context.
2. Only use the provided MESSAGES and ENTITY to set attribute values.
3. The summary attribute represents a summary of the ENTITY, and should be updated with new information about the Entity from the MESSAGES.
Summaries must be no longer than 500 words.
Summaries must be no longer than 250 words.
<ENTITY>
{context['node']}

View file

@@ -25,7 +25,7 @@ from .models import Message, PromptFunction, PromptVersion
class Summary(BaseModel):
summary: str = Field(
...,
description='Summary containing the important information about the entity. Under 500 words',
description='Summary containing the important information about the entity. Under 250 words',
)
@@ -56,7 +56,7 @@ def summarize_pair(context: dict[str, Any]) -> list[Message]:
content=f"""
Synthesize the information from the following two summaries into a single succinct summary.
Summaries must be under 500 words.
Summaries must be under 250 words.
Summaries:
{json.dumps(context['node_summaries'], indent=2)}
@@ -82,7 +82,7 @@ def summarize_context(context: dict[str, Any]) -> list[Message]:
Given the above MESSAGES and the following ENTITY name, create a summary for the ENTITY. Your summary must only use
information from the provided MESSAGES. Your summary should also only contain information relevant to the
provided ENTITY. Summaries must be under 500 words.
provided ENTITY. Summaries must be under 250 words.
In addition, extract any values for the provided entity properties based on their descriptions.
If the value of the entity property cannot be found in the current context, set the value of the property to the Python value None.
@@ -117,7 +117,7 @@ def summary_description(context: dict[str, Any]) -> list[Message]:
role='user',
content=f"""
Create a short one sentence description of the summary that explains what kind of information is summarized.
Summaries must be under 500 words.
Summaries must be under 250 words.
Summary:
{json.dumps(context['summary'], indent=2)}

View file

@@ -290,7 +290,6 @@ async def resolve_extracted_node(
'id': i,
'name': node.name,
'entity_types': node.labels,
'summary': node.summary,
},
**node.attributes,
}
@@ -384,7 +383,7 @@ async def extract_attributes_from_node(
'summary': (
str,
Field(
description='Summary containing the important information about the entity. Under 500 words',
description='Summary containing the important information about the entity. Under 250 words',
),
)
}
@@ -436,10 +435,7 @@ async def dedupe_node_list(
node_map[node.uuid] = node
# Prepare context for LLM
nodes_context = [
{'uuid': node.uuid, 'name': node.name, 'summary': node.summary, **node.attributes}
for node in nodes
]
nodes_context = [{'uuid': node.uuid, 'name': node.name, **node.attributes} for node in nodes]
context = {
'nodes': nodes_context,

View file

@@ -119,6 +119,7 @@ async def test_resolve_extracted_edge_no_changes(
mock_extracted_edge,
mock_related_edges,
mock_existing_edges,
mock_current_episode,
)
assert resolved_edge.uuid == mock_extracted_edge.uuid
@@ -170,6 +171,7 @@ async def test_resolve_extracted_edge_with_invalidation(
mock_extracted_edge,
mock_related_edges,
mock_existing_edges,
mock_current_episode,
)
assert resolved_edge.uuid == mock_extracted_edge.uuid