Update dedupe prompt (#457)
* improve dedupe logic
* cut summary length
* update unit tests
This commit is contained in:
parent
8ce9b1e157
commit
fd9969b5a1
5 changed files with 20 additions and 15 deletions
|
|
@ -44,7 +44,7 @@ def node(context: dict[str, Any]) -> list[Message]:
|
||||||
return [
|
return [
|
||||||
Message(
|
Message(
|
||||||
role='system',
|
role='system',
|
||||||
content='You are a helpful assistant that de-duplicates entities from entity lists.',
|
content='You are a helpful assistant that determines whether or not a NEW ENTITY is a duplicate of any EXISTING ENTITIES.',
|
||||||
),
|
),
|
||||||
Message(
|
Message(
|
||||||
role='user',
|
role='user',
|
||||||
|
|
@ -69,14 +69,21 @@ def node(context: dict[str, Any]) -> list[Message]:
|
||||||
Given the above EXISTING ENTITIES and their attributes, MESSAGE, and PREVIOUS MESSAGES; Determine if the NEW ENTITY extracted from the conversation
|
Given the above EXISTING ENTITIES and their attributes, MESSAGE, and PREVIOUS MESSAGES; Determine if the NEW ENTITY extracted from the conversation
|
||||||
is a duplicate entity of one of the EXISTING ENTITIES.
|
is a duplicate entity of one of the EXISTING ENTITIES.
|
||||||
|
|
||||||
The ENTITY TYPE DESCRIPTION gives more insight into what the entity type means for the NEW ENTITY.
|
Entities should only be considered duplicates if they refer to the *same real-world object or concept*.
|
||||||
|
|
||||||
|
Do NOT mark entities as duplicates if:
|
||||||
|
- They are related but distinct.
|
||||||
|
- They have similar names or purposes but refer to separate instances or concepts.
|
||||||
|
|
||||||
Task:
|
Task:
|
||||||
If the NEW ENTITY represents a duplicate entity of any entity in EXISTING ENTITIES, set duplicate_entity_id to the
|
If the NEW ENTITY represents a duplicate entity of any entity in EXISTING ENTITIES, set duplicate_entity_id to the
|
||||||
id of the EXISTING ENTITY that is the duplicate. If the NEW ENTITY is not a duplicate of any of the EXISTING ENTITIES,
|
id of the EXISTING ENTITY that is the duplicate.
|
||||||
|
|
||||||
|
If the NEW ENTITY is not a duplicate of any of the EXISTING ENTITIES,
|
||||||
duplicate_entity_id should be set to -1.
|
duplicate_entity_id should be set to -1.
|
||||||
|
|
||||||
Also return the most complete name for the entity.
|
Also return the name that best describes the NEW ENTITY (whether it is the name of the NEW ENTITY, a node it
|
||||||
|
is a duplicate of, or a combination of the two).
|
||||||
""",
|
""",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -256,7 +256,7 @@ def extract_attributes(context: dict[str, Any]) -> list[Message]:
|
||||||
1. Do not hallucinate entity property values if they cannot be found in the current context.
|
1. Do not hallucinate entity property values if they cannot be found in the current context.
|
||||||
2. Only use the provided MESSAGES and ENTITY to set attribute values.
|
2. Only use the provided MESSAGES and ENTITY to set attribute values.
|
||||||
3. The summary attribute represents a summary of the ENTITY, and should be updated with new information about the Entity from the MESSAGES.
|
3. The summary attribute represents a summary of the ENTITY, and should be updated with new information about the Entity from the MESSAGES.
|
||||||
Summaries must be no longer than 500 words.
|
Summaries must be no longer than 250 words.
|
||||||
|
|
||||||
<ENTITY>
|
<ENTITY>
|
||||||
{context['node']}
|
{context['node']}
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ from .models import Message, PromptFunction, PromptVersion
|
||||||
class Summary(BaseModel):
|
class Summary(BaseModel):
|
||||||
summary: str = Field(
|
summary: str = Field(
|
||||||
...,
|
...,
|
||||||
description='Summary containing the important information about the entity. Under 500 words',
|
description='Summary containing the important information about the entity. Under 250 words',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -56,7 +56,7 @@ def summarize_pair(context: dict[str, Any]) -> list[Message]:
|
||||||
content=f"""
|
content=f"""
|
||||||
Synthesize the information from the following two summaries into a single succinct summary.
|
Synthesize the information from the following two summaries into a single succinct summary.
|
||||||
|
|
||||||
Summaries must be under 500 words.
|
Summaries must be under 250 words.
|
||||||
|
|
||||||
Summaries:
|
Summaries:
|
||||||
{json.dumps(context['node_summaries'], indent=2)}
|
{json.dumps(context['node_summaries'], indent=2)}
|
||||||
|
|
@ -82,7 +82,7 @@ def summarize_context(context: dict[str, Any]) -> list[Message]:
|
||||||
|
|
||||||
Given the above MESSAGES and the following ENTITY name, create a summary for the ENTITY. Your summary must only use
|
Given the above MESSAGES and the following ENTITY name, create a summary for the ENTITY. Your summary must only use
|
||||||
information from the provided MESSAGES. Your summary should also only contain information relevant to the
|
information from the provided MESSAGES. Your summary should also only contain information relevant to the
|
||||||
provided ENTITY. Summaries must be under 500 words.
|
provided ENTITY. Summaries must be under 250 words.
|
||||||
|
|
||||||
In addition, extract any values for the provided entity properties based on their descriptions.
|
In addition, extract any values for the provided entity properties based on their descriptions.
|
||||||
If the value of the entity property cannot be found in the current context, set the value of the property to the Python value None.
|
If the value of the entity property cannot be found in the current context, set the value of the property to the Python value None.
|
||||||
|
|
@ -117,7 +117,7 @@ def summary_description(context: dict[str, Any]) -> list[Message]:
|
||||||
role='user',
|
role='user',
|
||||||
content=f"""
|
content=f"""
|
||||||
Create a short one sentence description of the summary that explains what kind of information is summarized.
|
Create a short one sentence description of the summary that explains what kind of information is summarized.
|
||||||
Summaries must be under 500 words.
|
Summaries must be under 250 words.
|
||||||
|
|
||||||
Summary:
|
Summary:
|
||||||
{json.dumps(context['summary'], indent=2)}
|
{json.dumps(context['summary'], indent=2)}
|
||||||
|
|
|
||||||
|
|
@ -290,7 +290,6 @@ async def resolve_extracted_node(
|
||||||
'id': i,
|
'id': i,
|
||||||
'name': node.name,
|
'name': node.name,
|
||||||
'entity_types': node.labels,
|
'entity_types': node.labels,
|
||||||
'summary': node.summary,
|
|
||||||
},
|
},
|
||||||
**node.attributes,
|
**node.attributes,
|
||||||
}
|
}
|
||||||
|
|
@ -384,7 +383,7 @@ async def extract_attributes_from_node(
|
||||||
'summary': (
|
'summary': (
|
||||||
str,
|
str,
|
||||||
Field(
|
Field(
|
||||||
description='Summary containing the important information about the entity. Under 500 words',
|
description='Summary containing the important information about the entity. Under 250 words',
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
@ -436,10 +435,7 @@ async def dedupe_node_list(
|
||||||
node_map[node.uuid] = node
|
node_map[node.uuid] = node
|
||||||
|
|
||||||
# Prepare context for LLM
|
# Prepare context for LLM
|
||||||
nodes_context = [
|
nodes_context = [{'uuid': node.uuid, 'name': node.name, **node.attributes} for node in nodes]
|
||||||
{'uuid': node.uuid, 'name': node.name, 'summary': node.summary, **node.attributes}
|
|
||||||
for node in nodes
|
|
||||||
]
|
|
||||||
|
|
||||||
context = {
|
context = {
|
||||||
'nodes': nodes_context,
|
'nodes': nodes_context,
|
||||||
|
|
|
||||||
|
|
@ -119,6 +119,7 @@ async def test_resolve_extracted_edge_no_changes(
|
||||||
mock_extracted_edge,
|
mock_extracted_edge,
|
||||||
mock_related_edges,
|
mock_related_edges,
|
||||||
mock_existing_edges,
|
mock_existing_edges,
|
||||||
|
mock_current_episode,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert resolved_edge.uuid == mock_extracted_edge.uuid
|
assert resolved_edge.uuid == mock_extracted_edge.uuid
|
||||||
|
|
@ -170,6 +171,7 @@ async def test_resolve_extracted_edge_with_invalidation(
|
||||||
mock_extracted_edge,
|
mock_extracted_edge,
|
||||||
mock_related_edges,
|
mock_related_edges,
|
||||||
mock_existing_edges,
|
mock_existing_edges,
|
||||||
|
mock_current_episode,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert resolved_edge.uuid == mock_extracted_edge.uuid
|
assert resolved_edge.uuid == mock_extracted_edge.uuid
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue