fix: Prevent duplicate edge facts within same episode
This fixes three related bugs that allowed verbatim duplicate edge facts:

1. Fixed LLM deduplication: Changed related_edges_context to use integer indices instead of UUIDs, matching the EdgeDuplicate model expectations.
2. Fixed batch deduplication: Removed the episode skip in dedupe_edges_bulk that prevented comparing edges from the same episode. Added a self-comparison guard to prevent an edge from comparing against itself.
3. Added fast-path deduplication: Added exact string matching before parallel processing in resolve_extracted_edges to catch within-episode duplicates early, preventing race conditions where concurrent edges can't see each other.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
4d54493064
commit
160a8a1310
3 changed files with 2094 additions and 2090 deletions
|
|
@@ -434,13 +434,14 @@ async def dedupe_edges_bulk(
     for i, edges_i in enumerate(extracted_edges):
         existing_edges: list[EntityEdge] = []
         for j, edges_j in enumerate(extracted_edges):
-            if i == j:
-                continue
             existing_edges += edges_j

         for edge in edges_i:
             candidates: list[EntityEdge] = []
             for existing_edge in existing_edges:
+                # Skip self-comparison
+                if edge.uuid == existing_edge.uuid:
+                    continue
                 # Approximate BM25 by checking for word overlaps (this is faster than creating many in-memory indices)
                 # This approach will cast a wider net than BM25, which is ideal for this use case
                 if (
@@ -232,6 +232,22 @@ async def resolve_extracted_edges(
     edge_types: dict[str, type[BaseModel]],
     edge_type_map: dict[tuple[str, str], list[str]],
 ) -> tuple[list[EntityEdge], list[EntityEdge]]:
+    # Fast path: deduplicate exact matches within the extracted edges before parallel processing
+    seen: dict[tuple[str, str, str], EntityEdge] = {}
+    deduplicated_edges: list[EntityEdge] = []
+
+    for edge in extracted_edges:
+        key = (
+            edge.source_node_uuid,
+            edge.target_node_uuid,
+            _normalize_string_exact(edge.fact),
+        )
+        if key not in seen:
+            seen[key] = edge
+            deduplicated_edges.append(edge)
+
+    extracted_edges = deduplicated_edges
+
     driver = clients.driver
     llm_client = clients.llm_client
     embedder = clients.embedder
@@ -465,7 +481,7 @@ async def resolve_extracted_edge(

     # Prepare context for LLM
     related_edges_context = [
-        {'id': edge.uuid, 'fact': edge.fact} for i, edge in enumerate(related_edges)
+        {'id': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)
     ]

     invalidation_edge_candidates_context = [
Loading…
Add table
Reference in a new issue