fix: Prevent duplicate edge facts within same episode
This fixes three related bugs that allowed verbatim duplicate edge facts:

1. Fixed LLM deduplication: changed `related_edges_context` to use integer indices instead of UUIDs, matching the `EdgeDuplicate` model's expectations.
2. Fixed batch deduplication: removed the episode skip in `dedupe_edges_bulk` that prevented comparing edges from the same episode, and added a self-comparison guard so an edge is never compared against itself.
3. Added fast-path deduplication: added exact string matching before parallel processing in `resolve_extracted_edges` to catch within-episode duplicates early, preventing race conditions where concurrently processed edges cannot see each other.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
4d54493064
commit
160a8a1310
3 changed files with 2094 additions and 2090 deletions
|
|
@ -434,13 +434,14 @@ async def dedupe_edges_bulk(
|
|||
for i, edges_i in enumerate(extracted_edges):
|
||||
existing_edges: list[EntityEdge] = []
|
||||
for j, edges_j in enumerate(extracted_edges):
|
||||
if i == j:
|
||||
continue
|
||||
existing_edges += edges_j
|
||||
|
||||
for edge in edges_i:
|
||||
candidates: list[EntityEdge] = []
|
||||
for existing_edge in existing_edges:
|
||||
# Skip self-comparison
|
||||
if edge.uuid == existing_edge.uuid:
|
||||
continue
|
||||
# Approximate BM25 by checking for word overlaps (this is faster than creating many in-memory indices)
|
||||
# This approach will cast a wider net than BM25, which is ideal for this use case
|
||||
if (
|
||||
|
|
|
|||
|
|
@ -232,6 +232,22 @@ async def resolve_extracted_edges(
|
|||
edge_types: dict[str, type[BaseModel]],
|
||||
edge_type_map: dict[tuple[str, str], list[str]],
|
||||
) -> tuple[list[EntityEdge], list[EntityEdge]]:
|
||||
# Fast path: deduplicate exact matches within the extracted edges before parallel processing
|
||||
seen: dict[tuple[str, str, str], EntityEdge] = {}
|
||||
deduplicated_edges: list[EntityEdge] = []
|
||||
|
||||
for edge in extracted_edges:
|
||||
key = (
|
||||
edge.source_node_uuid,
|
||||
edge.target_node_uuid,
|
||||
_normalize_string_exact(edge.fact),
|
||||
)
|
||||
if key not in seen:
|
||||
seen[key] = edge
|
||||
deduplicated_edges.append(edge)
|
||||
|
||||
extracted_edges = deduplicated_edges
|
||||
|
||||
driver = clients.driver
|
||||
llm_client = clients.llm_client
|
||||
embedder = clients.embedder
|
||||
|
|
@ -465,7 +481,7 @@ async def resolve_extracted_edge(
|
|||
|
||||
# Prepare context for LLM
|
||||
related_edges_context = [
|
||||
{'id': edge.uuid, 'fact': edge.fact} for i, edge in enumerate(related_edges)
|
||||
{'id': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)
|
||||
]
|
||||
|
||||
invalidation_edge_candidates_context = [
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue