fix: Prevent duplicate edge facts within same episode

This fixes three related bugs that allowed verbatim duplicate edge facts:

1. Fixed LLM deduplication: Changed related_edges_context to use integer
   indices instead of UUIDs, matching the EdgeDuplicate model expectations.

2. Fixed batch deduplication: Removed episode skip in dedupe_edges_bulk
   that prevented comparing edges from the same episode. Added self-comparison
   guard to prevent edge from comparing against itself.

3. Added fast-path deduplication: Added exact string matching before parallel
   processing in resolve_extracted_edges to catch within-episode duplicates
   early, preventing race conditions where concurrent edges can't see each other.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Daniel Chalef 2025-09-30 20:34:17 -07:00
parent 4d54493064
commit 160a8a1310
3 changed files with 2094 additions and 2090 deletions

View file

@@ -434,13 +434,14 @@ async def dedupe_edges_bulk(
for i, edges_i in enumerate(extracted_edges):
existing_edges: list[EntityEdge] = []
for j, edges_j in enumerate(extracted_edges):
if i == j:
continue
existing_edges += edges_j
for edge in edges_i:
candidates: list[EntityEdge] = []
for existing_edge in existing_edges:
# Skip self-comparison
if edge.uuid == existing_edge.uuid:
continue
# Approximate BM25 by checking for word overlaps (this is faster than creating many in-memory indices)
# This approach will cast a wider net than BM25, which is ideal for this use case
if (

View file

@@ -232,6 +232,22 @@ async def resolve_extracted_edges(
edge_types: dict[str, type[BaseModel]],
edge_type_map: dict[tuple[str, str], list[str]],
) -> tuple[list[EntityEdge], list[EntityEdge]]:
# Fast path: deduplicate exact matches within the extracted edges before parallel processing
seen: dict[tuple[str, str, str], EntityEdge] = {}
deduplicated_edges: list[EntityEdge] = []
for edge in extracted_edges:
key = (
edge.source_node_uuid,
edge.target_node_uuid,
_normalize_string_exact(edge.fact),
)
if key not in seen:
seen[key] = edge
deduplicated_edges.append(edge)
extracted_edges = deduplicated_edges
driver = clients.driver
llm_client = clients.llm_client
embedder = clients.embedder
@@ -465,7 +481,7 @@ async def resolve_extracted_edge(
# Prepare context for LLM
related_edges_context = [
{'id': edge.uuid, 'fact': edge.fact} for i, edge in enumerate(related_edges)
{'id': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)
]
invalidation_edge_candidates_context = [

uv.lock (generated) — 4161 changed lines

File diff suppressed because it is too large Load diff