fix: Prevent duplicate edge facts within same episode

This fixes three related bugs that allowed verbatim duplicate edge facts:

1. Fixed LLM deduplication: Changed related_edges_context to use integer
   indices instead of UUIDs, matching the EdgeDuplicate model expectations.

2. Fixed batch deduplication: Removed episode skip in dedupe_edges_bulk
   that prevented comparing edges from the same episode. Added self-comparison
   guard to prevent edge from comparing against itself.

3. Added fast-path deduplication: Added exact string matching before parallel
   processing in resolve_extracted_edges to catch within-episode duplicates
   early, preventing race conditions where concurrent edges can't see each other.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Daniel Chalef 2025-09-30 20:34:17 -07:00
parent 4d54493064
commit 160a8a1310
3 changed files with 2094 additions and 2090 deletions

View file

@@ -434,13 +434,14 @@ async def dedupe_edges_bulk(
for i, edges_i in enumerate(extracted_edges):
existing_edges: list[EntityEdge] = []
for j, edges_j in enumerate(extracted_edges):
if i == j:
continue
existing_edges += edges_j
for edge in edges_i:
candidates: list[EntityEdge] = []
for existing_edge in existing_edges:
# Skip self-comparison
if edge.uuid == existing_edge.uuid:
continue
# Approximate BM25 by checking for word overlaps (this is faster than creating many in-memory indices)
# This approach will cast a wider net than BM25, which is ideal for this use case
if (

View file

@@ -232,6 +232,22 @@ async def resolve_extracted_edges(
edge_types: dict[str, type[BaseModel]],
edge_type_map: dict[tuple[str, str], list[str]],
) -> tuple[list[EntityEdge], list[EntityEdge]]:
# Fast path: deduplicate exact matches within the extracted edges before parallel processing
seen: dict[tuple[str, str, str], EntityEdge] = {}
deduplicated_edges: list[EntityEdge] = []
for edge in extracted_edges:
key = (
edge.source_node_uuid,
edge.target_node_uuid,
_normalize_string_exact(edge.fact),
)
if key not in seen:
seen[key] = edge
deduplicated_edges.append(edge)
extracted_edges = deduplicated_edges
driver = clients.driver
llm_client = clients.llm_client
embedder = clients.embedder
@@ -465,7 +481,7 @@ async def resolve_extracted_edge(
# Prepare context for LLM
related_edges_context = [
{'id': edge.uuid, 'fact': edge.fact} for i, edge in enumerate(related_edges)
{'id': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)
]
invalidation_edge_candidates_context = [

uv.lock (generated) — 4161 changed lines

File diff suppressed because it is too large Load diff