cognee/evals/plot_metrics.py

import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats


def calculate_confidence_interval(accuracies, confidence=0.95):
    """Calculate mean and confidence interval for a list of accuracies."""
    if not accuracies:
        return 0, 0, 0

    mean = np.mean(accuracies)
    if len(accuracies) < 2:
        return mean, mean, mean

    ci = stats.t.interval(confidence, len(accuracies) - 1, loc=mean, scale=stats.sem(accuracies))
    return mean, ci[0], ci[1]
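
# Usage sketch (hypothetical values, not taken from any results file): for three
# accuracy scores [0.8, 0.9, 0.7] this returns the mean together with the bounds
# of a 95% t-interval, roughly (0.8, 0.55, 1.05).
#     mean, ci_low, ci_high = calculate_confidence_interval([0.8, 0.9, 0.7])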


def load_human_eval_metrics(system_dir):
    """Load and calculate metrics from human evaluation JSON files."""
    human_eval_patterns = ["human_eval", "huma_eval"]
    metrics = {}

    for pattern in human_eval_patterns:
        for file in system_dir.glob(f"*{pattern}*.json"):
            try:
                with open(file) as f:
                    data = json.load(f)

                scores = [item["metrics"]["humaneval"]["score"] for item in data]
                if scores:
                    mean, ci_low, ci_high = calculate_confidence_interval(scores)
                    metrics["Human-LLM Correctness"] = {
                        "mean": mean,
                        "ci_low": ci_low,
                        "ci_high": ci_high,
                    }
                    print(
                        f"Found human eval metrics in {file}: mean={mean:.4f}, CI=[{ci_low:.4f}, {ci_high:.4f}]"
                    )
                    break
            except Exception as e:
                print(f"Error loading {file}: {e}")

    return metrics
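
# Shape of the human-eval JSON files this function expects, inferred from the
# parsing above (other fields may be present but are ignored; the scores shown
# are placeholders):
# [
#     {"metrics": {"humaneval": {"score": 0.9}}},
#     {"metrics": {"humaneval": {"score": 0.7}}}
# ]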


def load_metrics(system_dir):
    """Load metrics from a system directory."""
    metrics = {}
    system_name_lower = system_dir.name.split("_")[0].lower()
    is_optimized = "optimized" in system_dir.name.lower()

    # --- Human-LLM Correctness Loading (Standard Check First) ---
    human_metrics = load_human_eval_metrics(system_dir)
    # ^ This loads from dedicated *_human_eval.json files if they exist
    metrics.update(human_metrics)

    # --- DeepEval Metrics & Special Cognee Optimized Handling ---
    metrics_file = None

    if system_name_lower == "graphiti":
        metrics_file = system_dir / "aggregate_metrics_graphiti.json"
        print(f"Processing Graphiti DeepEval from {metrics_file}")
    elif system_name_lower == "mem0":
        metrics_file = system_dir / "aggregate_metrics_mem0.json"
        print(f"Processing Mem0 DeepEval from {metrics_file}")
    elif system_name_lower == "cognee":
        if is_optimized:
            # Files for Cognee Optimized (dreamify)
            deepeval_file = system_dir / "aggregate_metrics_4o_cognee_10.json"
            if not deepeval_file.exists():
                deepeval_file = system_dir / "aggregate_metrics_4o_cognee_10_short.json"  # Fallback

            # DeepEval metrics file (including Correctness, F1, EM)
            deepeval_metrics_file = (
                system_dir / "aggregate_metrics_4o_cognee_10_short_deepeval.json"
            )
            print("Processing Cognee Optimized (dreamify)")

            # --- 1. Load Human-LLM from main file ---
            if deepeval_file.exists():
                try:
                    with open(deepeval_file) as f:
                        data = json.load(f)

                    # Load Human-LLM Correctness from the correctness field in main file
                    if (
                        "correctness" in data
                        and isinstance(data["correctness"], dict)
                        and "mean" in data["correctness"]
                    ):
                        metrics["Human-LLM Correctness"] = {
                            "mean": data["correctness"]["mean"],
                            "ci_low": data["correctness"].get(
                                "ci_lower", data["correctness"]["mean"]
                            ),
                            "ci_high": data["correctness"].get(
                                "ci_upper", data["correctness"]["mean"]
                            ),
                        }
                        print(
                            f"Found Human-LLM Correctness in {deepeval_file}: mean={metrics['Human-LLM Correctness']['mean']:.4f}"
                        )
                except Exception as e:
                    print(f"Error loading metrics from {deepeval_file}: {e}")

            # --- 2. Load ALL DeepEval metrics (Correctness, F1, EM) from the dedicated file ---
            if deepeval_metrics_file.exists():
                try:
                    with open(deepeval_metrics_file) as f:
                        data = json.load(f)

                    # Look for ALL metrics in the deepeval file
                    deepeval_metrics = {"correctness": "Correctness", "f1": "F1", "EM": "EM"}
                    for key, display_name in deepeval_metrics.items():
                        metric_key = f"DeepEval {display_name}"
                        if key in data and isinstance(data[key], dict) and "mean" in data[key]:
                            metrics[metric_key] = {
                                "mean": data[key]["mean"],
                                "ci_low": data[key].get("ci_lower", data[key]["mean"]),
                                "ci_high": data[key].get("ci_upper", data[key]["mean"]),
                            }
                            print(
                                f"Found DeepEval {display_name} in {deepeval_metrics_file}: mean={metrics[metric_key]['mean']:.4f}"
                            )
                except Exception as e:
                    print(f"Error loading DeepEval metrics from {deepeval_metrics_file}: {e}")
        else:  # Regular Cognee
            metrics_file = system_dir / "aggregate_metrics_v_deepeval.json"
            print(f"Processing Regular Cognee DeepEval from {metrics_file}")

    # Common logic to load DeepEval metrics (Correctness, F1, EM) from the determined file
    if metrics_file and metrics_file.exists():
        try:
            with open(metrics_file) as f:
                data = json.load(f)

            standard_metrics_keys = {"correctness": "Correctness", "f1": "F1", "EM": "EM"}
            for key, display_name in standard_metrics_keys.items():
                metric_key = f"DeepEval {display_name}"
                if key in data and isinstance(data[key], dict) and "mean" in data[key]:
                    # Load DeepEval metric if found
                    metrics[metric_key] = {
                        "mean": data[key]["mean"],
                        "ci_low": data[key].get("ci_lower", data[key]["mean"]),
                        "ci_high": data[key].get("ci_upper", data[key]["mean"]),
                    }
                    print(
                        f"Found DeepEval metrics in {metrics_file}: {key}={metrics[metric_key]['mean']:.4f}"
                    )
        except Exception as e:
            print(f"Error loading DeepEval metrics from {metrics_file}: {e}")
    elif metrics_file:
        print(f"DeepEval metrics file not found: {metrics_file}")

    # Make sure all standard metrics exist with defaults if missing
    all_expected_metrics = [
        "Human-LLM Correctness",
        "DeepEval Correctness",
        "DeepEval F1",
        "DeepEval EM",
    ]
    for metric_name in all_expected_metrics:
        if metric_name not in metrics:
            metrics[metric_name] = {"mean": 0.0, "ci_low": 0.0, "ci_high": 0.0}
            print(f"Added default for missing metric: {metric_name}")

    return metrics
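
# Shape of the aggregate metrics JSON files read above, inferred from the keys
# this function accesses; ci_lower/ci_upper are optional and fall back to the
# mean when absent. The numbers below are placeholders, not real results:
# {
#     "correctness": {"mean": 0.82, "ci_lower": 0.78, "ci_upper": 0.86},
#     "f1": {"mean": 0.65, "ci_lower": 0.60, "ci_upper": 0.70},
#     "EM": {"mean": 0.40, "ci_lower": 0.35, "ci_upper": 0.45}
# }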
def plot_metrics(all_systems_metrics, output_file="metrics_comparison.png"):
"""Plot metrics comparison."""
if not all_systems_metrics:
print("No metrics found to plot")
return
# Set style
plt.style.use("seaborn-v0_8")
sns.set_theme(style="whitegrid")
# Cognee brand colors
brand_colors = {
"data_dream_violet": "#6510F4",
"data_flux_green": "#0DFF00",
"secondary_purple": "#A550FF",
"abyss_black": "#000000",
"data_cloud_grey": "#F4F4F4",
"dark_grey": "#323332",
}
# Color palette using Cognee brand colors
colors = [
brand_colors["data_flux_green"],
brand_colors["data_dream_violet"],
brand_colors["secondary_purple"],
brand_colors["dark_grey"],
]
# Prepare data with custom ordering (Cognee first, then Graphiti)
preferred_order = ["Cognee", "Graphiti", "Mem0", "Falkor"]
systems = [system for system in preferred_order if system in all_systems_metrics]
# Add any systems not in preferred order at the end
for system in all_systems_metrics.keys():
if system not in systems:
systems.append(system)
metrics = set()
for system_metrics in all_systems_metrics.values():
metrics.update(system_metrics.keys())
# Sort metrics by average score across systems (highest to lowest)
def get_metric_avg_score(metric):
scores = []
for system in systems:
if metric in all_systems_metrics[system]:
scores.append(all_systems_metrics[system][metric]["mean"])
return np.mean(scores) if scores else 0
metrics = sorted(list(metrics), key=get_metric_avg_score, reverse=True)
# Set up the plot with Cognee brand styling
fig, ax = plt.subplots(figsize=(15, 8), facecolor=brand_colors["data_cloud_grey"])
ax.set_facecolor(brand_colors["data_cloud_grey"])
# Plot bars
x = np.arange(len(systems))
width = 0.8 / len(metrics)
for i, metric in enumerate(metrics):
means = []
yerr_low = []
yerr_high = []
for system in systems:
if metric in all_systems_metrics[system]:
m = all_systems_metrics[system][metric]
means.append(m["mean"])
yerr_low.append(m["mean"] - m["ci_low"])
yerr_high.append(m["ci_high"] - m["mean"])
else:
means.append(0)
yerr_low.append(0)
yerr_high.append(0)
yerr = [yerr_low, yerr_high]
ax.bar(
x + i * width - (len(metrics) - 1) * width / 2,
means,
width,
label=metric,
color=colors[i % len(colors)],
alpha=0.85,
yerr=yerr,
capsize=4,
error_kw={
"elinewidth": 1.5,
"capthick": 1.5,
"ecolor": brand_colors["dark_grey"],
"alpha": 0.5,
},
)
# Customize plot with Cognee styling
ax.set_ylabel("Score", fontsize=14, fontweight="bold", color=brand_colors["abyss_black"])
ax.set_title(
"AI Memory - Benchmark Results",
fontsize=18,
pad=20,
fontweight="bold",
color=brand_colors["data_dream_violet"],
)
ax.set_xticks(x)
ax.set_xticklabels(
systems,
rotation=45,
ha="right",
fontsize=12,
fontweight="bold",
color=brand_colors["abyss_black"],
)
ax.tick_params(axis="y", labelsize=11, colors=brand_colors["abyss_black"])
# Set y-axis limits with some padding
ax.set_ylim(0, 1.1)
# Add grid
ax.yaxis.grid(True, linestyle="--", alpha=0.5, color=brand_colors["dark_grey"])
ax.set_axisbelow(True)
# Customize legend
legend = ax.legend(
bbox_to_anchor=(1.05, 1),
loc="upper left",
fontsize=12,
frameon=True,
fancybox=True,
shadow=True,
title="Metrics",
title_fontsize=14,
)
# Style the legend text with brand colors
plt.setp(legend.get_title(), fontweight="bold", color=brand_colors["data_dream_violet"])
# Add value labels on top of bars with improved visibility
for i, metric in enumerate(metrics):
for j, system in enumerate(systems):
if metric in all_systems_metrics[system]:
value = all_systems_metrics[system][metric]["mean"]
if value > 0: # Only show label if value is greater than 0
# Create a small white background for the text to improve legibility
ax.text(
j + i * width - (len(metrics) - 1) * width / 2,
value + 0.02,
f"{value:.2f}",
ha="center",
va="bottom",
fontsize=11,
fontweight="bold",
color=brand_colors["data_dream_violet"],
bbox=dict(facecolor="white", alpha=0.7, pad=1, edgecolor="none"),
)
# Add border to the plot
for spine in ax.spines.values():
spine.set_edgecolor(brand_colors["dark_grey"])
spine.set_linewidth(1.5)
# Adjust layout
plt.tight_layout()
# Define output file paths
output_base = output_file.rsplit(".", 1)[0]
output_ext = output_file.rsplit(".", 1)[1] if "." in output_file else "png"
logo_output_file = f"{output_base}_with_logo.{output_ext}"
# Save plot first without logo
plt.savefig(output_file, bbox_inches="tight", dpi=300)
# Now add logo and save again
try:
# Try to find the logo file
logo_path = Path("../assets/cognee-logo-transparent.png")
if not logo_path.exists():
logo_path = Path("../assets/cognee_logo.png")
if logo_path.exists():
# Create a new figure with the same size
height, width = fig.get_size_inches()
fig_with_logo = plt.figure(
figsize=(height, width), facecolor=brand_colors["data_cloud_grey"]
)
# First, plot the saved chart as a background
chart_img = plt.imread(output_file)
chart_ax = fig_with_logo.add_subplot(111)
chart_ax.imshow(chart_img)
chart_ax.axis("off")
# Now overlay the logo with transparency
logo_img = plt.imread(str(logo_path))
# Position logo in the upper part of the chart with current horizontal position
# Keep horizontal position (0.65) but move back to upper part of chart
logo_ax = fig_with_logo.add_axes([0.65, 0.75, 0.085, 0.085], zorder=1)
logo_ax.imshow(logo_img, alpha=0.45) # Same opacity
logo_ax.axis("off") # Turn off axis
# Save the combined image
fig_with_logo.savefig(logo_output_file, dpi=300, bbox_inches="tight")
plt.close(fig_with_logo)
# Replace the original file with the logo version
import os
os.replace(logo_output_file, output_file)
except Exception as e:
print(f"Warning: Could not add logo overlay - {e}")
plt.close(fig)


def main():
    """Main function to process metrics and generate plot."""
    eval_dir = Path(".")
    all_systems_metrics = {}

    # Process each system directory
    for system_dir in eval_dir.glob("*_01042025"):
        print(f"\nChecking system directory: {system_dir}")
        system_name = system_dir.name.split("_")[0].capitalize()
        metrics = load_metrics(system_dir)

        # Special handling for cognee_optimized
        if "optimized" in system_dir.name.lower():
            system_name = "Cognee (dreamify)"

        if metrics:
            all_systems_metrics[system_name] = metrics
            print(f"Found metrics for {system_name}: {metrics}")

    # Plot cognee comparison if both regular and optimized are present
    if "Cognee" in all_systems_metrics and "Cognee (dreamify)" in all_systems_metrics:
        print("\nGenerating Cognee vs Cognee (dreamify) comparison plot.")
        cognee_metrics = {
            "Cognee": all_systems_metrics["Cognee"],
            "Cognee (dreamify)": all_systems_metrics["Cognee (dreamify)"],
        }
        plot_metrics(cognee_metrics, output_file="cognee_comparison.png")

    print(f"\nAll systems metrics: {all_systems_metrics}")
    if not all_systems_metrics:
        print("No metrics data found!")
        return

    # Plot metrics for all systems - excluding both Falkor and Cognee (dreamify)
    systems_for_comparison = {}
    for system_name, system_metrics in all_systems_metrics.items():
        if system_name not in ["Cognee (dreamify)", "Falkor"]:
            systems_for_comparison[system_name] = system_metrics

    if systems_for_comparison:
        print("\nGenerating main metrics comparison (excluding Falkor and Cognee dreamify)")
        plot_metrics(systems_for_comparison, output_file="metrics_comparison.png")


if __name__ == "__main__":
    main()
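
# Usage sketch: run this script from the evals results directory, which is
# assumed to contain one folder per system matching *_01042025 (e.g.
# cognee_01042025, graphiti_01042025). It writes metrics_comparison.png and,
# when both Cognee runs are present, cognee_comparison.png alongside it.
#     python plot_metrics.py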