"""Load benchmark metrics for several memory systems and plot a comparison.

Each system lives in a directory named ``<system>_01042025``.  Metrics are
read from JSON files inside that directory and rendered as a grouped bar
chart with confidence-interval error bars.
"""

import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats

# JSON key -> display suffix for the DeepEval metrics read from aggregate files.
_DEEPEVAL_KEYS = {"correctness": "Correctness", "f1": "F1", "EM": "EM"}

# Every system's result is padded with zeroed entries for these names.
_EXPECTED_METRICS = (
    "Human-LLM Correctness",
    "DeepEval Correctness",
    "DeepEval F1",
    "DeepEval EM",
)


def calculate_confidence_interval(accuracies, confidence=0.95):
    """Calculate mean and confidence interval for a list of accuracies.

    Args:
        accuracies: Sequence (list, tuple or NumPy array) of numeric scores.
        confidence: Confidence level of the Student-t interval.

    Returns:
        A ``(mean, ci_low, ci_high)`` tuple.  ``(0, 0, 0)`` for empty or
        ``None`` input; ``(mean, mean, mean)`` when an interval cannot be
        estimated (fewer than two samples, or zero spread).
    """
    if accuracies is None:
        return 0, 0, 0
    values = np.asarray(accuracies, dtype=float)
    if values.size == 0:
        return 0, 0, 0

    mean = np.mean(values)
    if values.size == 1:
        return mean, mean, mean

    sem = stats.sem(values)
    if sem == 0:
        # All samples are identical.  scipy rejects scale=0 and would hand
        # back (nan, nan); the interval is simply degenerate at the mean.
        return mean, mean, mean

    ci_low, ci_high = stats.t.interval(confidence, values.size - 1, loc=mean, scale=sem)
    return mean, ci_low, ci_high


def _read_json(path):
    """Parse the JSON document stored at ``path`` (read as UTF-8)."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _metric_entry(data, key):
    """Normalize ``data[key]`` to a ``mean``/``ci_low``/``ci_high`` dict.

    Returns ``None`` when ``data`` is not a dict, or ``data[key]`` is missing,
    is not a dict, or has no ``"mean"``.  Missing ``ci_lower``/``ci_upper``
    bounds fall back to the mean.
    """
    entry = data.get(key) if isinstance(data, dict) else None
    if not isinstance(entry, dict) or "mean" not in entry:
        return None
    mean = entry["mean"]
    return {
        "mean": mean,
        "ci_low": entry.get("ci_lower", mean),
        "ci_high": entry.get("ci_upper", mean),
    }


def load_human_eval_metrics(system_dir):
    """Load and calculate metrics from human evaluation JSON files.

    Looks for ``*human_eval*.json`` (and the misspelled ``*huma_eval*.json``)
    in ``system_dir``.  Each file is expected to hold a list of items with a
    numeric ``item["metrics"]["humaneval"]["score"]``.

    Args:
        system_dir: ``pathlib.Path`` of the system directory.

    Returns:
        ``{"Human-LLM Correctness": {"mean", "ci_low", "ci_high"}}`` when a
        usable file was found, otherwise an empty dict.
    """
    human_eval_patterns = ["human_eval", "huma_eval"]
    metrics = {}

    for pattern in human_eval_patterns:
        for file in system_dir.glob(f"*{pattern}*.json"):
            try:
                data = _read_json(file)
                scores = [item["metrics"]["humaneval"]["score"] for item in data]
                if not scores:
                    continue
                mean, ci_low, ci_high = calculate_confidence_interval(scores)
                metrics["Human-LLM Correctness"] = {
                    "mean": mean,
                    "ci_low": ci_low,
                    "ci_high": ci_high,
                }
                print(
                    f"Found human eval metrics in {file}: mean={mean:.4f}, CI=[{ci_low:.4f}, {ci_high:.4f}]"
                )
                # First usable file for this pattern is enough.
                break
            except Exception as e:
                # Best effort: an unreadable or malformed file is reported
                # and skipped so the remaining files can still be tried.
                print(f"Error loading {file}: {e}")

    return metrics


def load_metrics(system_dir):
    """Load metrics from a system directory.

    The system is identified by the part of the directory name before the
    first underscore; a name containing ``"optimized"`` selects the Cognee
    (dreamify) file layout.

    Args:
        system_dir: ``pathlib.Path`` of the system directory.

    Returns:
        Dict mapping metric display names to ``{"mean", "ci_low", "ci_high"}``.
        All four expected metrics are always present; missing ones are zeroed.
    """
    metrics = {}
    system_name_lower = system_dir.name.split("_")[0].lower()
    is_optimized = "optimized" in system_dir.name.lower()

    # Human-LLM Correctness from dedicated *_human_eval.json files, if any.
    metrics.update(load_human_eval_metrics(system_dir))

    # Pick the aggregate DeepEval file for this system.
    metrics_file = None
    if system_name_lower == "graphiti":
        metrics_file = system_dir / "aggregate_metrics_graphiti.json"
        print(f"Processing Graphiti DeepEval from {metrics_file}")
    elif system_name_lower == "mem0":
        metrics_file = system_dir / "aggregate_metrics_mem0.json"
        print(f"Processing Mem0 DeepEval from {metrics_file}")
    elif system_name_lower == "cognee":
        if is_optimized:
            _load_cognee_optimized_metrics(system_dir, metrics)
        else:
            metrics_file = system_dir / "aggregate_metrics_v_deepeval.json"
            print(f"Processing Regular Cognee DeepEval from {metrics_file}")

    # Common logic to load DeepEval metrics from the determined file.
    if metrics_file and metrics_file.exists():
        try:
            data = _read_json(metrics_file)
            for key, display_name in _DEEPEVAL_KEYS.items():
                entry = _metric_entry(data, key)
                if entry is None:
                    continue
                metric_key = f"DeepEval {display_name}"
                metrics[metric_key] = entry
                print(
                    f"Found DeepEval metrics in {metrics_file}: {key}={metrics[metric_key]['mean']:.4f}"
                )
        except Exception as e:
            print(f"Error loading DeepEval metrics from {metrics_file}: {e}")
    elif metrics_file:
        print(f"DeepEval metrics file not found: {metrics_file}")

    # Make sure all standard metrics exist with defaults if missing.
    for metric_name in _EXPECTED_METRICS:
        if metric_name not in metrics:
            metrics[metric_name] = {"mean": 0.0, "ci_low": 0.0, "ci_high": 0.0}
            print(f"Added default for missing metric: {metric_name}")

    return metrics


def _load_cognee_optimized_metrics(system_dir, metrics):
    """Fill ``metrics`` in place from the Cognee Optimized (dreamify) files."""
    deepeval_file = system_dir / "aggregate_metrics_4o_cognee_10.json"
    if not deepeval_file.exists():
        deepeval_file = system_dir / "aggregate_metrics_4o_cognee_10_short.json"  # Fallback
    # Dedicated DeepEval metrics file (Correctness, F1, EM).
    deepeval_metrics_file = system_dir / "aggregate_metrics_4o_cognee_10_short_deepeval.json"

    print("Processing Cognee Optimized (dreamify)")

    # 1. Human-LLM Correctness comes from the "correctness" field of the main
    #    file and overrides anything loaded from *_human_eval.json.
    if deepeval_file.exists():
        try:
            entry = _metric_entry(_read_json(deepeval_file), "correctness")
            if entry is not None:
                metrics["Human-LLM Correctness"] = entry
                print(
                    f"Found Human-LLM Correctness in {deepeval_file}: mean={metrics['Human-LLM Correctness']['mean']:.4f}"
                )
        except Exception as e:
            print(f"Error loading metrics from {deepeval_file}: {e}")

    # 2. All DeepEval metrics come from the dedicated file.
    if deepeval_metrics_file.exists():
        try:
            data = _read_json(deepeval_metrics_file)
            for key, display_name in _DEEPEVAL_KEYS.items():
                entry = _metric_entry(data, key)
                if entry is None:
                    continue
                metric_key = f"DeepEval {display_name}"
                metrics[metric_key] = entry
                print(
                    f"Found DeepEval {display_name} in {deepeval_metrics_file}: mean={metrics[metric_key]['mean']:.4f}"
                )
        except Exception as e:
            print(f"Error loading DeepEval metrics from {deepeval_metrics_file}: {e}")


def _add_logo_overlay(fig, output_path, facecolor):
    """Re-render the saved chart with the Cognee logo and overwrite it.

    Does nothing when neither logo asset exists.  Errors propagate to the
    caller, which treats the overlay as optional.
    """
    logo_path = Path("../assets/cognee-logo-transparent.png")
    if not logo_path.exists():
        logo_path = Path("../assets/cognee_logo.png")
    if not logo_path.exists():
        return

    logo_output_path = output_path.with_name(
        f"{output_path.stem}_with_logo{output_path.suffix}"
    )

    # get_size_inches() returns (width, height).
    fig_width, fig_height = fig.get_size_inches()
    fig_with_logo = plt.figure(figsize=(fig_width, fig_height), facecolor=facecolor)
    try:
        # The saved chart becomes the background image.
        chart_ax = fig_with_logo.add_subplot(111)
        chart_ax.imshow(plt.imread(str(output_path)))
        chart_ax.axis("off")

        # Semi-transparent logo in the upper part of the chart.
        logo_ax = fig_with_logo.add_axes([0.65, 0.75, 0.085, 0.085], zorder=1)
        logo_ax.imshow(plt.imread(str(logo_path)), alpha=0.45)
        logo_ax.axis("off")

        fig_with_logo.savefig(str(logo_output_path), dpi=300, bbox_inches="tight")
    finally:
        plt.close(fig_with_logo)

    # Replace the original file with the logo version.
    logo_output_path.replace(output_path)


def plot_metrics(all_systems_metrics, output_file="metrics_comparison.png"):
    """Plot metrics comparison.

    Draws one bar group per system and one bar per metric, with asymmetric
    error bars from ``ci_low``/``ci_high``, and saves the chart.  If a logo
    asset is found under ``../assets`` the saved image is replaced by a
    version with the logo overlaid.

    Args:
        all_systems_metrics: ``{system: {metric: {"mean", "ci_low", "ci_high"}}}``.
        output_file: Destination image path.  A name without an extension is
            saved with ``.png`` appended.
    """
    if not all_systems_metrics:
        print("No metrics found to plot")
        return

    # Custom ordering (Cognee first, then Graphiti), unknown systems last.
    preferred_order = ["Cognee", "Graphiti", "Mem0", "Falkor"]
    systems = [system for system in preferred_order if system in all_systems_metrics]
    systems += [system for system in all_systems_metrics if system not in preferred_order]

    metric_names = set()
    for system_metrics in all_systems_metrics.values():
        metric_names.update(system_metrics)
    if not metric_names:
        # Nothing to draw; also avoids dividing by zero for the bar width.
        print("No metrics found to plot")
        return

    def get_metric_avg_score(metric):
        scores = [
            all_systems_metrics[system][metric]["mean"]
            for system in systems
            if metric in all_systems_metrics[system]
        ]
        return np.mean(scores) if scores else 0

    # Highest average first.  Sorting by name beforehand makes ties (and so
    # the colour assignment) deterministic, since set order is not.
    metrics = sorted(sorted(metric_names), key=get_metric_avg_score, reverse=True)

    # Resolve the output path up front.  matplotlib appends an extension to
    # extension-less names, so do it here to keep later reads consistent.
    output_path = Path(output_file)
    if not output_path.suffix:
        output_path = output_path.with_name(output_path.name.rstrip(".") + ".png")

    # Set style
    plt.style.use("seaborn-v0_8")
    sns.set_theme(style="whitegrid")

    # Cognee brand colors
    brand_colors = {
        "data_dream_violet": "#6510F4",
        "data_flux_green": "#0DFF00",
        "secondary_purple": "#A550FF",
        "abyss_black": "#000000",
        "data_cloud_grey": "#F4F4F4",
        "dark_grey": "#323332",
    }
    colors = [
        brand_colors["data_flux_green"],
        brand_colors["data_dream_violet"],
        brand_colors["secondary_purple"],
        brand_colors["dark_grey"],
    ]

    fig, ax = plt.subplots(figsize=(15, 8), facecolor=brand_colors["data_cloud_grey"])
    try:
        ax.set_facecolor(brand_colors["data_cloud_grey"])

        x = np.arange(len(systems))
        bar_width = 0.8 / len(metrics)
        # Horizontal shift that centres the whole bar group on its tick.
        group_shift = (len(metrics) - 1) * bar_width / 2

        for i, metric in enumerate(metrics):
            means = []
            yerr_low = []
            yerr_high = []
            for system in systems:
                m = all_systems_metrics[system].get(metric)
                if m is None:
                    means.append(0)
                    yerr_low.append(0)
                    yerr_high.append(0)
                else:
                    means.append(m["mean"])
                    yerr_low.append(m["mean"] - m["ci_low"])
                    yerr_high.append(m["ci_high"] - m["mean"])

            ax.bar(
                x + i * bar_width - group_shift,
                means,
                bar_width,
                label=metric,
                color=colors[i % len(colors)],
                alpha=0.85,
                yerr=[yerr_low, yerr_high],
                capsize=4,
                error_kw={
                    "elinewidth": 1.5,
                    "capthick": 1.5,
                    "ecolor": brand_colors["dark_grey"],
                    "alpha": 0.5,
                },
            )

        # Customize plot with Cognee styling
        ax.set_ylabel("Score", fontsize=14, fontweight="bold", color=brand_colors["abyss_black"])
        ax.set_title(
            "AI Memory - Benchmark Results",
            fontsize=18,
            pad=20,
            fontweight="bold",
            color=brand_colors["data_dream_violet"],
        )
        ax.set_xticks(x)
        ax.set_xticklabels(
            systems,
            rotation=45,
            ha="right",
            fontsize=12,
            fontweight="bold",
            color=brand_colors["abyss_black"],
        )
        ax.tick_params(axis="y", labelsize=11, colors=brand_colors["abyss_black"])

        # Set y-axis limits with some padding
        ax.set_ylim(0, 1.1)

        # Add grid
        ax.yaxis.grid(True, linestyle="--", alpha=0.5, color=brand_colors["dark_grey"])
        ax.set_axisbelow(True)

        # Customize legend
        legend = ax.legend(
            bbox_to_anchor=(1.05, 1),
            loc="upper left",
            fontsize=12,
            frameon=True,
            fancybox=True,
            shadow=True,
            title="Metrics",
            title_fontsize=14,
        )
        plt.setp(legend.get_title(), fontweight="bold", color=brand_colors["data_dream_violet"])

        # Value labels above the bars; zero-valued bars stay unlabeled.
        for i, metric in enumerate(metrics):
            for j, system in enumerate(systems):
                if metric not in all_systems_metrics[system]:
                    continue
                value = all_systems_metrics[system][metric]["mean"]
                if value > 0:
                    ax.text(
                        j + i * bar_width - group_shift,
                        value + 0.02,
                        f"{value:.2f}",
                        ha="center",
                        va="bottom",
                        fontsize=11,
                        fontweight="bold",
                        color=brand_colors["data_dream_violet"],
                        # Semi-opaque white box keeps the text legible.
                        bbox=dict(facecolor="white", alpha=0.7, pad=1, edgecolor="none"),
                    )

        # Add border to the plot
        for spine in ax.spines.values():
            spine.set_edgecolor(brand_colors["dark_grey"])
            spine.set_linewidth(1.5)

        # Adjust layout
        plt.tight_layout()

        # Save plot first without logo
        fig.savefig(str(output_path), bbox_inches="tight", dpi=300)

        # The logo is optional: a failure here keeps the plain chart.
        try:
            _add_logo_overlay(fig, output_path, brand_colors["data_cloud_grey"])
        except Exception as e:
            print(f"Warning: Could not add logo overlay - {e}")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)


def main():
    """Main function to process metrics and generate plot."""
    eval_dir = Path(".")
    all_systems_metrics = {}

    # Process each system directory (sorted for a reproducible order).
    for system_dir in sorted(eval_dir.glob("*_01042025")):
        if not system_dir.is_dir():
            continue
        print(f"\nChecking system directory: {system_dir}")
        system_name = system_dir.name.split("_")[0].capitalize()
        metrics = load_metrics(system_dir)

        # Special handling for cognee_optimized
        if "optimized" in system_dir.name.lower():
            system_name = "Cognee (dreamify)"

        if metrics:
            all_systems_metrics[system_name] = metrics
            print(f"Found metrics for {system_name}: {metrics}")

    # Plot cognee comparison if both regular and optimized are present
    if "Cognee" in all_systems_metrics and "Cognee (dreamify)" in all_systems_metrics:
        print("\nGenerating Cognee vs Cognee (dreamify) comparison plot.")
        cognee_metrics = {
            "Cognee": all_systems_metrics["Cognee"],
            "Cognee (dreamify)": all_systems_metrics["Cognee (dreamify)"],
        }
        plot_metrics(cognee_metrics, output_file="cognee_comparison.png")

    print(f"\nAll systems metrics: {all_systems_metrics}")

    if not all_systems_metrics:
        print("No metrics data found!")
        return

    # Plot metrics for all systems - excluding both Falkor and Cognee (dreamify)
    systems_for_comparison = {
        system_name: system_metrics
        for system_name, system_metrics in all_systems_metrics.items()
        if system_name not in ["Cognee (dreamify)", "Falkor"]
    }

    if systems_for_comparison:
        print("\nGenerating main metrics comparison (excluding Falkor and Cognee dreamify)")
        plot_metrics(systems_for_comparison, output_file="metrics_comparison.png")


if __name__ == "__main__":
    main()