<!-- .github/pull_request_template.md -->

## Description

- Split the metrics dashboard into two modules: a calculator (statistics) and a generator (visualization)
- Added aggregate metrics as a new phase in the evaluation pipeline
- Created a Modal example that runs multiple evaluations in parallel and collects the results into a single combined output (a simplified sketch follows below)

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Enhanced metrics reporting with improved visualizations, including histogram and confidence interval plots.
  - Introduced an asynchronous evaluation process that supports parallel execution and streamlined result aggregation.
  - Added new configuration options to control metrics calculation and aggregated output storage.
- **Refactor**
  - Restructured dashboard generation and evaluation workflows into a more modular, maintainable design.
  - Improved error handling and logging for better feedback during evaluation processes.
- **Bug Fixes**
  - Updated test cases to ensure accurate validation of the new dashboard generation and metrics calculation functionality.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
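For orientation, the parallel fan-out/fan-in flow mentioned above can be pictured roughly as follows. This is a minimal sketch using plain `asyncio` rather than the actual Modal example from this PR; `run_single_evaluation`, the config dictionaries, and the output path are hypothetical placeholders, not names from the changeset:

```python
# Minimal sketch of the fan-out/fan-in pattern, NOT the Modal example shipped in this PR.
# run_single_evaluation, the config dicts, and the output path are hypothetical placeholders.
import asyncio
import json
from typing import Dict, List


async def run_single_evaluation(config: Dict) -> List[Dict]:
    """Stand-in for one evaluation run; returns per-question metric entries."""
    await asyncio.sleep(0)  # placeholder for the real evaluation work
    return [{"question": "...", "answer": "...", "golden_answer": "...", "metrics": {}}]


async def run_parallel_evaluations(configs: List[Dict], output_path: str) -> None:
    """Run several evaluations concurrently and merge their results into one combined file."""
    results = await asyncio.gather(*(run_single_evaluation(config) for config in configs))

    # Flatten the per-run result lists into a single combined output, as described in the summary.
    combined = [entry for run in results for entry in run]
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=4)


if __name__ == "__main__":
    asyncio.run(run_parallel_evaluations([{"run": 1}, {"run": 2}], "combined_metrics.json"))
```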
**File:** metrics calculator module · 92 lines · 3.2 KiB · Python
```python
import json
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
    """Calculate bootstrap confidence intervals for a list of scores."""
    means = []
    n = len(scores)
    for _ in range(num_samples):
        sample = np.random.choice(scores, size=n, replace=True)
        means.append(np.mean(sample))

    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
    return np.mean(scores), lower_bound, upper_bound


def load_metrics_data(json_file_path: str) -> List[Dict]:
    """Load metrics data from JSON file."""
    try:
        with open(json_file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find the file: {json_file_path}")
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")


def extract_metrics_and_details(
    data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
    """Extract metrics scores and details from evaluation data."""
    metrics_data = defaultdict(list)
    metric_details = defaultdict(list)

    for entry in data:
        for metric, values in entry["metrics"].items():
            score = values["score"]
            metrics_data[metric].append(score)
            if "reason" in values:
                metric_details[metric].append(
                    {
                        "question": entry["question"],
                        "answer": entry["answer"],
                        "golden_answer": entry["golden_answer"],
                        "reason": values["reason"],
                        "score": score,
                    }
                )

    return metrics_data, metric_details


def save_aggregate_metrics(
    metrics_data: Dict[str, List[float]],
    ci_results: Dict[str, Tuple[float, float, float]],
    output_path: str,
) -> None:
    """Save aggregated metrics and confidence intervals to file."""
    aggregate_data = {
        metric: {
            "scores": scores,
            "mean": ci_results[metric][0],
            "ci_lower": ci_results[metric][1],
            "ci_upper": ci_results[metric][2],
        }
        for metric, scores in metrics_data.items()
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(aggregate_data, f, indent=4)


def calculate_metrics_statistics(
    json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
    """Calculate metrics statistics and save aggregated results."""
    data = load_metrics_data(json_data)
    metrics_data, metric_details = extract_metrics_and_details(data)

    # Calculate confidence intervals
    ci_results = {}
    for metric, scores in metrics_data.items():
        mean_score, lower, upper = bootstrap_ci(scores)
        ci_results[metric] = (mean_score, lower, upper)

    # Save aggregate metrics
    save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)

    return metrics_data, metric_details, ci_results
```
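For reference, a typical invocation of the module above might look like the sketch below; the module path, file names, and the printing loop are illustrative assumptions rather than code from this PR:

```python
# Illustrative usage only; the module path and file names are assumed, not taken from the PR.
# from evals.metrics_calculator import calculate_metrics_statistics  # hypothetical import

if __name__ == "__main__":
    metrics_data, metric_details, ci_results = calculate_metrics_statistics(
        json_data="metrics_output.json",                 # per-question evaluation results (assumed path)
        aggregate_output_path="aggregate_metrics.json",  # aggregated scores and bootstrap CIs are written here
    )

    for metric, (mean_score, ci_lower, ci_upper) in ci_results.items():
        print(f"{metric}: mean={mean_score:.3f}, 95% CI=({ci_lower:.3f}, {ci_upper:.3f})")
```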