cognee/evals/eval_framework/analysis/metrics_calculator.py
lxobr bee04cad86
Feat/cog 1331 modal run eval (#576)

## Description
- Split the metrics dashboard into two modules: a calculator (statistics) and a generator (visualization)
- Added aggregate metrics as a new phase in the evaluation pipeline
- Created a Modal example that runs multiple evaluations in parallel and collects the results into a single combined output (a minimal sketch of the pattern follows below)
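
For reference, here is a minimal sketch of that parallel-run pattern. The app name, function name, and config shape are all hypothetical illustrations, not names from the repository:

```python
import modal

app = modal.App("parallel-eval-sketch")  # hypothetical app name


@app.function()
def run_eval(config: dict) -> dict:
    # Placeholder: run one evaluation for `config` and return its metrics.
    return {"config": config, "metrics": {}}


@app.local_entrypoint()
def main():
    configs = [{"run": i} for i in range(4)]  # hypothetical per-run configs
    # Fan the runs out to parallel containers and gather the results locally.
    combined = list(run_eval.map(configs))
    print(f"Collected {len(combined)} results")
```
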
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


## Summary by CodeRabbit

- **New Features**
  - Enhanced metrics reporting with improved visualizations, including histogram and confidence-interval plots (see the sketch after this list).
  - Introduced an asynchronous evaluation process that supports parallel execution and streamlined result aggregation.
  - Added new configuration options to control metrics calculation and aggregated output storage.

- **Refactor**
  - Restructured dashboard generation and evaluation workflows into a more modular, maintainable design.
  - Improved error handling and logging for better feedback during evaluation processes.

- **Bug Fixes**
  - Updated test cases to ensure accurate validation of the new dashboard generation and metrics calculation functionality.
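
The generator (visualization) module itself is not shown in this view. Purely as an illustration of the confidence-interval plot mentioned above, a generic matplotlib sketch that consumes the `ci_results` mapping produced by the calculator below could look like this; the function name and figure details are assumptions, not the project's code:

```python
import matplotlib.pyplot as plt


def plot_confidence_intervals(ci_results: dict) -> None:
    """Illustrative sketch: one point per metric, error bars spanning its CI."""
    metrics = list(ci_results)
    means = [ci_results[m][0] for m in metrics]
    lower_err = [ci_results[m][0] - ci_results[m][1] for m in metrics]
    upper_err = [ci_results[m][2] - ci_results[m][0] for m in metrics]
    plt.errorbar(range(len(metrics)), means, yerr=[lower_err, upper_err], fmt="o", capsize=4)
    plt.xticks(range(len(metrics)), metrics, rotation=45, ha="right")
    plt.ylabel("Score")
    plt.title("Mean score per metric with bootstrap confidence intervals")
    plt.tight_layout()
    plt.show()
```
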
2025-03-03 14:22:32 +01:00


import json
from collections import defaultdict
import numpy as np
from typing import Dict, List, Tuple
def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
    """Calculate bootstrap confidence intervals for a list of scores."""
    means = []
    n = len(scores)
    for _ in range(num_samples):
        sample = np.random.choice(scores, size=n, replace=True)
        means.append(np.mean(sample))
    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
    return np.mean(scores), lower_bound, upper_bound
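
# Example of the percentile arithmetic above for the default confidence_level=0.95:
# (1 - 0.95) / 2 * 100 == 2.5 and (1 + 0.95) / 2 * 100 == 97.5, so the returned
# interval spans the 2.5th to 97.5th percentiles of the bootstrap means. For
# instance, bootstrap_ci([0.0, 1.0, 1.0, 1.0]) returns the sample mean 0.75
# together with bounds drawn from 10,000 resampled means of that list.
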
def load_metrics_data(json_file_path: str) -> List[Dict]:
    """Load metrics data from JSON file."""
    try:
        with open(json_file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find the file: {json_file_path}")
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")
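
# Illustrative shape of one entry in the loaded list, inferred from the fields
# accessed in extract_metrics_and_details below (metric names are hypothetical
# examples, not a fixed schema):
# {
#     "question": "Who wrote Dune?",
#     "answer": "Frank Herbert",
#     "golden_answer": "Frank Herbert",
#     "metrics": {
#         "correctness": {"score": 1.0, "reason": "Matches the golden answer."},
#         "EM": {"score": 1.0}
#     }
# }
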
def extract_metrics_and_details(
    data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
    """Extract metrics scores and details from evaluation data."""
    metrics_data = defaultdict(list)
    metric_details = defaultdict(list)
    for entry in data:
        for metric, values in entry["metrics"].items():
            score = values["score"]
            metrics_data[metric].append(score)
            if "reason" in values:
                metric_details[metric].append(
                    {
                        "question": entry["question"],
                        "answer": entry["answer"],
                        "golden_answer": entry["golden_answer"],
                        "reason": values["reason"],
                        "score": score,
                    }
                )
    return metrics_data, metric_details
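
# For the illustrative entry sketched above, this function would return
# metrics_data == {"correctness": [1.0], "EM": [1.0]} and metric_details holding
# one detail dict under "correctness", the only metric that carried a "reason".
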
def save_aggregate_metrics(
    metrics_data: Dict[str, List[float]],
    ci_results: Dict[str, Tuple[float, float, float]],
    output_path: str,
) -> None:
    """Save aggregated metrics and confidence intervals to file."""
    aggregate_data = {
        metric: {
            "scores": scores,
            "mean": ci_results[metric][0],
            "ci_lower": ci_results[metric][1],
            "ci_upper": ci_results[metric][2],
        }
        for metric, scores in metrics_data.items()
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(aggregate_data, f, indent=4)
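
# Illustrative shape of the aggregate file written above, one block per metric
# (names and numbers are only examples):
# {
#     "correctness": {
#         "scores": [1.0, 0.5, 1.0],
#         "mean": 0.83,
#         "ci_lower": 0.5,
#         "ci_upper": 1.0
#     }
# }
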
def calculate_metrics_statistics(
    json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
    """Calculate metrics statistics and save aggregated results."""
    data = load_metrics_data(json_data)
    metrics_data, metric_details = extract_metrics_and_details(data)

    # Calculate confidence intervals
    ci_results = {}
    for metric, scores in metrics_data.items():
        mean_score, lower, upper = bootstrap_ci(scores)
        ci_results[metric] = (mean_score, lower, upper)

    # Save aggregate metrics
    save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)

    return metrics_data, metric_details, ci_results
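
A minimal usage sketch of the module follows. The import path is derived from the file path shown at the top, the file names are assumptions, and note that the `json_data` argument is actually a path to the per-question metrics JSON:

```python
from cognee.evals.eval_framework.analysis.metrics_calculator import (
    calculate_metrics_statistics,
)

# Input: per-question metrics produced by an earlier evaluation step.
# Output: an aggregate JSON with scores, means, and bootstrap CIs per metric.
metrics_data, metric_details, ci_results = calculate_metrics_statistics(
    "metrics_output.json", "aggregate_metrics.json"
)

for metric, (mean, lower, upper) in ci_results.items():
    print(f"{metric}: mean={mean:.3f}, 95% CI=({lower:.3f}, {upper:.3f})")
```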