<!-- .github/pull_request_template.md -->

## Description

- Split the metrics dashboard into two modules: a calculator (statistics) and a generator (visualization)
- Added aggregate metrics as a new phase in the evaluation pipeline
- Created a Modal example that runs multiple evaluations in parallel and collects the results into a single combined output (a simplified sketch follows below)

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Enhanced metrics reporting with improved visualizations, including histogram and confidence interval plots.
  - Introduced an asynchronous evaluation process that supports parallel execution and streamlined result aggregation.
  - Added new configuration options to control metrics calculation and aggregated output storage.
- **Refactor**
  - Restructured dashboard generation and evaluation workflows into a more modular, maintainable design.
  - Improved error handling and logging for better feedback during evaluation processes.
- **Bug Fixes**
  - Updated test cases to ensure accurate validation of the new dashboard generation and metrics calculation functionality.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
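For orientation, the parallel fan-out/fan-in flow mentioned above can be pictured roughly as follows. This is a minimal sketch using plain `asyncio` rather than the actual Modal example from this PR; `run_single_evaluation`, the config dictionaries, and the output path are hypothetical placeholders, not names from the changeset:

```python
# Minimal sketch of the fan-out/fan-in pattern, NOT the Modal example shipped in this PR.
# run_single_evaluation, the config dicts, and the output path are hypothetical placeholders.
import asyncio
import json
from typing import Dict, List


async def run_single_evaluation(config: Dict) -> List[Dict]:
    """Stand-in for one evaluation run; returns per-question metric entries."""
    await asyncio.sleep(0)  # placeholder for the real evaluation work
    return [{"question": "...", "answer": "...", "golden_answer": "...", "metrics": {}}]


async def run_parallel_evaluations(configs: List[Dict], output_path: str) -> None:
    """Run several evaluations concurrently and merge their results into one combined file."""
    results = await asyncio.gather(*(run_single_evaluation(config) for config in configs))

    # Flatten the per-run result lists into a single combined output, as described in the summary.
    combined = [entry for run in results for entry in run]
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=4)


if __name__ == "__main__":
    asyncio.run(run_parallel_evaluations([{"run": 1}, {"run": 2}], "combined_metrics.json"))
```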
**File:** metrics calculator module · 92 lines · 3.2 KiB · Python
```python
import json
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
    """Calculate bootstrap confidence intervals for a list of scores."""
    means = []
    n = len(scores)
    for _ in range(num_samples):
        sample = np.random.choice(scores, size=n, replace=True)
        means.append(np.mean(sample))

    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
    return np.mean(scores), lower_bound, upper_bound


def load_metrics_data(json_file_path: str) -> List[Dict]:
    """Load metrics data from JSON file."""
    try:
        with open(json_file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find the file: {json_file_path}")
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")


def extract_metrics_and_details(
    data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
    """Extract metrics scores and details from evaluation data."""
    metrics_data = defaultdict(list)
    metric_details = defaultdict(list)

    for entry in data:
        for metric, values in entry["metrics"].items():
            score = values["score"]
            metrics_data[metric].append(score)
            if "reason" in values:
                metric_details[metric].append(
                    {
                        "question": entry["question"],
                        "answer": entry["answer"],
                        "golden_answer": entry["golden_answer"],
                        "reason": values["reason"],
                        "score": score,
                    }
                )

    return metrics_data, metric_details


def save_aggregate_metrics(
    metrics_data: Dict[str, List[float]],
    ci_results: Dict[str, Tuple[float, float, float]],
    output_path: str,
) -> None:
    """Save aggregated metrics and confidence intervals to file."""
    aggregate_data = {
        metric: {
            "scores": scores,
            "mean": ci_results[metric][0],
            "ci_lower": ci_results[metric][1],
            "ci_upper": ci_results[metric][2],
        }
        for metric, scores in metrics_data.items()
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(aggregate_data, f, indent=4)


def calculate_metrics_statistics(
    json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
    """Calculate metrics statistics and save aggregated results."""
    data = load_metrics_data(json_data)
    metrics_data, metric_details = extract_metrics_and_details(data)

    # Calculate confidence intervals
    ci_results = {}
    for metric, scores in metrics_data.items():
        mean_score, lower, upper = bootstrap_ci(scores)
        ci_results[metric] = (mean_score, lower, upper)

    # Save aggregate metrics
    save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)

    return metrics_data, metric_details, ci_results
```
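For reference, a typical invocation of the module above might look like the sketch below; the module path, file names, and the printing loop are illustrative assumptions rather than code from this PR:

```python
# Illustrative usage only; the module path and file names are assumed, not taken from the PR.
# from evals.metrics_calculator import calculate_metrics_statistics  # hypothetical import

if __name__ == "__main__":
    metrics_data, metric_details, ci_results = calculate_metrics_statistics(
        json_data="metrics_output.json",                 # per-question evaluation results (assumed path)
        aggregate_output_path="aggregate_metrics.json",  # aggregated scores and bootstrap CIs are written here
    )

    for metric, (mean_score, ci_lower, ci_upper) in ci_results.items():
        print(f"{metric}: mean={mean_score:.3f}, 95% CI=({ci_lower:.3f}, {ci_upper:.3f})")
```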