"""Build a static HTML dashboard from LLM evaluation results.

The dashboard contains one score histogram per metric, a bar chart of the
mean score per metric with bootstrap confidence intervals, and a table of
per-question explanations for every metric that reports a ``reason``.
"""

import html
import json
from collections import defaultdict

import numpy as np
import plotly.graph_objects as go

# NOTE(review): the HTML tags and CSS used in this module were reconstructed
# from the visible text of a copy whose markup had been stripped. Confirm the
# exact tags/classes/styles against the version-controlled original.
_DASHBOARD_CSS = """
body { font-family: Arial, sans-serif; margin: 20px; }
.chart { border: 1px solid #ddd; padding: 20px; margin-bottom: 30px; }
.metric-table { border-collapse: collapse; width: 100%; margin-bottom: 30px; }
.metric-table th, .metric-table td {
    border: 1px solid #ddd; padding: 8px; text-align: left;
}
.metric-table th { background-color: #f2f2f2; }
h2 { color: #333; border-bottom: 2px solid #eee; padding-bottom: 10px; }
"""

_DETAIL_COLUMNS = ("question", "answer", "golden_answer", "reason", "score")
_DETAIL_HEADERS = ("Question", "Answer", "Golden Answer", "Reason", "Score")


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
    """Estimate a percentile-bootstrap confidence interval for the mean.

    Args:
        scores: One-dimensional sequence of numeric scores.
        num_samples: Number of bootstrap resamples to draw.
        confidence_level: Two-sided confidence level, e.g. ``0.95``.

    Returns:
        A ``(mean, lower_bound, upper_bound)`` tuple where ``mean`` is the
        mean of ``scores`` and the bounds are percentiles of the resampled
        means. All three are NaN when ``scores`` is empty.
    """
    n = len(scores)
    if n == 0:
        # Nothing to resample; report NaN explicitly rather than relying on
        # numpy's empty-slice warnings to produce it.
        return np.nan, np.nan, np.nan

    # Convert once instead of letting np.random.choice re-convert the input
    # on every iteration.
    values = np.asarray(scores)
    means = np.empty(num_samples)
    for i in range(num_samples):
        means[i] = np.random.choice(values, size=n, replace=True).mean()

    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
    return values.mean(), lower_bound, upper_bound


def _escape(value):
    """Return ``value`` as text that is safe to embed in HTML."""
    return html.escape(str(value))


def _load_results(json_data):
    """Read and parse the evaluation results file at path ``json_data``."""
    try:
        with open(json_data, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Could not find the file: {json_data}") from err
    except json.JSONDecodeError as err:
        raise ValueError(f"Error decoding JSON from {json_data}: {err}") from err


def _collect_metrics(data):
    """Group scores, and explanation rows where present, by metric name."""
    metrics_data = defaultdict(list)
    metric_details = defaultdict(list)
    for entry in data:
        for metric, values in entry["metrics"].items():
            score = values["score"]
            metrics_data[metric].append(score)
            # Only metrics that explain themselves get a details row.
            if "reason" in values:
                metric_details[metric].append(
                    {
                        "question": entry["question"],
                        "answer": entry["answer"],
                        "golden_answer": entry["golden_answer"],
                        "reason": values["reason"],
                        "score": score,
                    }
                )
    return metrics_data, metric_details


def _histogram_html(metric, scores):
    """Render the score distribution of one metric as an HTML fragment."""
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
    fig.update_layout(
        title=f"{metric} Score Distribution",
        xaxis_title="Score",
        yaxis_title="Count",
        bargap=0.1,
        template="seaborn",
    )
    return fig.to_html(full_html=False)


def _ci_chart_html(ci_results):
    """Render per-metric means with asymmetric error bars as an HTML fragment."""
    fig = go.Figure()
    for metric, (mean_score, lower, upper) in ci_results.items():
        fig.add_trace(
            go.Bar(
                x=[metric],
                y=[mean_score],
                # Error bar lengths are distances from the mean, not absolute
                # interval endpoints.
                error_y=dict(
                    type="data",
                    array=[upper - mean_score],
                    arrayminus=[mean_score - lower],
                    visible=True,
                ),
                name=metric,
            )
        )
    fig.update_layout(
        title="95% confidence interval for all the metrics",
        xaxis_title="Metric",
        yaxis_title="Score",
        template="seaborn",
    )
    return fig.to_html(full_html=False)


def _details_table_html(metric, details):
    """Render the explanation rows of one metric as a heading plus table."""
    header_cells = "".join(f"<th>{title}</th>" for title in _DETAIL_HEADERS)
    parts = [
        f"<h3>{_escape(metric)} Details</h3>",
        f'<table class="metric-table"><tr>{header_cells}</tr>',
    ]
    for item in details:
        # Evaluation text is free-form model output; escape it so stray
        # markup cannot break (or inject into) the page.
        cells = "".join(f"<td>{_escape(item[key])}</td>" for key in _DETAIL_COLUMNS)
        parts.append(f"<tr>{cells}</tr>")
    parts.append("</table>")
    return "".join(parts)


def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", benchmark=""):
    """Generate an HTML dashboard from an evaluation results JSON file.

    The JSON file must contain a list of entries. Each entry has a
    ``"metrics"`` mapping of metric name to a dict with a ``"score"`` and an
    optional ``"reason"``. Entries whose metrics carry a ``"reason"`` must
    also provide ``"question"``, ``"answer"`` and ``"golden_answer"``.

    Args:
        json_data: Path to the evaluation results JSON file.
        output_file: Path the HTML dashboard is written to.
        benchmark: Label appended to the page title and main heading.

    Returns:
        ``output_file``, after the dashboard has been written.

    Raises:
        FileNotFoundError: If ``json_data`` does not exist.
        ValueError: If ``json_data`` is not valid JSON.
        KeyError: If an entry lacks one of the required keys.
    """
    data = _load_results(json_data)
    metrics_data, metric_details = _collect_metrics(data)

    histograms = [_histogram_html(metric, scores) for metric, scores in metrics_data.items()]
    ci_results = {metric: bootstrap_ci(scores) for metric, scores in metrics_data.items()}
    ci_chart = _ci_chart_html(ci_results)

    histogram_section = "".join(f'<div class="chart">{chart}</div>' for chart in histograms)
    details_section = "".join(
        _details_table_html(metric, details) for metric, details in metric_details.items()
    )
    benchmark_label = _escape(benchmark)

    html_template = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>LLM Evaluation Dashboard {benchmark_label}</title>
    <style>{_DASHBOARD_CSS}</style>
</head>
<body>
    <h1>LLM Evaluation Metrics Dashboard {benchmark_label}</h1>

    <h2>Metrics Distribution</h2>
    {histogram_section}

    <h2>95% confidence interval for all the metrics</h2>
    <div class="chart">{ci_chart}</div>

    <h2>Detailed Explanations</h2>
    {details_section}
</body>
</html>
"""

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_template)

    return output_file