import html
import json
from collections import defaultdict
from typing import Dict, List, Tuple

import plotly.graph_objects as go

# Static stylesheet for the dashboard page. Kept outside the f-string template
# so the CSS braces do not need to be doubled.
_DASHBOARD_STYLE = """
        body { font-family: Arial, sans-serif; margin: 20px; }
        .chart { border: 1px solid #ddd; padding: 20px; margin-bottom: 30px; }
        .metric-table { border-collapse: collapse; width: 100%; margin-bottom: 30px; }
        .metric-table th, .metric-table td {
            border: 1px solid #ddd; padding: 8px; text-align: left;
        }
        .metric-table th { background-color: #f2f2f2; }
"""

# Header row shared by every per-metric details table.
_DETAILS_TABLE_HEADER = (
    '<table class="metric-table">'
    "<tr>"
    "<th>Question</th>"
    "<th>Answer</th>"
    "<th>Golden Answer</th>"
    "<th>Reason</th>"
    "<th>Score</th>"
    "</tr>"
)


def _escape(value: object) -> str:
    """Return *value* as text that is safe to place inside HTML markup."""
    return html.escape(str(value))


def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
    """Create one score-distribution histogram per metric.

    Args:
        metrics_data: Mapping of metric name to the list of scores recorded
            for that metric.

    Returns:
        One HTML fragment per metric, in the iteration order of
        ``metrics_data``.
    """
    figures = []
    for metric, scores in metrics_data.items():
        fig = go.Figure()
        fig.add_trace(
            go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4")
        )
        fig.update_layout(
            title=f"{metric} Score Distribution",
            xaxis_title="Score",
            yaxis_title="Count",
            bargap=0.1,
            template="seaborn",
        )
        # full_html=False yields an embeddable fragment rather than a full page.
        figures.append(fig.to_html(full_html=False))
    return figures


def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
    """Create a bar plot of mean scores with confidence-interval error bars.

    Args:
        ci_results: Mapping of metric name to a ``(mean, lower, upper)`` tuple.

    Returns:
        The plot as an HTML fragment.
    """
    fig = go.Figure()
    for metric, (mean_score, lower, upper) in ci_results.items():
        fig.add_trace(
            go.Bar(
                x=[metric],
                y=[mean_score],
                # Error bars are given as distances from the mean, so the
                # absolute bounds are converted to offsets here.
                error_y=dict(
                    type="data",
                    array=[upper - mean_score],
                    arrayminus=[mean_score - lower],
                    visible=True,
                ),
                name=metric,
            )
        )
    fig.update_layout(
        title="95% confidence interval for all the metrics",
        xaxis_title="Metric",
        yaxis_title="Score",
        template="seaborn",
    )
    return fig.to_html(full_html=False)


def generate_details_html(metrics_data: List[Dict]) -> List[str]:
    """Generate one HTML details table per metric.

    Args:
        metrics_data: Evaluation entries. Each entry must provide the keys
            ``"question"``, ``"answer"``, ``"golden_answer"`` and
            ``"metrics"``; ``"metrics"`` maps a metric name to a dict with a
            required ``"score"`` and an optional ``"reason"``.

    Returns:
        A list of HTML fragments (heading, table header, one row per entry,
        closing tag) for each metric, in first-seen order.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    # Group the rendered rows by metric, preserving first-seen metric order.
    rows_by_metric: Dict[str, List[str]] = defaultdict(list)
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            cells = (
                entry["question"],
                entry["answer"],
                entry["golden_answer"],
                values.get("reason", ""),
                values["score"],
            )
            # Every cell is escaped: questions and answers are free text and
            # may contain characters that would otherwise break the markup.
            rendered = "".join(f"<td>{_escape(cell)}</td>" for cell in cells)
            rows_by_metric[metric].append(f"<tr>{rendered}</tr>")

    details_html: List[str] = []
    for metric, rows in rows_by_metric.items():
        details_html.append(f"<h3>{_escape(metric)} Details</h3>")
        details_html.append(_DETAILS_TABLE_HEADER)
        details_html.extend(rows)
        details_html.append("</table>")
    return details_html


def get_dashboard_html_template(
    figures: List[str], details_html: List[str], benchmark: str = ""
) -> str:
    """Generate the complete HTML dashboard page.

    Args:
        figures: HTML fragments of the plots. The last element is rendered as
            the confidence-interval plot; all preceding elements are rendered
            as distribution plots. An empty list produces empty plot sections.
        details_html: HTML fragments making up the detailed-explanations
            section; they are concatenated as-is.
        benchmark: Optional benchmark name appended to the page title and the
            main heading.

    Returns:
        The full HTML document as a string.
    """
    safe_benchmark = html.escape(benchmark)

    if figures:
        *distribution_figures, ci_figure = figures
        ci_section = f'<div class="chart">{ci_figure}</div>'
    else:
        # No plots at all: render empty sections instead of failing.
        distribution_figures = []
        ci_section = ""

    distribution_section = "".join(
        f'<div class="chart">{fig}</div>' for fig in distribution_figures
    )
    details_section = "".join(details_html)

    return f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>LLM Evaluation Dashboard {safe_benchmark}</title>
    <style>{_DASHBOARD_STYLE}    </style>
</head>
<body>
    <h1>LLM Evaluation Metrics Dashboard {safe_benchmark}</h1>

    <h2>Metrics Distribution</h2>
    {distribution_section}

    <h2>95% confidence interval for all the metrics</h2>
    {ci_section}

    <h2>Detailed Explanations</h2>
    {details_section}
</body>
</html>
"""


def create_dashboard(
    metrics_path: str,
    aggregate_metrics_path: str,
    output_file: str = "dashboard_with_ci.html",
    benchmark: str = "",
) -> str:
    """Create the dashboard with all visualizations and save it to disk.

    Args:
        metrics_path: Path to a JSON file holding the list of evaluation
            entries (see ``generate_details_html`` for the entry layout).
        aggregate_metrics_path: Path to a JSON file mapping each metric name
            to a dict with the keys ``"mean"``, ``"ci_lower"`` and
            ``"ci_upper"``.
        output_file: Path the HTML dashboard is written to.
        benchmark: Optional benchmark name shown in the page title.

    Returns:
        ``output_file``, unchanged.
    """
    # Read both inputs as UTF-8, matching the encoding used for the output.
    with open(metrics_path, "r", encoding="utf-8") as f:
        metrics_data = json.load(f)
    with open(aggregate_metrics_path, "r", encoding="utf-8") as f:
        aggregate_data = json.load(f)

    # Collect the raw scores per metric for the distribution histograms.
    metrics_by_type = defaultdict(list)
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            metrics_by_type[metric].append(values["score"])

    distribution_figures = create_distribution_plots(metrics_by_type)
    ci_plot = create_ci_plot(
        {
            metric: (data["mean"], data["ci_lower"], data["ci_upper"])
            for metric, data in aggregate_data.items()
        }
    )

    # The template expects the confidence-interval plot as the last figure.
    figures = distribution_figures + [ci_plot]

    details_html = generate_details_html(metrics_data)
    dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(dashboard_html)

    return output_file