<!-- .github/pull_request_template.md -->

## Description

- Split metrics dashboard into two modules: calculator (statistics) and generator (visualization)
- Added aggregate metrics as a new phase in evaluation pipeline
- Created modal example to run multiple evaluations in parallel and collect results into a single combined output

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Enhanced metrics reporting with improved visualizations, including histogram and confidence interval plots.
  - Introduced an asynchronous evaluation process that supports parallel execution and streamlined result aggregation.
  - Added new configuration options to control metrics calculation and aggregated output storage.
- **Refactor**
  - Restructured dashboard generation and evaluation workflows into a more modular, maintainable design.
  - Improved error handling and logging for better feedback during evaluation processes.
- **Bug Fixes**
  - Updated test cases to ensure accurate validation of the new dashboard generation and metrics calculation functionalities.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
172 lines
5.6 KiB
Python
172 lines
5.6 KiB
Python
import json
|
|
import plotly.graph_objects as go
|
|
from typing import Dict, List, Tuple
|
|
from collections import defaultdict
|
|
|
|
|
|
def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
    """Render one score histogram per metric as an embeddable HTML fragment.

    Args:
        metrics_data: Mapping of metric name to the list of scores collected
            for that metric.

    Returns:
        One HTML snippet (not a full page) per metric, in the iteration order
        of ``metrics_data``.
    """
    # Layout settings shared by every histogram; only the title varies.
    shared_layout = dict(
        xaxis_title="Score",
        yaxis_title="Count",
        bargap=0.1,
        template="seaborn",
    )

    def render_histogram(metric_name: str, values: List[float]) -> str:
        # Build a single-trace figure and serialise it without the <html> wrapper.
        figure = go.Figure()
        figure.add_trace(
            go.Histogram(x=values, name=metric_name, nbinsx=10, marker_color="#1f77b4")
        )
        figure.update_layout(title=f"{metric_name} Score Distribution", **shared_layout)
        return figure.to_html(full_html=False)

    return [render_histogram(name, values) for name, values in metrics_data.items()]
|
|
|
|
|
|
def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
    """Render a bar chart of metric means with confidence-interval error bars.

    Args:
        ci_results: Mapping of metric name to a ``(mean, lower, upper)`` tuple.

    Returns:
        An embeddable HTML fragment (not a full page) containing the chart.
    """
    figure = go.Figure()

    for metric_name, bounds in ci_results.items():
        center, low, high = bounds
        # Plotly expects error bars as distances from the bar height, so the
        # absolute interval bounds are converted to offsets around the mean.
        error_bars = dict(
            type="data",
            array=[high - center],
            arrayminus=[center - low],
            visible=True,
        )
        bar = go.Bar(x=[metric_name], y=[center], error_y=error_bars, name=metric_name)
        figure.add_trace(bar)

    figure.update_layout(
        title="95% confidence interval for all the metrics",
        xaxis_title="Metric",
        yaxis_title="Score",
        template="seaborn",
    )
    return figure.to_html(full_html=False)
|
|
|
|
|
|
def generate_details_html(metrics_data: List[Dict]) -> List[str]:
    """Generate HTML fragments with per-question details for every metric.

    Each entry of ``metrics_data`` is read for its ``"question"``, ``"answer"``,
    ``"golden_answer"`` and ``"metrics"`` keys. Every value in ``"metrics"`` must
    provide a ``"score"`` and may provide a ``"reason"`` (an empty string is used
    when it is absent).

    All interpolated text is HTML-escaped, so questions, answers or reasons that
    contain ``<``, ``>`` or ``&`` are displayed literally instead of being
    interpreted as markup.

    Args:
        metrics_data: Evaluation entries shaped as described above.

    Returns:
        A list of HTML fragments. For each metric, in first-seen order, the list
        holds a heading, a table header, one ``<tr>`` per entry and a closing
        ``</table>`` tag.

    Raises:
        KeyError: If an entry with at least one metric lacks a required key.
    """
    # Imported locally so the function does not depend on module-level imports.
    from html import escape

    def as_cell(value) -> str:
        # quote=False: the value is element content, never an attribute value,
        # so quotes can stay as-is while <, > and & are neutralised.
        return f"<td>{escape(str(value), quote=False)}</td>"

    table_header = """
        <table class="metric-table">
            <tr>
                <th>Question</th>
                <th>Answer</th>
                <th>Golden Answer</th>
                <th>Reason</th>
                <th>Score</th>
            </tr>
    """

    # Group the per-entry records by metric name, preserving first-seen order.
    metric_details: Dict[str, List[Dict]] = {}
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            metric_details.setdefault(metric, []).append(
                {
                    "question": entry["question"],
                    "answer": entry["answer"],
                    "golden_answer": entry["golden_answer"],
                    "reason": values.get("reason", ""),
                    "score": values["score"],
                }
            )

    details_html: List[str] = []
    for metric, details in metric_details.items():
        details_html.append(f"<h3>{escape(str(metric), quote=False)} Details</h3>")
        details_html.append(table_header)
        for item in details:
            details_html.append(
                "<tr>"
                + as_cell(item["question"])
                + as_cell(item["answer"])
                + as_cell(item["golden_answer"])
                + as_cell(item["reason"])
                + as_cell(item["score"])
                + "</tr>"
            )
        details_html.append("</table>")
    return details_html
|
|
|
|
|
|
def get_dashboard_html_template(
    figures: List[str], details_html: List[str], benchmark: str = ""
) -> str:
    """Assemble the complete HTML page for the evaluation dashboard.

    Args:
        figures: HTML fragments for the charts. The last element is placed in
            the confidence-interval section; all preceding elements are placed
            in the distribution section. An empty list renders both sections
            without charts instead of raising ``IndexError``.
        details_html: HTML fragments concatenated verbatim into the
            "Detailed Explanations" section.
        benchmark: Optional label appended to the page title and main heading.
            It is HTML-escaped before insertion.

    Returns:
        The full HTML document as a single string.
    """
    # Imported locally so the function does not depend on module-level imports.
    from html import escape

    benchmark_label = escape(str(benchmark), quote=False)

    # Build the variable sections up front to keep the template itself readable.
    distribution_charts = "".join(
        f'<div class="chart">{fig}</div>' for fig in figures[:-1]
    )
    ci_chart = f'<div class="chart">{figures[-1]}</div>' if figures else ""
    details_section = "".join(details_html)

    return f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>LLM Evaluation Dashboard {benchmark_label}</title>
        <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .chart {{ border: 1px solid #ddd; padding: 20px; margin-bottom: 30px; }}
            .metric-table {{ border-collapse: collapse; width: 100%; margin-bottom: 30px; }}
            .metric-table th, .metric-table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            .metric-table th {{ background-color: #f2f2f2; }}
            h2 {{ color: #333; border-bottom: 2px solid #eee; padding-bottom: 10px; }}
        </style>
    </head>
    <body>
        <h1>LLM Evaluation Metrics Dashboard {benchmark_label}</h1>

        <h2>Metrics Distribution</h2>
        {distribution_charts}

        <h2>95% confidence interval for all the metrics</h2>
        {ci_chart}

        <h2>Detailed Explanations</h2>
        {details_section}
    </body>
    </html>
    """
|
|
|
|
|
|
def create_dashboard(
    metrics_path: str,
    aggregate_metrics_path: str,
    output_file: str = "dashboard_with_ci.html",
    benchmark: str = "",
) -> str:
    """Build the evaluation dashboard from two JSON files and write it to disk.

    Args:
        metrics_path: Path to a JSON file holding a list of entries; each
            entry's ``"metrics"`` mapping provides a ``"score"`` per metric.
        aggregate_metrics_path: Path to a JSON file mapping each metric name to
            an object with ``"mean"``, ``"ci_lower"`` and ``"ci_upper"``.
        output_file: Destination path of the generated HTML file.
        benchmark: Optional label forwarded to the page title and heading.

    Returns:
        ``output_file``, unchanged, after the dashboard has been written.

    Raises:
        OSError: If an input file cannot be read or the output cannot be written.
        json.JSONDecodeError: If an input file is not valid JSON.
        KeyError: If an expected key is missing from the loaded data.
    """
    # Read both inputs explicitly as UTF-8 so loading does not depend on the
    # platform's default encoding and matches the encoding used for the output.
    with open(metrics_path, "r", encoding="utf-8") as f:
        metrics_data = json.load(f)
    with open(aggregate_metrics_path, "r", encoding="utf-8") as f:
        aggregate_data = json.load(f)

    # Collect the raw scores per metric for the distribution histograms.
    metrics_by_type = defaultdict(list)
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            metrics_by_type[metric].append(values["score"])

    # Generate visualizations.
    distribution_figures = create_distribution_plots(metrics_by_type)
    ci_plot = create_ci_plot(
        {
            metric: (data["mean"], data["ci_lower"], data["ci_upper"])
            for metric, data in aggregate_data.items()
        }
    )

    # The template treats the last figure as the confidence-interval plot,
    # so it must be appended after the distribution figures.
    figures = distribution_figures + [ci_plot]

    # Generate HTML components.
    details_html = generate_details_html(metrics_data)
    dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(dashboard_html)

    return output_file
|