Feat/cog 1331 modal run eval (#576)
## Description

- Split the metrics dashboard into two modules: a calculator (statistics) and a generator (visualization).
- Added aggregate metrics as a new phase in the evaluation pipeline.
- Created a Modal example that runs multiple evaluations in parallel and collects the results into a single combined output.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Enhanced metrics reporting with improved visualizations, including histogram and confidence interval plots.
  - Introduced an asynchronous evaluation process that supports parallel execution and streamlined result aggregation.
  - Added new configuration options to control metrics calculation and aggregated output storage.
- **Refactor**
  - Restructured dashboard generation and evaluation workflows into a more modular, maintainable design.
  - Improved error handling and logging for better feedback during evaluation processes.
- **Bug Fixes**
  - Updated test cases to validate the new dashboard generation and metrics calculation functionality.
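As a rough usage sketch (not taken from the diff) of how the two new analysis modules fit together, using the default file names from `EvalConfig`; the benchmark label is a placeholder:

```python
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard

# Phase 1 (calculator): read per-question metrics, compute means and bootstrap
# confidence intervals, and persist the aggregates as JSON.
metrics_data, metric_details, ci_results = calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)

# Phase 2 (generator): render the HTML dashboard from the raw metrics file and
# the aggregated statistics produced above.
dashboard_file = create_dashboard(
    metrics_path="metrics_output.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="Example Benchmark",  # placeholder label shown in the dashboard title
)
print(f"Dashboard written to {dashboard_file}")
```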
Parent: 8874ddad2e
Commit: bee04cad86

9 changed files with 429 additions and 151 deletions
@@ -1,104 +1,88 @@
import unittest
from unittest.mock import patch
import json
import os
import tempfile
from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard, bootstrap_ci
import numpy as np


class TestGenerateMetricsDashboard(unittest.TestCase):
from evals.eval_framework.analysis.dashboard_generator import (
    create_distribution_plots,
    create_ci_plot,
    generate_details_html,
    get_dashboard_html_template,
    create_dashboard,
)


class TestDashboardFunctions(unittest.TestCase):
    def setUp(self):
        self.test_data = [
        """Set up test data."""
        self.metrics_data = {
            "accuracy": [0.8, 0.85, 0.9, 0.95, 1.0],
            "f1_score": [0.7, 0.75, 0.8, 0.85, 0.9],
        }

        self.ci_data = {
            "accuracy": (0.9, 0.85, 0.95),
            "f1_score": (0.8, 0.75, 0.85),
        }

        self.detail_data = [
            {
                "question": "What is AI?",
                "answer": "Artificial Intelligence",
                "golden_answer": "Artificial Intelligence",
                "metrics": {
                    "accuracy": {"score": 0.9, "reason": "Close enough"},
                    "relevance": {"score": 0.8},
                },
            },
            {
                "question": "What is ML?",
                "answer": "Machine Learning",
                "golden_answer": "Machine Learning",
                "metrics": {
                    "accuracy": {"score": 0.95, "reason": "Exact match"},
                    "relevance": {"score": 0.85},
                },
                "accuracy": {"score": 1.0, "reason": "Exact match"},
                "f1_score": {"score": 0.9, "reason": "High similarity"},
            },
        }
        ]

        self.temp_json = tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8")
        json.dump(self.test_data, self.temp_json)
        self.temp_json.close()
        self.output_file = "test_dashboard.html"

    def test_generate_details_html(self):
        """Test HTML details generation."""
        html_output = generate_details_html(self.detail_data)

    def tearDown(self):
        os.remove(self.temp_json.name)
        if os.path.exists(self.output_file):
            os.remove(self.output_file)
        self.assertIn("<h3>accuracy Details</h3>", html_output[0])
        self.assertIn("<th>Question</th>", html_output[1])
        self.assertIn("Exact match", "".join(html_output))

    def test_generate_metrics_dashboard_valid_json(self):
        """Test if the function processes valid JSON correctly and creates an output file."""
        result = generate_metrics_dashboard(
            self.temp_json.name, self.output_file, benchmark="Test Benchmark"
    def test_get_dashboard_html_template(self):
        """Test full dashboard HTML generation."""
        figures = create_distribution_plots(self.metrics_data)
        ci_plot = create_ci_plot(self.ci_data)
        dashboard_html = get_dashboard_html_template(
            figures + [ci_plot], generate_details_html(self.detail_data), "Benchmark 1"
        )

        self.assertTrue(os.path.exists(self.output_file))
        self.assertEqual(result, self.output_file)
        self.assertIn("<title>LLM Evaluation Dashboard Benchmark 1</title>", dashboard_html)
        self.assertIn("<h2>Metrics Distribution</h2>", dashboard_html)
        self.assertIn("<h2>95% confidence interval for all the metrics</h2>", dashboard_html)
        self.assertIn("Benchmark 1", dashboard_html)

        with open(self.output_file, "r", encoding="utf-8") as f:
            html_content = f.read()
        self.assertIn("<title>LLM Evaluation Dashboard Test Benchmark</title>", html_content)
        self.assertIn("accuracy", html_content)
        self.assertIn("relevance", html_content)

    def test_create_dashboard(self):
        """Test the full dashboard generation and file creation."""
        metrics_path = "test_metrics.json"
        aggregate_metrics_path = "test_aggregate.json"
        output_file = "test_dashboard.html"

    @patch("evals.eval_framework.metrics_dashboard.bootstrap_ci", return_value=(0.9, 0.85, 0.95))
    def test_generate_metrics_dashboard_ci_calculation(self, mock_bootstrap_ci):
        """Test if bootstrap_ci is called with the correct parameters."""
        generate_metrics_dashboard(self.temp_json.name, self.output_file)
        with open(metrics_path, "w") as f:
            json.dump(self.detail_data, f)

        mock_bootstrap_ci.assert_any_call([0.9, 0.95])  # For accuracy
        mock_bootstrap_ci.assert_any_call([0.8, 0.85])  # For relevance

    @patch("plotly.graph_objects.Figure.to_html", return_value="<div>Plotly Chart</div>")
    def test_generate_metrics_dashboard_plotly_charts(self, mock_to_html):
        """Test if Plotly figures are generated correctly."""
        generate_metrics_dashboard(self.temp_json.name, self.output_file)

        self.assertGreaterEqual(mock_to_html.call_count, 3)  # 2 metrics + CI chart

        with open(self.output_file, "r", encoding="utf-8") as f:
            file_content = f.read()
        self.assertIn(
            "<div>Plotly Chart</div>",
            file_content,
            "The output file does not contain the expected Plotly chart HTML.",
        with open(aggregate_metrics_path, "w") as f:
            json.dump(
                {
                    metric: {"mean": v[0], "ci_lower": v[1], "ci_upper": v[2]}
                    for metric, v in self.ci_data.items()
                },
                f,
            )

        output = create_dashboard(
            metrics_path, aggregate_metrics_path, output_file, "Test Benchmark"
        )


class TestBootstrapCI(unittest.TestCase):
    def test_bootstrap_ci_basic(self):
        scores = [1, 2, 3, 4, 5]
        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
        self.assertEqual(output, output_file)
        self.assertTrue(os.path.exists(output_file))

        self.assertAlmostEqual(mean, np.mean(scores), places=2)
        self.assertLessEqual(lower, mean)
        self.assertGreaterEqual(upper, mean)

    def test_bootstrap_ci_single_value(self):
        scores = [3, 3, 3, 3, 3]
        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)

        self.assertEqual(mean, 3)
        self.assertEqual(lower, 3)
        self.assertEqual(upper, 3)

    def test_bootstrap_ci_empty_list(self):
        mean, lower, upper = bootstrap_ci([])

        self.assertTrue(np.isnan(mean))
        self.assertTrue(np.isnan(lower))
        self.assertTrue(np.isnan(upper))
        os.remove(metrics_path)
        os.remove(aggregate_metrics_path)
        os.remove(output_file)
@@ -22,6 +22,7 @@ async def test_evaluate_answers_em_f1(adapter):
            "question": "What is 2 + 2?",
            "answer": "4",
            "golden_answer": "4",
            "retrieval_context": "2 + 2 = 4",
        }
    ]
@@ -77,6 +78,7 @@ async def test_none_values_in_answers(adapter):
            "question": None,
            "answer": None,
            "golden_answer": None,
            "retrieval_context": None,
        }
    ]
    evaluator_metrics = ["EM", "f1"]
@@ -2,6 +2,10 @@ import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock
import unittest
import numpy as np
from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci


with patch.dict(
    sys.modules,
@@ -56,3 +60,28 @@ def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range
    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
        f"F1 score failed for '{actual}' vs '{expected}'"
    )


class TestBootstrapCI(unittest.TestCase):
    def test_bootstrap_ci_basic(self):
        scores = [1, 2, 3, 4, 5]
        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)

        self.assertAlmostEqual(mean, np.mean(scores), places=2)
        self.assertLessEqual(lower, mean)
        self.assertGreaterEqual(upper, mean)

    def test_bootstrap_ci_single_value(self):
        scores = [3, 3, 3, 3, 3]
        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)

        self.assertEqual(mean, 3)
        self.assertEqual(lower, 3)
        self.assertEqual(upper, 3)

    def test_bootstrap_ci_empty_list(self):
        mean, lower, upper = bootstrap_ci([])

        self.assertTrue(np.isnan(mean))
        self.assertTrue(np.isnan(lower))
        self.assertTrue(np.isnan(upper))
evals/eval_framework/analysis/__init__.py (new file, 0 lines)
@@ -1,50 +1,12 @@
import json
from collections import defaultdict
import plotly.graph_objects as go
import numpy as np
from typing import Dict, List, Tuple
from collections import defaultdict


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
    means = []
    n = len(scores)
    for _ in range(num_samples):
        sample = np.random.choice(scores, size=n, replace=True)
        means.append(np.mean(sample))

    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
    return np.mean(scores), lower_bound, upper_bound


def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", benchmark=""):
    try:
        with open(json_data, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find the file: {json_data}")
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON from {json_data}: {e}")

    metrics_data = defaultdict(list)
    metric_details = defaultdict(list)

    for entry in data:
        for metric, values in entry["metrics"].items():
            score = values["score"]
            metrics_data[metric].append(score)
            if "reason" in values:
                metric_details[metric].append(
                    {
                        "question": entry["question"],
                        "answer": entry["answer"],
                        "golden_answer": entry["golden_answer"],
                        "reason": values["reason"],
                        "score": score,
                    }
                )

def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
    """Create distribution histogram plots for each metric."""
    figures = []

    for metric, scores in metrics_data.items():
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
@@ -57,13 +19,11 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
            template="seaborn",
        )
        figures.append(fig.to_html(full_html=False))
    return figures

    ci_results = {}
    for metric, scores in metrics_data.items():
        mean_score, lower, upper = bootstrap_ci(scores)
        ci_results[metric] = (mean_score, lower, upper)

    # Bar chart with confidence intervals
def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
    """Create confidence interval bar plot."""
    fig = go.Figure()
    for metric, (mean_score, lower, upper) in ci_results.items():
        fig.add_trace(
@@ -86,9 +46,29 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
        yaxis_title="Score",
        template="seaborn",
    )
    figures.append(fig.to_html(full_html=False))
    return fig.to_html(full_html=False)


def generate_details_html(metrics_data: List[Dict]) -> List[str]:
    """Generate HTML for detailed metric information."""
    details_html = []
    metric_details = {}

    # Organize metrics by type
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            if metric not in metric_details:
                metric_details[metric] = []
            metric_details[metric].append(
                {
                    "question": entry["question"],
                    "answer": entry["answer"],
                    "golden_answer": entry["golden_answer"],
                    "reason": values.get("reason", ""),
                    "score": values["score"],
                }
            )

    for metric, details in metric_details.items():
        details_html.append(f"<h3>{metric} Details</h3>")
        details_html.append("""
@@ -112,8 +92,14 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
                f"</tr>"
            )
        details_html.append("</table>")
    return details_html

    html_template = f"""

def get_dashboard_html_template(
    figures: List[str], details_html: List[str], benchmark: str = ""
) -> str:
    """Generate the complete HTML dashboard template."""
    return f"""
    <!DOCTYPE html>
    <html>
    <head>
@@ -132,7 +118,7 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
    <h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>

    <h2>Metrics Distribution</h2>
    {"".join([f'<div class="chart">{fig}</div>' for fig in figures[: len(metrics_data)]])}
    {"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}

    <h2>95% confidence interval for all the metrics</h2>
    <div class="chart">{figures[-1]}</div>
@@ -143,6 +129,44 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
    </html>
    """


def create_dashboard(
    metrics_path: str,
    aggregate_metrics_path: str,
    output_file: str = "dashboard_with_ci.html",
    benchmark: str = "",
) -> str:
    """Create and save the dashboard with all visualizations."""
    # Read metrics files
    with open(metrics_path, "r") as f:
        metrics_data = json.load(f)
    with open(aggregate_metrics_path, "r") as f:
        aggregate_data = json.load(f)

    # Extract data for visualizations
    metrics_by_type = defaultdict(list)
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            metrics_by_type[metric].append(values["score"])

    # Generate visualizations
    distribution_figures = create_distribution_plots(metrics_by_type)
    ci_plot = create_ci_plot(
        {
            metric: (data["mean"], data["ci_lower"], data["ci_upper"])
            for metric, data in aggregate_data.items()
        }
    )

    # Combine all figures
    figures = distribution_figures + [ci_plot]

    # Generate HTML components
    details_html = generate_details_html(metrics_data)
    dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)

    # Write to file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_template)
        f.write(dashboard_html)

    return output_file
evals/eval_framework/analysis/metrics_calculator.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import json
from collections import defaultdict
import numpy as np
from typing import Dict, List, Tuple


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
    """Calculate bootstrap confidence intervals for a list of scores."""
    means = []
    n = len(scores)
    for _ in range(num_samples):
        sample = np.random.choice(scores, size=n, replace=True)
        means.append(np.mean(sample))

    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
    return np.mean(scores), lower_bound, upper_bound


def load_metrics_data(json_file_path: str) -> List[Dict]:
    """Load metrics data from JSON file."""
    try:
        with open(json_file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find the file: {json_file_path}")
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")


def extract_metrics_and_details(
    data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
    """Extract metrics scores and details from evaluation data."""
    metrics_data = defaultdict(list)
    metric_details = defaultdict(list)

    for entry in data:
        for metric, values in entry["metrics"].items():
            score = values["score"]
            metrics_data[metric].append(score)
            if "reason" in values:
                metric_details[metric].append(
                    {
                        "question": entry["question"],
                        "answer": entry["answer"],
                        "golden_answer": entry["golden_answer"],
                        "reason": values["reason"],
                        "score": score,
                    }
                )

    return metrics_data, metric_details


def save_aggregate_metrics(
    metrics_data: Dict[str, List[float]],
    ci_results: Dict[str, Tuple[float, float, float]],
    output_path: str,
) -> None:
    """Save aggregated metrics and confidence intervals to file."""
    aggregate_data = {
        metric: {
            "scores": scores,
            "mean": ci_results[metric][0],
            "ci_lower": ci_results[metric][1],
            "ci_upper": ci_results[metric][2],
        }
        for metric, scores in metrics_data.items()
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(aggregate_data, f, indent=4)


def calculate_metrics_statistics(
    json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
    """Calculate metrics statistics and save aggregated results."""
    data = load_metrics_data(json_data)
    metrics_data, metric_details = extract_metrics_and_details(data)

    # Calculate confidence intervals
    ci_results = {}
    for metric, scores in metrics_data.items():
        mean_score, lower, upper = bootstrap_ci(scores)
        ci_results[metric] = (mean_score, lower, upper)

    # Save aggregate metrics
    save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)

    return metrics_data, metric_details, ci_results
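A small example (not part of the commit) of what `bootstrap_ci` above returns for a handful of scores:

```python
from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci

scores = [0.7, 0.75, 0.8, 0.85, 0.9]

# Returns (mean, lower_bound, upper_bound); the bounds are the percentile
# bootstrap interval of the resampled means at the requested confidence level.
mean, lower, upper = bootstrap_ci(scores, num_samples=10000, confidence_level=0.95)

assert lower <= mean <= upper  # mirrors the assertions in the new tests
print(f"mean={mean:.3f}, 95% CI=({lower:.3f}, {upper:.3f})")
```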
@@ -26,6 +26,9 @@ class EvalConfig(BaseSettings):
    ]  # Use only 'correctness' for DirectLLM
    deepeval_model: str = "gpt-4o-mini"

    # Metrics params
    calculate_metrics: bool = True

    # Visualization
    dashboard: bool = True
@@ -33,6 +36,7 @@ class EvalConfig(BaseSettings):
    questions_path: str = "questions_output.json"
    answers_path: str = "answers_output.json"
    metrics_path: str = "metrics_output.json"
    aggregate_metrics_path: str = "aggregate_metrics.json"
    dashboard_path: str = "dashboard.html"
    direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
    direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
@@ -49,10 +53,12 @@ class EvalConfig(BaseSettings):
            "evaluating_answers": self.evaluating_answers,
            "evaluation_engine": self.evaluation_engine,
            "evaluation_metrics": self.evaluation_metrics,
            "calculate_metrics": self.calculate_metrics,
            "dashboard": self.dashboard,
            "questions_path": self.questions_path,
            "answers_path": self.answers_path,
            "metrics_path": self.metrics_path,
            "aggregate_metrics_path": self.aggregate_metrics_path,
            "dashboard_path": self.dashboard_path,
            "deepeval_model": self.deepeval_model,
            "task_getter_type": self.task_getter_type,
@@ -1,7 +1,8 @@
import logging
import json
from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.get_relational_engine import (
    get_relational_engine,
@@ -28,8 +29,8 @@ async def create_and_insert_metrics_table(questions_payload):
        await session.commit()


async def run_evaluation(params: dict) -> None:
    if params.get("evaluating_answers"):
async def execute_evaluation(params: dict) -> None:
    """Execute the evaluation step and save results."""
    logging.info("Evaluation started...")
    try:
        with open(params["answers_path"], "r", encoding="utf-8") as f:
@@ -48,12 +49,36 @@ async def run_evaluation(params: dict) -> None:
        json.dump(metrics, f, ensure_ascii=False, indent=4)

    await create_and_insert_metrics_table(metrics)
    logging.info("Evaluation completed")

    logging.info("Evaluation End...")


async def run_evaluation(params: dict) -> None:
    """Run each step of the evaluation pipeline based on configuration flags."""
    # Step 1: Evaluate answers if requested
    if params.get("evaluating_answers"):
        await execute_evaluation(params)
    else:
        logging.info("Skipping evaluation as evaluating_answers is False")

    # Step 2: Calculate metrics if requested
    if params.get("calculate_metrics"):
        logging.info("Calculating metrics statistics...")
        calculate_metrics_statistics(
            json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
        )
        logging.info("Metrics calculation completed")
    else:
        logging.info("Skipping metrics calculation as calculate_metrics is False")

    # Step 3: Generate dashboard if requested
    if params.get("dashboard"):
        generate_metrics_dashboard(
            json_data=params["metrics_path"],
        logging.info("Generating dashboard...")
        create_dashboard(
            metrics_path=params["metrics_path"],
            aggregate_metrics_path=params["aggregate_metrics_path"],
            output_file=params["dashboard_path"],
            benchmark=params["benchmark"],
        )
        logging.info(f"Dashboard generated at {params['dashboard_path']}")
    else:
        logging.info("Skipping dashboard generation as dashboard is False")
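For orientation, a minimal sketch (not part of the diff) of driving the three flag-gated steps above directly from an `EvalConfig`; it assumes the config also carries the `benchmark` and path entries that `run_evaluation` reads:

```python
import asyncio

from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation

# calculate_metrics and dashboard are the new flags gating steps 2 and 3.
params = EvalConfig(
    evaluating_answers=True,
    calculate_metrics=True,
    dashboard=True,
).to_dict()

# run_evaluation is async, so drive it with asyncio when used as a script.
asyncio.run(run_evaluation(params))
```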
evals/eval_framework/modal_run_eval.py (new file, 116 lines)
@@ -0,0 +1,116 @@
import modal
import os
import json
import asyncio
import datetime
import logging
from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from evals.eval_framework.answer_generation.run_question_answering_module import (
    run_question_answering,
)
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation

logger = logging.getLogger(__name__)


def read_and_combine_metrics(eval_params: dict) -> dict:
    """Read and combine metrics files into a single result dictionary."""
    try:
        with open(eval_params["metrics_path"], "r") as f:
            metrics = json.load(f)
        with open(eval_params["aggregate_metrics_path"], "r") as f:
            aggregate_metrics = json.load(f)

        return {
            "task_getter_type": eval_params["task_getter_type"],
            "number_of_samples": eval_params["number_of_samples_in_corpus"],
            "metrics": metrics,
            "aggregate_metrics": aggregate_metrics,
        }
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logger.error(f"Error reading metrics files: {e}")
        return None


app = modal.App("modal-run-eval")

image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .env(
        {
            "ENV": os.getenv("ENV"),
            "LLM_API_KEY": os.getenv("LLM_API_KEY"),
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)


@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
async def modal_run_eval(eval_params=None):
    """Runs evaluation pipeline and returns combined metrics results."""
    if eval_params is None:
        eval_params = EvalConfig().to_dict()

    logger.info(f"Running evaluation with params: {eval_params}")

    # Run the evaluation pipeline
    await run_corpus_builder(eval_params)
    await run_question_answering(eval_params)
    await run_evaluation(eval_params)

    # Early return if metrics calculation wasn't requested
    if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
        logger.info(
            "Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
        )
        return None

    return read_and_combine_metrics(eval_params)


@app.local_entrypoint()
async def main():
    # List of configurations to run
    configs = [
        EvalConfig(
            task_getter_type="Default",
            number_of_samples_in_corpus=2,
            building_corpus_from_scratch=True,
            answering_questions=True,
            evaluating_answers=True,
            calculate_metrics=True,
            dashboard=False,
        ),
        EvalConfig(
            task_getter_type="Default",
            number_of_samples_in_corpus=10,
            building_corpus_from_scratch=True,
            answering_questions=True,
            evaluating_answers=True,
            calculate_metrics=True,
            dashboard=False,
        ),
    ]

    # Run evaluations in parallel with different configurations
    modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
    results = await asyncio.gather(*modal_tasks)

    # Filter out None results and save combined results
    results = [r for r in results if r is not None]
    if results:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"combined_results_{timestamp}.json"

        with open(output_file, "w") as f:
            json.dump(results, f, indent=2)

        logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
    else:
        logger.info("No metrics were collected from any of the evaluation runs")