Feat/cog 1331 modal run eval (#576)

<!-- .github/pull_request_template.md -->

## Description
- Split the metrics dashboard into two modules: a calculator (statistics) and a generator (visualization); a usage sketch follows below
- Added aggregate metrics calculation as a new phase in the evaluation pipeline
- Created a Modal example that runs multiple evaluations in parallel and collects the results into a single combined output
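
A minimal usage sketch of the two new modules (module paths and signatures as introduced in this PR; the file names are the `EvalConfig` defaults):

```python
# Hedged sketch: compute per-metric statistics, then render the dashboard
# from the per-question metrics file and the aggregate file.
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard

metrics_data, metric_details, ci_results = calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)
create_dashboard(
    metrics_path="metrics_output.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="my_benchmark",  # illustrative label
)
```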
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Enhanced metrics reporting with improved visualizations, including
histogram and confidence interval plots.
- Introduced an asynchronous evaluation process that supports parallel
execution and streamlined result aggregation.
- Added new configuration options to control metrics calculation and
aggregated output storage.

- **Refactor**
- Restructured dashboard generation and evaluation workflows into a more
modular, maintainable design.
- Improved error handling and logging for better feedback during
evaluation processes.

- **Bug Fixes**
- Updated test cases to ensure accurate validation of the new dashboard
generation and metrics calculation functionalities.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
Commit bee04cad86 (parent 8874ddad2e), authored by lxobr on 2025-03-03 14:22:32 +01:00 and committed via GitHub.
9 changed files with 429 additions and 151 deletions

View file

@@ -1,104 +1,88 @@
 import unittest
-from unittest.mock import patch
 import json
 import os
-import tempfile
-from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard, bootstrap_ci
-import numpy as np
+
+from evals.eval_framework.analysis.dashboard_generator import (
+    create_distribution_plots,
+    create_ci_plot,
+    generate_details_html,
+    get_dashboard_html_template,
+    create_dashboard,
+)
 
 
-class TestGenerateMetricsDashboard(unittest.TestCase):
+class TestDashboardFunctions(unittest.TestCase):
     def setUp(self):
-        self.test_data = [
+        """Set up test data."""
+        self.metrics_data = {
+            "accuracy": [0.8, 0.85, 0.9, 0.95, 1.0],
+            "f1_score": [0.7, 0.75, 0.8, 0.85, 0.9],
+        }
+        self.ci_data = {
+            "accuracy": (0.9, 0.85, 0.95),
+            "f1_score": (0.8, 0.75, 0.85),
+        }
+        self.detail_data = [
             {
                 "question": "What is AI?",
                 "answer": "Artificial Intelligence",
                 "golden_answer": "Artificial Intelligence",
                 "metrics": {
-                    "accuracy": {"score": 0.9, "reason": "Close enough"},
-                    "relevance": {"score": 0.8},
+                    "accuracy": {"score": 1.0, "reason": "Exact match"},
+                    "f1_score": {"score": 0.9, "reason": "High similarity"},
                 },
-            },
-            {
-                "question": "What is ML?",
-                "answer": "Machine Learning",
-                "golden_answer": "Machine Learning",
-                "metrics": {
-                    "accuracy": {"score": 0.95, "reason": "Exact match"},
-                    "relevance": {"score": 0.85},
-                },
-            },
+            }
         ]
-        self.temp_json = tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8")
-        json.dump(self.test_data, self.temp_json)
-        self.temp_json.close()
-        self.output_file = "test_dashboard.html"
-
-    def tearDown(self):
-        os.remove(self.temp_json.name)
-        if os.path.exists(self.output_file):
-            os.remove(self.output_file)
-
-    def test_generate_metrics_dashboard_valid_json(self):
-        """Test if the function processes valid JSON correctly and creates an output file."""
-        result = generate_metrics_dashboard(
-            self.temp_json.name, self.output_file, benchmark="Test Benchmark"
-        )
-        self.assertTrue(os.path.exists(self.output_file))
-        self.assertEqual(result, self.output_file)
-
-        with open(self.output_file, "r", encoding="utf-8") as f:
-            html_content = f.read()
-        self.assertIn("<title>LLM Evaluation Dashboard Test Benchmark</title>", html_content)
-        self.assertIn("accuracy", html_content)
-        self.assertIn("relevance", html_content)
-
-    @patch("evals.eval_framework.metrics_dashboard.bootstrap_ci", return_value=(0.9, 0.85, 0.95))
-    def test_generate_metrics_dashboard_ci_calculation(self, mock_bootstrap_ci):
-        """Test if bootstrap_ci is called with the correct parameters."""
-        generate_metrics_dashboard(self.temp_json.name, self.output_file)
-        mock_bootstrap_ci.assert_any_call([0.9, 0.95])  # For accuracy
-        mock_bootstrap_ci.assert_any_call([0.8, 0.85])  # For relevance
-
-    @patch("plotly.graph_objects.Figure.to_html", return_value="<div>Plotly Chart</div>")
-    def test_generate_metrics_dashboard_plotly_charts(self, mock_to_html):
-        """Test if Plotly figures are generated correctly."""
-        generate_metrics_dashboard(self.temp_json.name, self.output_file)
-        self.assertGreaterEqual(mock_to_html.call_count, 3)  # 2 metrics + CI chart
-        with open(self.output_file, "r", encoding="utf-8") as f:
-            file_content = f.read()
-        self.assertIn(
-            "<div>Plotly Chart</div>",
-            file_content,
-            "The output file does not contain the expected Plotly chart HTML.",
-        )
-
-
-class TestBootstrapCI(unittest.TestCase):
-    def test_bootstrap_ci_basic(self):
-        scores = [1, 2, 3, 4, 5]
-        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
-        self.assertAlmostEqual(mean, np.mean(scores), places=2)
-        self.assertLessEqual(lower, mean)
-        self.assertGreaterEqual(upper, mean)
-
-    def test_bootstrap_ci_single_value(self):
-        scores = [3, 3, 3, 3, 3]
-        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
-        self.assertEqual(mean, 3)
-        self.assertEqual(lower, 3)
-        self.assertEqual(upper, 3)
-
-    def test_bootstrap_ci_empty_list(self):
-        mean, lower, upper = bootstrap_ci([])
-        self.assertTrue(np.isnan(mean))
-        self.assertTrue(np.isnan(lower))
-        self.assertTrue(np.isnan(upper))
+
+    def test_generate_details_html(self):
+        """Test HTML details generation."""
+        html_output = generate_details_html(self.detail_data)
+
+        self.assertIn("<h3>accuracy Details</h3>", html_output[0])
+        self.assertIn("<th>Question</th>", html_output[1])
+        self.assertIn("Exact match", "".join(html_output))
+
+    def test_get_dashboard_html_template(self):
+        """Test full dashboard HTML generation."""
+        figures = create_distribution_plots(self.metrics_data)
+        ci_plot = create_ci_plot(self.ci_data)
+        dashboard_html = get_dashboard_html_template(
+            figures + [ci_plot], generate_details_html(self.detail_data), "Benchmark 1"
+        )
+
+        self.assertIn("<title>LLM Evaluation Dashboard Benchmark 1</title>", dashboard_html)
+        self.assertIn("<h2>Metrics Distribution</h2>", dashboard_html)
+        self.assertIn("<h2>95% confidence interval for all the metrics</h2>", dashboard_html)
+        self.assertIn("Benchmark 1", dashboard_html)
+
+    def test_create_dashboard(self):
+        """Test the full dashboard generation and file creation."""
+        metrics_path = "test_metrics.json"
+        aggregate_metrics_path = "test_aggregate.json"
+        output_file = "test_dashboard.html"
+
+        with open(metrics_path, "w") as f:
+            json.dump(self.detail_data, f)
+
+        with open(aggregate_metrics_path, "w") as f:
+            json.dump(
+                {
+                    metric: {"mean": v[0], "ci_lower": v[1], "ci_upper": v[2]}
+                    for metric, v in self.ci_data.items()
+                },
+                f,
+            )

+        output = create_dashboard(
+            metrics_path, aggregate_metrics_path, output_file, "Test Benchmark"
+        )
+
+        self.assertEqual(output, output_file)
+        self.assertTrue(os.path.exists(output_file))
+
+        os.remove(metrics_path)
+        os.remove(aggregate_metrics_path)
+        os.remove(output_file)

View file

@@ -22,6 +22,7 @@ async def test_evaluate_answers_em_f1(adapter):
             "question": "What is 2 + 2?",
             "answer": "4",
             "golden_answer": "4",
+            "retrieval_context": "2 + 2 = 4",
         }
     ]
@@ -77,6 +78,7 @@ async def test_none_values_in_answers(adapter):
             "question": None,
             "answer": None,
             "golden_answer": None,
+            "retrieval_context": None,
         }
     ]
     evaluator_metrics = ["EM", "f1"]

View file

@@ -2,6 +2,10 @@ import pytest
 from typing import Optional
 import sys
 from unittest.mock import patch, MagicMock
+import unittest
+import numpy as np
+
+from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci
 
 with patch.dict(
     sys.modules,
@@ -56,3 +60,28 @@ def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_ra
     assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
         f"F1 score failed for '{actual}' vs '{expected}'"
     )
+
+
+class TestBootstrapCI(unittest.TestCase):
+    def test_bootstrap_ci_basic(self):
+        scores = [1, 2, 3, 4, 5]
+        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
+        self.assertAlmostEqual(mean, np.mean(scores), places=2)
+        self.assertLessEqual(lower, mean)
+        self.assertGreaterEqual(upper, mean)
+
+    def test_bootstrap_ci_single_value(self):
+        scores = [3, 3, 3, 3, 3]
+        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
+        self.assertEqual(mean, 3)
+        self.assertEqual(lower, 3)
+        self.assertEqual(upper, 3)
+
+    def test_bootstrap_ci_empty_list(self):
+        mean, lower, upper = bootstrap_ci([])
+        self.assertTrue(np.isnan(mean))
+        self.assertTrue(np.isnan(lower))
+        self.assertTrue(np.isnan(upper))

View file

@@ -1,50 +1,12 @@
 import json
-from collections import defaultdict
 import plotly.graph_objects as go
-import numpy as np
+from typing import Dict, List, Tuple
+from collections import defaultdict
 
 
-def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
-    means = []
-    n = len(scores)
-    for _ in range(num_samples):
-        sample = np.random.choice(scores, size=n, replace=True)
-        means.append(np.mean(sample))
-    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
-    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
-    return np.mean(scores), lower_bound, upper_bound
-
-
-def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", benchmark=""):
-    try:
-        with open(json_data, "r", encoding="utf-8") as f:
-            data = json.load(f)
-    except FileNotFoundError:
-        raise FileNotFoundError(f"Could not find the file: {json_data}")
-    except json.JSONDecodeError as e:
-        raise ValueError(f"Error decoding JSON from {json_data}: {e}")
-
-    metrics_data = defaultdict(list)
-    metric_details = defaultdict(list)
-    for entry in data:
-        for metric, values in entry["metrics"].items():
-            score = values["score"]
-            metrics_data[metric].append(score)
-            if "reason" in values:
-                metric_details[metric].append(
-                    {
-                        "question": entry["question"],
-                        "answer": entry["answer"],
-                        "golden_answer": entry["golden_answer"],
-                        "reason": values["reason"],
-                        "score": score,
-                    }
-                )
-
+def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
+    """Create distribution histogram plots for each metric."""
     figures = []
     for metric, scores in metrics_data.items():
         fig = go.Figure()
         fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
@@ -57,13 +19,11 @@
             template="seaborn",
         )
         figures.append(fig.to_html(full_html=False))
+    return figures
 
-    ci_results = {}
-    for metric, scores in metrics_data.items():
-        mean_score, lower, upper = bootstrap_ci(scores)
-        ci_results[metric] = (mean_score, lower, upper)
 
-    # Bar chart with confidence intervals
+def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
+    """Create confidence interval bar plot."""
     fig = go.Figure()
     for metric, (mean_score, lower, upper) in ci_results.items():
         fig.add_trace(
@@ -86,9 +46,29 @@
         yaxis_title="Score",
         template="seaborn",
     )
-    figures.append(fig.to_html(full_html=False))
+    return fig.to_html(full_html=False)
+
 
+def generate_details_html(metrics_data: List[Dict]) -> List[str]:
+    """Generate HTML for detailed metric information."""
     details_html = []
+    metric_details = {}
+
+    # Organize metrics by type
+    for entry in metrics_data:
+        for metric, values in entry["metrics"].items():
+            if metric not in metric_details:
+                metric_details[metric] = []
+            metric_details[metric].append(
+                {
+                    "question": entry["question"],
+                    "answer": entry["answer"],
+                    "golden_answer": entry["golden_answer"],
+                    "reason": values.get("reason", ""),
+                    "score": values["score"],
+                }
+            )
+
     for metric, details in metric_details.items():
         details_html.append(f"<h3>{metric} Details</h3>")
         details_html.append("""
@@ -112,8 +92,14 @@
                 f"</tr>"
             )
         details_html.append("</table>")
+    return details_html
 
-    html_template = f"""
+
+def get_dashboard_html_template(
+    figures: List[str], details_html: List[str], benchmark: str = ""
+) -> str:
+    """Generate the complete HTML dashboard template."""
+    return f"""
     <!DOCTYPE html>
     <html>
     <head>
@@ -132,7 +118,7 @@
         <h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>
 
         <h2>Metrics Distribution</h2>
-        {"".join([f'<div class="chart">{fig}</div>' for fig in figures[: len(metrics_data)]])}
+        {"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}
 
         <h2>95% confidence interval for all the metrics</h2>
         <div class="chart">{figures[-1]}</div>
@@ -143,6 +129,44 @@
     </html>
     """
 
+
+def create_dashboard(
+    metrics_path: str,
+    aggregate_metrics_path: str,
+    output_file: str = "dashboard_with_ci.html",
+    benchmark: str = "",
+) -> str:
+    """Create and save the dashboard with all visualizations."""
+    # Read metrics files
+    with open(metrics_path, "r") as f:
+        metrics_data = json.load(f)
+    with open(aggregate_metrics_path, "r") as f:
+        aggregate_data = json.load(f)
+
+    # Extract data for visualizations
+    metrics_by_type = defaultdict(list)
+    for entry in metrics_data:
+        for metric, values in entry["metrics"].items():
+            metrics_by_type[metric].append(values["score"])
+
+    # Generate visualizations
+    distribution_figures = create_distribution_plots(metrics_by_type)
+    ci_plot = create_ci_plot(
+        {
+            metric: (data["mean"], data["ci_lower"], data["ci_upper"])
+            for metric, data in aggregate_data.items()
+        }
+    )
+
+    # Combine all figures
+    figures = distribution_figures + [ci_plot]
+
+    # Generate HTML components
+    details_html = generate_details_html(metrics_data)
+    dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)
+
+    # Write to file
     with open(output_file, "w", encoding="utf-8") as f:
-        f.write(html_template)
+        f.write(dashboard_html)
     return output_file
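
For reference, the same pieces can be assembled by hand, mirroring what the updated test above does. A minimal sketch with illustrative sample data, assuming the signatures shown in this diff:

```python
# Hedged sketch: build each dashboard component individually.
from evals.eval_framework.analysis.dashboard_generator import (
    create_distribution_plots,
    create_ci_plot,
    generate_details_html,
    get_dashboard_html_template,
)

figures = create_distribution_plots({"accuracy": [0.8, 0.9, 1.0]})  # list of histogram HTML snippets
ci_plot = create_ci_plot({"accuracy": (0.9, 0.85, 0.95)})  # (mean, ci_lower, ci_upper) per metric
details = generate_details_html(
    [
        {
            "question": "What is AI?",
            "answer": "Artificial Intelligence",
            "golden_answer": "Artificial Intelligence",
            "metrics": {"accuracy": {"score": 1.0, "reason": "Exact match"}},
        }
    ]
)
html = get_dashboard_html_template(figures + [ci_plot], details, "Benchmark 1")
```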

View file

@@ -0,0 +1,92 @@
import json
from collections import defaultdict
import numpy as np
from typing import Dict, List, Tuple
def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
"""Calculate bootstrap confidence intervals for a list of scores."""
means = []
n = len(scores)
for _ in range(num_samples):
sample = np.random.choice(scores, size=n, replace=True)
means.append(np.mean(sample))
lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
return np.mean(scores), lower_bound, upper_bound
def load_metrics_data(json_file_path: str) -> List[Dict]:
"""Load metrics data from JSON file."""
try:
with open(json_file_path, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {json_file_path}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")
def extract_metrics_and_details(
data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
"""Extract metrics scores and details from evaluation data."""
metrics_data = defaultdict(list)
metric_details = defaultdict(list)
for entry in data:
for metric, values in entry["metrics"].items():
score = values["score"]
metrics_data[metric].append(score)
if "reason" in values:
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values["reason"],
"score": score,
}
)
return metrics_data, metric_details
def save_aggregate_metrics(
metrics_data: Dict[str, List[float]],
ci_results: Dict[str, Tuple[float, float, float]],
output_path: str,
) -> None:
"""Save aggregated metrics and confidence intervals to file."""
aggregate_data = {
metric: {
"scores": scores,
"mean": ci_results[metric][0],
"ci_lower": ci_results[metric][1],
"ci_upper": ci_results[metric][2],
}
for metric, scores in metrics_data.items()
}
with open(output_path, "w", encoding="utf-8") as f:
json.dump(aggregate_data, f, indent=4)
def calculate_metrics_statistics(
json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
"""Calculate metrics statistics and save aggregated results."""
data = load_metrics_data(json_data)
metrics_data, metric_details = extract_metrics_and_details(data)
# Calculate confidence intervals
ci_results = {}
for metric, scores in metrics_data.items():
mean_score, lower, upper = bootstrap_ci(scores)
ci_results[metric] = (mean_score, lower, upper)
# Save aggregate metrics
save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)
return metrics_data, metric_details, ci_results
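
A small illustration of the central helper above; the scores are made-up and `num_samples` is reduced for speed:

```python
# Hedged example: bootstrap_ci resamples the scores with replacement and
# returns (mean, ci_lower, ci_upper) for the requested confidence level.
from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci

mean, lower, upper = bootstrap_ci([0.8, 0.85, 0.9, 0.95, 1.0], num_samples=1000)
print(f"accuracy: mean={mean:.3f}, 95% CI=({lower:.3f}, {upper:.3f})")
```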

View file

@@ -26,6 +26,9 @@ class EvalConfig(BaseSettings):
     ]  # Use only 'correctness' for DirectLLM
     deepeval_model: str = "gpt-4o-mini"
 
+    # Metrics params
+    calculate_metrics: bool = True
+
     # Visualization
     dashboard: bool = True
 
@@ -33,6 +36,7 @@ class EvalConfig(BaseSettings):
     questions_path: str = "questions_output.json"
     answers_path: str = "answers_output.json"
     metrics_path: str = "metrics_output.json"
+    aggregate_metrics_path: str = "aggregate_metrics.json"
     dashboard_path: str = "dashboard.html"
     direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
     direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
@@ -49,10 +53,12 @@ class EvalConfig(BaseSettings):
             "evaluating_answers": self.evaluating_answers,
             "evaluation_engine": self.evaluation_engine,
             "evaluation_metrics": self.evaluation_metrics,
+            "calculate_metrics": self.calculate_metrics,
             "dashboard": self.dashboard,
             "questions_path": self.questions_path,
             "answers_path": self.answers_path,
             "metrics_path": self.metrics_path,
+            "aggregate_metrics_path": self.aggregate_metrics_path,
             "dashboard_path": self.dashboard_path,
             "deepeval_model": self.deepeval_model,
             "task_getter_type": self.task_getter_type,

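The new flags ride along with the existing config flow. A hedged sketch, assuming `to_dict()` as used by the Modal example further down:

```python
# Hedged sketch: enable the aggregate-metrics phase and point it at a custom file.
from evals.eval_framework.eval_config import EvalConfig

params = EvalConfig(
    calculate_metrics=True,
    aggregate_metrics_path="aggregate_metrics.json",
    dashboard=True,
).to_dict()  # the dict consumed by run_evaluation(params)
```
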
View file

@@ -1,7 +1,8 @@
 import logging
 import json
 from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
-from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard
+from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
+from evals.eval_framework.analysis.dashboard_generator import create_dashboard
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.infrastructure.databases.relational.get_relational_engine import (
     get_relational_engine,
@@ -28,32 +29,56 @@ async def create_and_insert_metrics_table(questions_payload):
         await session.commit()
 
 
+async def execute_evaluation(params: dict) -> None:
+    """Execute the evaluation step and save results."""
+    logging.info("Evaluation started...")
+    try:
+        with open(params["answers_path"], "r", encoding="utf-8") as f:
+            answers = json.load(f)
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")
+
+    logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
+    evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
+    metrics = await evaluator.execute(
+        answers=answers, evaluator_metrics=params["evaluation_metrics"]
+    )
+    with open(params["metrics_path"], "w", encoding="utf-8") as f:
+        json.dump(metrics, f, ensure_ascii=False, indent=4)
+
+    await create_and_insert_metrics_table(metrics)
+    logging.info("Evaluation completed")
+
+
 async def run_evaluation(params: dict) -> None:
+    """Run each step of the evaluation pipeline based on configuration flags."""
+    # Step 1: Evaluate answers if requested
     if params.get("evaluating_answers"):
-        logging.info("Evaluation started...")
-        try:
-            with open(params["answers_path"], "r", encoding="utf-8") as f:
-                answers = json.load(f)
-        except FileNotFoundError:
-            raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")
-
-        logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
-        evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
-        metrics = await evaluator.execute(
-            answers=answers, evaluator_metrics=params["evaluation_metrics"]
-        )
-        with open(params["metrics_path"], "w", encoding="utf-8") as f:
-            json.dump(metrics, f, ensure_ascii=False, indent=4)
-
-        await create_and_insert_metrics_table(metrics)
-        logging.info("Evaluation End...")
+        await execute_evaluation(params)
+    else:
+        logging.info("Skipping evaluation as evaluating_answers is False")
 
+    # Step 2: Calculate metrics if requested
+    if params.get("calculate_metrics"):
+        logging.info("Calculating metrics statistics...")
+        calculate_metrics_statistics(
+            json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
+        )
+        logging.info("Metrics calculation completed")
+    else:
+        logging.info("Skipping metrics calculation as calculate_metrics is False")
+
+    # Step 3: Generate dashboard if requested
     if params.get("dashboard"):
-        generate_metrics_dashboard(
-            json_data=params["metrics_path"],
+        logging.info("Generating dashboard...")
+        create_dashboard(
+            metrics_path=params["metrics_path"],
+            aggregate_metrics_path=params["aggregate_metrics_path"],
             output_file=params["dashboard_path"],
             benchmark=params["benchmark"],
         )
+        logging.info(f"Dashboard generated at {params['dashboard_path']}")
+    else:
+        logging.info("Skipping dashboard generation as dashboard is False")
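
Putting the three phases together, a hedged usage sketch that assumes the answers file from the question-answering step already exists:

```python
# Hedged sketch: evaluation, metrics aggregation, and dashboard in one call.
import asyncio

from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation

params = EvalConfig(
    evaluating_answers=True,
    calculate_metrics=True,
    dashboard=True,
).to_dict()
asyncio.run(run_evaluation(params))
```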

View file

@@ -0,0 +1,116 @@
import modal
import os
import json
import asyncio
import datetime
import logging
from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from evals.eval_framework.answer_generation.run_question_answering_module import (
run_question_answering,
)
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
logger = logging.getLogger(__name__)
def read_and_combine_metrics(eval_params: dict) -> dict:
"""Read and combine metrics files into a single result dictionary."""
try:
with open(eval_params["metrics_path"], "r") as f:
metrics = json.load(f)
with open(eval_params["aggregate_metrics_path"], "r") as f:
aggregate_metrics = json.load(f)
return {
"task_getter_type": eval_params["task_getter_type"],
"number_of_samples": eval_params["number_of_samples_in_corpus"],
"metrics": metrics,
"aggregate_metrics": aggregate_metrics,
}
except (FileNotFoundError, json.JSONDecodeError) as e:
logger.error(f"Error reading metrics files: {e}")
return None
app = modal.App("modal-run-eval")
image = (
modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
.copy_local_file("pyproject.toml", "pyproject.toml")
.copy_local_file("poetry.lock", "poetry.lock")
.env(
{
"ENV": os.getenv("ENV"),
"LLM_API_KEY": os.getenv("LLM_API_KEY"),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
}
)
.poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
.pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)
@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
async def modal_run_eval(eval_params=None):
"""Runs evaluation pipeline and returns combined metrics results."""
if eval_params is None:
eval_params = EvalConfig().to_dict()
logger.info(f"Running evaluation with params: {eval_params}")
# Run the evaluation pipeline
await run_corpus_builder(eval_params)
await run_question_answering(eval_params)
await run_evaluation(eval_params)
# Early return if metrics calculation wasn't requested
if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
logger.info(
"Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
)
return None
return read_and_combine_metrics(eval_params)
@app.local_entrypoint()
async def main():
# List of configurations to run
configs = [
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=2,
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=10,
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
),
]
# Run evaluations in parallel with different configurations
modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
results = await asyncio.gather(*modal_tasks)
# Filter out None results and save combined results
results = [r for r in results if r is not None]
if results:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"combined_results_{timestamp}.json"
with open(output_file, "w") as f:
json.dump(results, f, indent=2)
logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
else:
logger.info("No metrics were collected from any of the evaluation runs")
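
Assuming the Modal CLI is installed and authenticated, the entrypoint above would typically be launched with `modal run <path/to/this/script.py>` (path illustrative): `main()` runs locally, while each `modal_run_eval.remote.aio(...)` call executes in its own remote container, so the two configurations run in parallel.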