Feat/cog 1331 modal run eval (#576)

<!-- .github/pull_request_template.md -->

## Description
- Split the metrics dashboard into two modules: a calculator (statistics) and a generator (visualization); see the usage sketch below
- Added aggregate metrics as a new phase of the evaluation pipeline
- Created a Modal example that runs multiple evaluations in parallel and collects the results into a single combined output
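
A minimal usage sketch of the new split (module paths are from this PR; the file names and benchmark label below are illustrative):

```python
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard

# Compute per-metric score lists and bootstrap confidence intervals, then persist them.
calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)

# Render the HTML dashboard from the raw metrics file and the aggregated statistics.
create_dashboard(
    metrics_path="metrics_output.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="HotPotQA",  # illustrative benchmark label
)
```
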
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Enhanced metrics reporting with improved visualizations, including
histogram and confidence interval plots.
- Introduced an asynchronous evaluation process that supports parallel
execution and streamlined result aggregation.
- Added new configuration options to control metrics calculation and
aggregated output storage.

- **Refactor**
- Restructured dashboard generation and evaluation workflows into a more
modular, maintainable design.
- Improved error handling and logging for better feedback during
evaluation processes.

- **Bug Fixes**
- Updated test cases to ensure accurate validation of the new dashboard
generation and metrics calculation functionalities.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
lxobr 2025-03-03 14:22:32 +01:00 committed by GitHub
parent 8874ddad2e
commit bee04cad86
9 changed files with 429 additions and 151 deletions

View file

@ -1,104 +1,88 @@
import unittest
import json
import os

from evals.eval_framework.analysis.dashboard_generator import (
    create_distribution_plots,
    create_ci_plot,
    generate_details_html,
    get_dashboard_html_template,
    create_dashboard,
)


class TestDashboardFunctions(unittest.TestCase):
    def setUp(self):
        """Set up test data."""
        self.metrics_data = {
            "accuracy": [0.8, 0.85, 0.9, 0.95, 1.0],
            "f1_score": [0.7, 0.75, 0.8, 0.85, 0.9],
        }
        self.ci_data = {
            "accuracy": (0.9, 0.85, 0.95),
            "f1_score": (0.8, 0.75, 0.85),
        }
        self.detail_data = [
            {
                "question": "What is AI?",
                "answer": "Artificial Intelligence",
                "golden_answer": "Artificial Intelligence",
                "metrics": {
                    "accuracy": {"score": 1.0, "reason": "Exact match"},
                    "f1_score": {"score": 0.9, "reason": "High similarity"},
                },
            }
        ]

    def test_generate_details_html(self):
        """Test HTML details generation."""
        html_output = generate_details_html(self.detail_data)

        self.assertIn("<h3>accuracy Details</h3>", html_output[0])
        self.assertIn("<th>Question</th>", html_output[1])
        self.assertIn("Exact match", "".join(html_output))

    def test_get_dashboard_html_template(self):
        """Test full dashboard HTML generation."""
        figures = create_distribution_plots(self.metrics_data)
        ci_plot = create_ci_plot(self.ci_data)
        dashboard_html = get_dashboard_html_template(
            figures + [ci_plot], generate_details_html(self.detail_data), "Benchmark 1"
        )

        self.assertIn("<title>LLM Evaluation Dashboard Benchmark 1</title>", dashboard_html)
        self.assertIn("<h2>Metrics Distribution</h2>", dashboard_html)
        self.assertIn("<h2>95% confidence interval for all the metrics</h2>", dashboard_html)
        self.assertIn("Benchmark 1", dashboard_html)

    def test_create_dashboard(self):
        """Test the full dashboard generation and file creation."""
        metrics_path = "test_metrics.json"
        aggregate_metrics_path = "test_aggregate.json"
        output_file = "test_dashboard.html"

        with open(metrics_path, "w") as f:
            json.dump(self.detail_data, f)

        with open(aggregate_metrics_path, "w") as f:
            json.dump(
                {
                    metric: {"mean": v[0], "ci_lower": v[1], "ci_upper": v[2]}
                    for metric, v in self.ci_data.items()
                },
                f,
            )

        output = create_dashboard(
            metrics_path, aggregate_metrics_path, output_file, "Test Benchmark"
        )

        self.assertEqual(output, output_file)
        self.assertTrue(os.path.exists(output_file))

        os.remove(metrics_path)
        os.remove(aggregate_metrics_path)
        os.remove(output_file)

View file

@ -22,6 +22,7 @@ async def test_evaluate_answers_em_f1(adapter):
"question": "What is 2 + 2?",
"answer": "4",
"golden_answer": "4",
"retrieval_context": "2 + 2 = 4",
}
]
@ -77,6 +78,7 @@ async def test_none_values_in_answers(adapter):
"question": None,
"answer": None,
"golden_answer": None,
"retrieval_context": None,
}
]
evaluator_metrics = ["EM", "f1"]

View file

@ -2,6 +2,10 @@ import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock
import unittest
import numpy as np
from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci
with patch.dict(
sys.modules,
@ -56,3 +60,28 @@ def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_ra
assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
f"F1 score failed for '{actual}' vs '{expected}'"
)
class TestBootstrapCI(unittest.TestCase):
def test_bootstrap_ci_basic(self):
scores = [1, 2, 3, 4, 5]
mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
self.assertAlmostEqual(mean, np.mean(scores), places=2)
self.assertLessEqual(lower, mean)
self.assertGreaterEqual(upper, mean)
def test_bootstrap_ci_single_value(self):
scores = [3, 3, 3, 3, 3]
mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
self.assertEqual(mean, 3)
self.assertEqual(lower, 3)
self.assertEqual(upper, 3)
def test_bootstrap_ci_empty_list(self):
mean, lower, upper = bootstrap_ci([])
self.assertTrue(np.isnan(mean))
self.assertTrue(np.isnan(lower))
self.assertTrue(np.isnan(upper))

View file

@ -1,50 +1,12 @@
import json
from collections import defaultdict
import plotly.graph_objects as go
from typing import Dict, List, Tuple


def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
    """Create distribution histogram plots for each metric."""
    figures = []
    for metric, scores in metrics_data.items():
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
@ -57,13 +19,11 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
            template="seaborn",
        )
        figures.append(fig.to_html(full_html=False))
    return figures


def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
    """Create confidence interval bar plot."""
    fig = go.Figure()
    for metric, (mean_score, lower, upper) in ci_results.items():
        fig.add_trace(
@ -86,9 +46,29 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
        yaxis_title="Score",
        template="seaborn",
    )
    return fig.to_html(full_html=False)


def generate_details_html(metrics_data: List[Dict]) -> List[str]:
    """Generate HTML for detailed metric information."""
    details_html = []
    metric_details = {}

    # Organize metrics by type
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            if metric not in metric_details:
                metric_details[metric] = []
            metric_details[metric].append(
                {
                    "question": entry["question"],
                    "answer": entry["answer"],
                    "golden_answer": entry["golden_answer"],
                    "reason": values.get("reason", ""),
                    "score": values["score"],
                }
            )

    for metric, details in metric_details.items():
        details_html.append(f"<h3>{metric} Details</h3>")
        details_html.append("""
@ -112,8 +92,14 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
                f"</tr>"
            )
        details_html.append("</table>")
    return details_html


def get_dashboard_html_template(
    figures: List[str], details_html: List[str], benchmark: str = ""
) -> str:
    """Generate the complete HTML dashboard template."""
    return f"""
    <!DOCTYPE html>
    <html>
    <head>
@ -132,7 +118,7 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
        <h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>

        <h2>Metrics Distribution</h2>
        {"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}

        <h2>95% confidence interval for all the metrics</h2>
        <div class="chart">{figures[-1]}</div>
@ -143,6 +129,44 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
    </html>
    """


def create_dashboard(
    metrics_path: str,
    aggregate_metrics_path: str,
    output_file: str = "dashboard_with_ci.html",
    benchmark: str = "",
) -> str:
    """Create and save the dashboard with all visualizations."""
    # Read metrics files
    with open(metrics_path, "r") as f:
        metrics_data = json.load(f)
    with open(aggregate_metrics_path, "r") as f:
        aggregate_data = json.load(f)

    # Extract data for visualizations
    metrics_by_type = defaultdict(list)
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            metrics_by_type[metric].append(values["score"])

    # Generate visualizations
    distribution_figures = create_distribution_plots(metrics_by_type)
    ci_plot = create_ci_plot(
        {
            metric: (data["mean"], data["ci_lower"], data["ci_upper"])
            for metric, data in aggregate_data.items()
        }
    )

    # Combine all figures
    figures = distribution_figures + [ci_plot]

    # Generate HTML components
    details_html = generate_details_html(metrics_data)
    dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)

    # Write to file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(dashboard_html)
    return output_file
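
For reference, create_dashboard expects the aggregate metrics file written by the calculator (see save_aggregate_metrics below) to map each metric name to its summary statistics; a sketch with illustrative values:

{
    "accuracy": {"scores": [0.8, 0.9, 1.0], "mean": 0.9, "ci_lower": 0.85, "ci_upper": 0.95},
    "f1_score": {"scores": [0.7, 0.8, 0.9], "mean": 0.8, "ci_lower": 0.75, "ci_upper": 0.85}
}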

View file

@ -0,0 +1,92 @@
import json
from collections import defaultdict
import numpy as np
from typing import Dict, List, Tuple
def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
"""Calculate bootstrap confidence intervals for a list of scores."""
means = []
n = len(scores)
for _ in range(num_samples):
sample = np.random.choice(scores, size=n, replace=True)
means.append(np.mean(sample))
lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
return np.mean(scores), lower_bound, upper_bound
def load_metrics_data(json_file_path: str) -> List[Dict]:
"""Load metrics data from JSON file."""
try:
with open(json_file_path, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {json_file_path}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")
def extract_metrics_and_details(
data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
"""Extract metrics scores and details from evaluation data."""
metrics_data = defaultdict(list)
metric_details = defaultdict(list)
for entry in data:
for metric, values in entry["metrics"].items():
score = values["score"]
metrics_data[metric].append(score)
if "reason" in values:
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values["reason"],
"score": score,
}
)
return metrics_data, metric_details
def save_aggregate_metrics(
metrics_data: Dict[str, List[float]],
ci_results: Dict[str, Tuple[float, float, float]],
output_path: str,
) -> None:
"""Save aggregated metrics and confidence intervals to file."""
aggregate_data = {
metric: {
"scores": scores,
"mean": ci_results[metric][0],
"ci_lower": ci_results[metric][1],
"ci_upper": ci_results[metric][2],
}
for metric, scores in metrics_data.items()
}
with open(output_path, "w", encoding="utf-8") as f:
json.dump(aggregate_data, f, indent=4)
def calculate_metrics_statistics(
json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
"""Calculate metrics statistics and save aggregated results."""
data = load_metrics_data(json_data)
metrics_data, metric_details = extract_metrics_and_details(data)
# Calculate confidence intervals
ci_results = {}
for metric, scores in metrics_data.items():
mean_score, lower, upper = bootstrap_ci(scores)
ci_results[metric] = (mean_score, lower, upper)
# Save aggregate metrics
save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)
return metrics_data, metric_details, ci_results
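
A quick sanity check on bootstrap_ci as defined above: with confidence_level=0.95 the bounds are the 2.5th and 97.5th percentiles of the resampled means, since (1 - 0.95) / 2 * 100 = 2.5 and (1 + 0.95) / 2 * 100 = 97.5. A minimal usage sketch (scores are illustrative; the bounds vary slightly per run because resampling is random):

from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci

mean, lower, upper = bootstrap_ci([0.8, 0.85, 0.9, 0.95, 1.0], num_samples=1000)
# mean is the plain average of the scores; lower <= mean <= upper for the 95% interval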

View file

@ -26,6 +26,9 @@ class EvalConfig(BaseSettings):
] # Use only 'correctness' for DirectLLM
deepeval_model: str = "gpt-4o-mini"
# Metrics params
calculate_metrics: bool = True
# Visualization
dashboard: bool = True
@ -33,6 +36,7 @@ class EvalConfig(BaseSettings):
questions_path: str = "questions_output.json"
answers_path: str = "answers_output.json"
metrics_path: str = "metrics_output.json"
aggregate_metrics_path: str = "aggregate_metrics.json"
dashboard_path: str = "dashboard.html"
direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
@ -49,10 +53,12 @@ class EvalConfig(BaseSettings):
"evaluating_answers": self.evaluating_answers,
"evaluation_engine": self.evaluation_engine,
"evaluation_metrics": self.evaluation_metrics,
"calculate_metrics": self.calculate_metrics,
"dashboard": self.dashboard,
"questions_path": self.questions_path,
"answers_path": self.answers_path,
"metrics_path": self.metrics_path,
"aggregate_metrics_path": self.aggregate_metrics_path,
"dashboard_path": self.dashboard_path,
"deepeval_model": self.deepeval_model,
"task_getter_type": self.task_getter_type,

View file

@ -1,7 +1,8 @@
import logging
import json
from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.get_relational_engine import (
get_relational_engine,
@ -28,8 +29,8 @@ async def create_and_insert_metrics_table(questions_payload):
await session.commit()
async def execute_evaluation(params: dict) -> None:
"""Execute the evaluation step and save results."""
logging.info("Evaluation started...")
try:
with open(params["answers_path"], "r", encoding="utf-8") as f:
@ -48,12 +49,36 @@ async def run_evaluation(params: dict) -> None:
json.dump(metrics, f, ensure_ascii=False, indent=4)
await create_and_insert_metrics_table(metrics)
logging.info("Evaluation completed")
logging.info("Evaluation End...")
async def run_evaluation(params: dict) -> None:
"""Run each step of the evaluation pipeline based on configuration flags."""
# Step 1: Evaluate answers if requested
if params.get("evaluating_answers"):
await execute_evaluation(params)
else:
logging.info("Skipping evaluation as evaluating_answers is False")
# Step 2: Calculate metrics if requested
if params.get("calculate_metrics"):
logging.info("Calculating metrics statistics...")
calculate_metrics_statistics(
json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
)
logging.info("Metrics calculation completed")
else:
logging.info("Skipping metrics calculation as calculate_metrics is False")
# Step 3: Generate dashboard if requested
if params.get("dashboard"):
logging.info("Generating dashboard...")
create_dashboard(
metrics_path=params["metrics_path"],
aggregate_metrics_path=params["aggregate_metrics_path"],
output_file=params["dashboard_path"],
benchmark=params["benchmark"],
)
logging.info(f"Dashboard generated at {params['dashboard_path']}")
else:
logging.info("Skipping dashboard generation as dashboard is False")

View file

@ -0,0 +1,116 @@
import modal
import os
import json
import asyncio
import datetime
import logging
from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from evals.eval_framework.answer_generation.run_question_answering_module import (
run_question_answering,
)
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
logger = logging.getLogger(__name__)
def read_and_combine_metrics(eval_params: dict) -> dict:
"""Read and combine metrics files into a single result dictionary."""
try:
with open(eval_params["metrics_path"], "r") as f:
metrics = json.load(f)
with open(eval_params["aggregate_metrics_path"], "r") as f:
aggregate_metrics = json.load(f)
return {
"task_getter_type": eval_params["task_getter_type"],
"number_of_samples": eval_params["number_of_samples_in_corpus"],
"metrics": metrics,
"aggregate_metrics": aggregate_metrics,
}
except (FileNotFoundError, json.JSONDecodeError) as e:
logger.error(f"Error reading metrics files: {e}")
return None
app = modal.App("modal-run-eval")
image = (
modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
.copy_local_file("pyproject.toml", "pyproject.toml")
.copy_local_file("poetry.lock", "poetry.lock")
.env(
{
"ENV": os.getenv("ENV"),
"LLM_API_KEY": os.getenv("LLM_API_KEY"),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
}
)
.poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
.pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)
@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
async def modal_run_eval(eval_params=None):
"""Runs evaluation pipeline and returns combined metrics results."""
if eval_params is None:
eval_params = EvalConfig().to_dict()
logger.info(f"Running evaluation with params: {eval_params}")
# Run the evaluation pipeline
await run_corpus_builder(eval_params)
await run_question_answering(eval_params)
await run_evaluation(eval_params)
# Early return if metrics calculation wasn't requested
if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
logger.info(
"Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
)
return None
return read_and_combine_metrics(eval_params)
@app.local_entrypoint()
async def main():
# List of configurations to run
configs = [
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=2,
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=10,
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
),
]
# Run evaluations in parallel with different configurations
modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
results = await asyncio.gather(*modal_tasks)
# Filter out None results and save combined results
results = [r for r in results if r is not None]
if results:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"combined_results_{timestamp}.json"
with open(output_file, "w") as f:
json.dump(results, f, indent=2)
logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
else:
logger.info("No metrics were collected from any of the evaluation runs")