diff --git a/cognee/tests/unit/eval_framework/dashboard_test.py b/cognee/tests/unit/eval_framework/dashboard_test.py
index 3fda78189..fe4424b9e 100644
--- a/cognee/tests/unit/eval_framework/dashboard_test.py
+++ b/cognee/tests/unit/eval_framework/dashboard_test.py
@@ -1,104 +1,88 @@
import unittest
-from unittest.mock import patch
import json
import os
-import tempfile
-from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard, bootstrap_ci
-import numpy as np
-class TestGenerateMetricsDashboard(unittest.TestCase):
+from evals.eval_framework.analysis.dashboard_generator import (
+ create_distribution_plots,
+ create_ci_plot,
+ generate_details_html,
+ get_dashboard_html_template,
+ create_dashboard,
+)
+
+
+class TestDashboardFunctions(unittest.TestCase):
def setUp(self):
- self.test_data = [
+ """Set up test data."""
+ self.metrics_data = {
+ "accuracy": [0.8, 0.85, 0.9, 0.95, 1.0],
+ "f1_score": [0.7, 0.75, 0.8, 0.85, 0.9],
+ }
+
+ self.ci_data = {
+ "accuracy": (0.9, 0.85, 0.95),
+ "f1_score": (0.8, 0.75, 0.85),
+ }
+
+ self.detail_data = [
{
"question": "What is AI?",
"answer": "Artificial Intelligence",
"golden_answer": "Artificial Intelligence",
"metrics": {
- "accuracy": {"score": 0.9, "reason": "Close enough"},
- "relevance": {"score": 0.8},
+ "accuracy": {"score": 1.0, "reason": "Exact match"},
+ "f1_score": {"score": 0.9, "reason": "High similarity"},
},
- },
- {
- "question": "What is ML?",
- "answer": "Machine Learning",
- "golden_answer": "Machine Learning",
- "metrics": {
- "accuracy": {"score": 0.95, "reason": "Exact match"},
- "relevance": {"score": 0.85},
- },
- },
+ }
]
- self.temp_json = tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8")
- json.dump(self.test_data, self.temp_json)
- self.temp_json.close()
- self.output_file = "test_dashboard.html"
+ def test_generate_details_html(self):
+ """Test HTML details generation."""
+ html_output = generate_details_html(self.detail_data)
- def tearDown(self):
- os.remove(self.temp_json.name)
- if os.path.exists(self.output_file):
- os.remove(self.output_file)
+ self.assertIn("
accuracy Details
", html_output[0])
+ self.assertIn("Question | ", html_output[1])
+ self.assertIn("Exact match", "".join(html_output))
- def test_generate_metrics_dashboard_valid_json(self):
- """Test if the function processes valid JSON correctly and creates an output file."""
- result = generate_metrics_dashboard(
- self.temp_json.name, self.output_file, benchmark="Test Benchmark"
+ def test_get_dashboard_html_template(self):
+ """Test full dashboard HTML generation."""
+ figures = create_distribution_plots(self.metrics_data)
+ ci_plot = create_ci_plot(self.ci_data)
+ dashboard_html = get_dashboard_html_template(
+ figures + [ci_plot], generate_details_html(self.detail_data), "Benchmark 1"
)
- self.assertTrue(os.path.exists(self.output_file))
- self.assertEqual(result, self.output_file)
+ self.assertIn("LLM Evaluation Dashboard Benchmark 1", dashboard_html)
+ self.assertIn("Metrics Distribution
", dashboard_html)
+ self.assertIn("95% confidence interval for all the metrics
", dashboard_html)
+ self.assertIn("Benchmark 1", dashboard_html)
- with open(self.output_file, "r", encoding="utf-8") as f:
- html_content = f.read()
- self.assertIn("LLM Evaluation Dashboard Test Benchmark", html_content)
- self.assertIn("accuracy", html_content)
- self.assertIn("relevance", html_content)
+ def test_create_dashboard(self):
+ """Test the full dashboard generation and file creation."""
+ metrics_path = "test_metrics.json"
+ aggregate_metrics_path = "test_aggregate.json"
+ output_file = "test_dashboard.html"
- @patch("evals.eval_framework.metrics_dashboard.bootstrap_ci", return_value=(0.9, 0.85, 0.95))
- def test_generate_metrics_dashboard_ci_calculation(self, mock_bootstrap_ci):
- """Test if bootstrap_ci is called with the correct parameters."""
- generate_metrics_dashboard(self.temp_json.name, self.output_file)
+ with open(metrics_path, "w") as f:
+ json.dump(self.detail_data, f)
- mock_bootstrap_ci.assert_any_call([0.9, 0.95]) # For accuracy
- mock_bootstrap_ci.assert_any_call([0.8, 0.85]) # For relevance
-
- @patch("plotly.graph_objects.Figure.to_html", return_value="Plotly Chart
")
- def test_generate_metrics_dashboard_plotly_charts(self, mock_to_html):
- """Test if Plotly figures are generated correctly."""
- generate_metrics_dashboard(self.temp_json.name, self.output_file)
-
- self.assertGreaterEqual(mock_to_html.call_count, 3) # 2 metrics + CI chart
-
- with open(self.output_file, "r", encoding="utf-8") as f:
- file_content = f.read()
- self.assertIn(
- "Plotly Chart
",
- file_content,
- "The output file does not contain the expected Plotly chart HTML.",
+ with open(aggregate_metrics_path, "w") as f:
+ json.dump(
+ {
+ metric: {"mean": v[0], "ci_lower": v[1], "ci_upper": v[2]}
+ for metric, v in self.ci_data.items()
+ },
+ f,
)
+ output = create_dashboard(
+ metrics_path, aggregate_metrics_path, output_file, "Test Benchmark"
+ )
-class TestBootstrapCI(unittest.TestCase):
- def test_bootstrap_ci_basic(self):
- scores = [1, 2, 3, 4, 5]
- mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
+ self.assertEqual(output, output_file)
+ self.assertTrue(os.path.exists(output_file))
- self.assertAlmostEqual(mean, np.mean(scores), places=2)
- self.assertLessEqual(lower, mean)
- self.assertGreaterEqual(upper, mean)
-
- def test_bootstrap_ci_single_value(self):
- scores = [3, 3, 3, 3, 3]
- mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
-
- self.assertEqual(mean, 3)
- self.assertEqual(lower, 3)
- self.assertEqual(upper, 3)
-
- def test_bootstrap_ci_empty_list(self):
- mean, lower, upper = bootstrap_ci([])
-
- self.assertTrue(np.isnan(mean))
- self.assertTrue(np.isnan(lower))
- self.assertTrue(np.isnan(upper))
+ os.remove(metrics_path)
+ os.remove(aggregate_metrics_path)
+ os.remove(output_file)
diff --git a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
index d4226cc66..3b0a0a19d 100644
--- a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
+++ b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
@@ -22,6 +22,7 @@ async def test_evaluate_answers_em_f1(adapter):
"question": "What is 2 + 2?",
"answer": "4",
"golden_answer": "4",
+ "retrieval_context": "2 + 2 = 4",
}
]
@@ -77,6 +78,7 @@ async def test_none_values_in_answers(adapter):
"question": None,
"answer": None,
"golden_answer": None,
+ "retrieval_context": None,
}
]
evaluator_metrics = ["EM", "f1"]
diff --git a/cognee/tests/unit/eval_framework/metrics_test.py b/cognee/tests/unit/eval_framework/metrics_test.py
index c67e845b5..719995229 100644
--- a/cognee/tests/unit/eval_framework/metrics_test.py
+++ b/cognee/tests/unit/eval_framework/metrics_test.py
@@ -2,6 +2,10 @@ import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock
+import unittest
+import numpy as np
+from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci
+
with patch.dict(
sys.modules,
@@ -56,3 +60,28 @@ def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_ra
assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
f"F1 score failed for '{actual}' vs '{expected}'"
)
+
+
+class TestBootstrapCI(unittest.TestCase):
+ def test_bootstrap_ci_basic(self):
+ scores = [1, 2, 3, 4, 5]
+ mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
+
+ self.assertAlmostEqual(mean, np.mean(scores), places=2)
+ self.assertLessEqual(lower, mean)
+ self.assertGreaterEqual(upper, mean)
+
+ def test_bootstrap_ci_single_value(self):
+ scores = [3, 3, 3, 3, 3]
+ mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
+
+ self.assertEqual(mean, 3)
+ self.assertEqual(lower, 3)
+ self.assertEqual(upper, 3)
+
+ def test_bootstrap_ci_empty_list(self):
+ mean, lower, upper = bootstrap_ci([])
+
+ self.assertTrue(np.isnan(mean))
+ self.assertTrue(np.isnan(lower))
+ self.assertTrue(np.isnan(upper))
diff --git a/evals/eval_framework/analysis/__init__.py b/evals/eval_framework/analysis/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/evals/eval_framework/metrics_dashboard.py b/evals/eval_framework/analysis/dashboard_generator.py
similarity index 57%
rename from evals/eval_framework/metrics_dashboard.py
rename to evals/eval_framework/analysis/dashboard_generator.py
index 739f3f605..2c917740a 100644
--- a/evals/eval_framework/metrics_dashboard.py
+++ b/evals/eval_framework/analysis/dashboard_generator.py
@@ -1,50 +1,12 @@
import json
-from collections import defaultdict
import plotly.graph_objects as go
-import numpy as np
+from typing import Dict, List, Tuple
+from collections import defaultdict
-def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
- means = []
- n = len(scores)
- for _ in range(num_samples):
- sample = np.random.choice(scores, size=n, replace=True)
- means.append(np.mean(sample))
-
- lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
- upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
- return np.mean(scores), lower_bound, upper_bound
-
-
-def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", benchmark=""):
- try:
- with open(json_data, "r", encoding="utf-8") as f:
- data = json.load(f)
- except FileNotFoundError:
- raise FileNotFoundError(f"Could not find the file: {json_data}")
- except json.JSONDecodeError as e:
- raise ValueError(f"Error decoding JSON from {json_data}: {e}")
-
- metrics_data = defaultdict(list)
- metric_details = defaultdict(list)
-
- for entry in data:
- for metric, values in entry["metrics"].items():
- score = values["score"]
- metrics_data[metric].append(score)
- if "reason" in values:
- metric_details[metric].append(
- {
- "question": entry["question"],
- "answer": entry["answer"],
- "golden_answer": entry["golden_answer"],
- "reason": values["reason"],
- "score": score,
- }
- )
-
+def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
+ """Create distribution histogram plots for each metric."""
figures = []
-
for metric, scores in metrics_data.items():
fig = go.Figure()
fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
@@ -57,13 +19,11 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
template="seaborn",
)
figures.append(fig.to_html(full_html=False))
+ return figures
- ci_results = {}
- for metric, scores in metrics_data.items():
- mean_score, lower, upper = bootstrap_ci(scores)
- ci_results[metric] = (mean_score, lower, upper)
- # Bar chart with confidence intervals
+def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
+ """Create confidence interval bar plot."""
fig = go.Figure()
for metric, (mean_score, lower, upper) in ci_results.items():
fig.add_trace(
@@ -86,9 +46,29 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
yaxis_title="Score",
template="seaborn",
)
- figures.append(fig.to_html(full_html=False))
+ return fig.to_html(full_html=False)
+
+def generate_details_html(metrics_data: List[Dict]) -> List[str]:
+ """Generate HTML for detailed metric information."""
details_html = []
+ metric_details = {}
+
+ # Organize metrics by type
+ for entry in metrics_data:
+ for metric, values in entry["metrics"].items():
+ if metric not in metric_details:
+ metric_details[metric] = []
+ metric_details[metric].append(
+ {
+ "question": entry["question"],
+ "answer": entry["answer"],
+ "golden_answer": entry["golden_answer"],
+ "reason": values.get("reason", ""),
+ "score": values["score"],
+ }
+ )
+
for metric, details in metric_details.items():
details_html.append(f"{metric} Details
")
details_html.append("""
@@ -112,8 +92,14 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
f""
)
details_html.append("")
+ return details_html
- html_template = f"""
+
+def get_dashboard_html_template(
+ figures: List[str], details_html: List[str], benchmark: str = ""
+) -> str:
+ """Generate the complete HTML dashboard template."""
+ return f"""
@@ -132,7 +118,7 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
LLM Evaluation Metrics Dashboard {benchmark}
Metrics Distribution
- {"".join([f'{fig}
' for fig in figures[: len(metrics_data)]])}
+ {"".join([f'{fig}
' for fig in figures[:-1]])}
95% confidence interval for all the metrics
{figures[-1]}
@@ -143,6 +129,44 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
"""
+
+def create_dashboard(
+ metrics_path: str,
+ aggregate_metrics_path: str,
+ output_file: str = "dashboard_with_ci.html",
+ benchmark: str = "",
+) -> str:
+ """Create and save the dashboard with all visualizations."""
+ # Read metrics files
+ with open(metrics_path, "r") as f:
+ metrics_data = json.load(f)
+ with open(aggregate_metrics_path, "r") as f:
+ aggregate_data = json.load(f)
+
+ # Extract data for visualizations
+ metrics_by_type = defaultdict(list)
+ for entry in metrics_data:
+ for metric, values in entry["metrics"].items():
+ metrics_by_type[metric].append(values["score"])
+
+ # Generate visualizations
+ distribution_figures = create_distribution_plots(metrics_by_type)
+ ci_plot = create_ci_plot(
+ {
+ metric: (data["mean"], data["ci_lower"], data["ci_upper"])
+ for metric, data in aggregate_data.items()
+ }
+ )
+
+ # Combine all figures
+ figures = distribution_figures + [ci_plot]
+
+ # Generate HTML components
+ details_html = generate_details_html(metrics_data)
+ dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)
+
+ # Write to file
with open(output_file, "w", encoding="utf-8") as f:
- f.write(html_template)
+ f.write(dashboard_html)
+
return output_file
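
Note: with the dashboard logic split into composable helpers, the pieces can be exercised directly, mirroring the fixtures in dashboard_test.py above. A minimal sketch under those assumptions (the output path is illustrative, not part of this diff):

from evals.eval_framework.analysis.dashboard_generator import (
    create_distribution_plots,
    create_ci_plot,
    generate_details_html,
    get_dashboard_html_template,
)

# Shapes mirror the test fixtures: raw scores per metric, (mean, ci_lower,
# ci_upper) per metric, and per-question detail entries.
metrics_by_type = {"accuracy": [0.8, 0.85, 0.9, 0.95, 1.0]}
ci_results = {"accuracy": (0.9, 0.85, 0.95)}
details = [
    {
        "question": "What is AI?",
        "answer": "Artificial Intelligence",
        "golden_answer": "Artificial Intelligence",
        "metrics": {"accuracy": {"score": 1.0, "reason": "Exact match"}},
    }
]

figures = create_distribution_plots(metrics_by_type) + [create_ci_plot(ci_results)]
html = get_dashboard_html_template(figures, generate_details_html(details), "Benchmark 1")
with open("dashboard_preview.html", "w", encoding="utf-8") as f:  # illustrative path
    f.write(html)
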
diff --git a/evals/eval_framework/analysis/metrics_calculator.py b/evals/eval_framework/analysis/metrics_calculator.py
new file mode 100644
index 000000000..ca1960748
--- /dev/null
+++ b/evals/eval_framework/analysis/metrics_calculator.py
@@ -0,0 +1,92 @@
+import json
+from collections import defaultdict
+import numpy as np
+from typing import Dict, List, Tuple
+
+
+def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
+ """Calculate bootstrap confidence intervals for a list of scores."""
+ means = []
+ n = len(scores)
+ for _ in range(num_samples):
+ sample = np.random.choice(scores, size=n, replace=True)
+ means.append(np.mean(sample))
+
+ lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
+ upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
+ return np.mean(scores), lower_bound, upper_bound
+
+
+def load_metrics_data(json_file_path: str) -> List[Dict]:
+ """Load metrics data from JSON file."""
+ try:
+ with open(json_file_path, "r", encoding="utf-8") as f:
+ return json.load(f)
+ except FileNotFoundError:
+ raise FileNotFoundError(f"Could not find the file: {json_file_path}")
+ except json.JSONDecodeError as e:
+ raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")
+
+
+def extract_metrics_and_details(
+ data: List[Dict],
+) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
+ """Extract metrics scores and details from evaluation data."""
+ metrics_data = defaultdict(list)
+ metric_details = defaultdict(list)
+
+ for entry in data:
+ for metric, values in entry["metrics"].items():
+ score = values["score"]
+ metrics_data[metric].append(score)
+ if "reason" in values:
+ metric_details[metric].append(
+ {
+ "question": entry["question"],
+ "answer": entry["answer"],
+ "golden_answer": entry["golden_answer"],
+ "reason": values["reason"],
+ "score": score,
+ }
+ )
+
+ return metrics_data, metric_details
+
+
+def save_aggregate_metrics(
+ metrics_data: Dict[str, List[float]],
+ ci_results: Dict[str, Tuple[float, float, float]],
+ output_path: str,
+) -> None:
+ """Save aggregated metrics and confidence intervals to file."""
+ aggregate_data = {
+ metric: {
+ "scores": scores,
+ "mean": ci_results[metric][0],
+ "ci_lower": ci_results[metric][1],
+ "ci_upper": ci_results[metric][2],
+ }
+ for metric, scores in metrics_data.items()
+ }
+
+ with open(output_path, "w", encoding="utf-8") as f:
+ json.dump(aggregate_data, f, indent=4)
+
+
+def calculate_metrics_statistics(
+ json_data: str, aggregate_output_path: str
+) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
+ """Calculate metrics statistics and save aggregated results."""
+ data = load_metrics_data(json_data)
+ metrics_data, metric_details = extract_metrics_and_details(data)
+
+ # Calculate confidence intervals
+ ci_results = {}
+ for metric, scores in metrics_data.items():
+ mean_score, lower, upper = bootstrap_ci(scores)
+ ci_results[metric] = (mean_score, lower, upper)
+
+ # Save aggregate metrics
+ save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)
+
+ return metrics_data, metric_details, ci_results
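
Note: a minimal sketch of driving the new calculator on its own; the file names match the defaults added to EvalConfig below and should be treated as assumptions if the config is overridden:

from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics

# Reads the per-question metrics file, bootstraps a 95% CI per metric, and
# writes the aggregate file that create_dashboard later consumes.
metrics_data, metric_details, ci_results = calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)
for metric, (mean, lower, upper) in ci_results.items():
    print(f"{metric}: mean={mean:.3f}, 95% CI=[{lower:.3f}, {upper:.3f}]")
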
diff --git a/evals/eval_framework/eval_config.py b/evals/eval_framework/eval_config.py
index f1d65341a..1ac72a105 100644
--- a/evals/eval_framework/eval_config.py
+++ b/evals/eval_framework/eval_config.py
@@ -26,6 +26,9 @@ class EvalConfig(BaseSettings):
] # Use only 'correctness' for DirectLLM
deepeval_model: str = "gpt-4o-mini"
+ # Metrics params
+ calculate_metrics: bool = True
+
# Visualization
dashboard: bool = True
@@ -33,6 +36,7 @@ class EvalConfig(BaseSettings):
questions_path: str = "questions_output.json"
answers_path: str = "answers_output.json"
metrics_path: str = "metrics_output.json"
+ aggregate_metrics_path: str = "aggregate_metrics.json"
dashboard_path: str = "dashboard.html"
direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
@@ -49,10 +53,12 @@ class EvalConfig(BaseSettings):
"evaluating_answers": self.evaluating_answers,
"evaluation_engine": self.evaluation_engine,
"evaluation_metrics": self.evaluation_metrics,
+ "calculate_metrics": self.calculate_metrics,
"dashboard": self.dashboard,
"questions_path": self.questions_path,
"answers_path": self.answers_path,
"metrics_path": self.metrics_path,
+ "aggregate_metrics_path": self.aggregate_metrics_path,
"dashboard_path": self.dashboard_path,
"deepeval_model": self.deepeval_model,
"task_getter_type": self.task_getter_type,
diff --git a/evals/eval_framework/evaluation/run_evaluation_module.py b/evals/eval_framework/evaluation/run_evaluation_module.py
index 351d253f6..76a7c5c56 100644
--- a/evals/eval_framework/evaluation/run_evaluation_module.py
+++ b/evals/eval_framework/evaluation/run_evaluation_module.py
@@ -1,7 +1,8 @@
import logging
import json
from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
-from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard
+from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
+from evals.eval_framework.analysis.dashboard_generator import create_dashboard
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.get_relational_engine import (
get_relational_engine,
@@ -28,32 +29,56 @@ async def create_and_insert_metrics_table(questions_payload):
await session.commit()
+async def execute_evaluation(params: dict) -> None:
+ """Execute the evaluation step and save results."""
+ logging.info("Evaluation started...")
+ try:
+ with open(params["answers_path"], "r", encoding="utf-8") as f:
+ answers = json.load(f)
+ except FileNotFoundError:
+ raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
+ except json.JSONDecodeError as e:
+ raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")
+
+ logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
+ evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
+ metrics = await evaluator.execute(
+ answers=answers, evaluator_metrics=params["evaluation_metrics"]
+ )
+ with open(params["metrics_path"], "w", encoding="utf-8") as f:
+ json.dump(metrics, f, ensure_ascii=False, indent=4)
+
+ await create_and_insert_metrics_table(metrics)
+ logging.info("Evaluation completed")
+
+
async def run_evaluation(params: dict) -> None:
+ """Run each step of the evaluation pipeline based on configuration flags."""
+ # Step 1: Evaluate answers if requested
if params.get("evaluating_answers"):
- logging.info("Evaluation started...")
- try:
- with open(params["answers_path"], "r", encoding="utf-8") as f:
- answers = json.load(f)
- except FileNotFoundError:
- raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
- except json.JSONDecodeError as e:
- raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")
+ await execute_evaluation(params)
+ else:
+ logging.info("Skipping evaluation as evaluating_answers is False")
- logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
- evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
- metrics = await evaluator.execute(
- answers=answers, evaluator_metrics=params["evaluation_metrics"]
+ # Step 2: Calculate metrics if requested
+ if params.get("calculate_metrics"):
+ logging.info("Calculating metrics statistics...")
+ calculate_metrics_statistics(
+ json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
)
- with open(params["metrics_path"], "w", encoding="utf-8") as f:
- json.dump(metrics, f, ensure_ascii=False, indent=4)
-
- await create_and_insert_metrics_table(metrics)
-
- logging.info("Evaluation End...")
+ logging.info("Metrics calculation completed")
+ else:
+ logging.info("Skipping metrics calculation as calculate_metrics is False")
+ # Step 3: Generate dashboard if requested
if params.get("dashboard"):
- generate_metrics_dashboard(
- json_data=params["metrics_path"],
+ logging.info("Generating dashboard...")
+ create_dashboard(
+ metrics_path=params["metrics_path"],
+ aggregate_metrics_path=params["aggregate_metrics_path"],
output_file=params["dashboard_path"],
benchmark=params["benchmark"],
)
+ logging.info(f"Dashboard generated at {params['dashboard_path']}")
+ else:
+ logging.info("Skipping dashboard generation as dashboard is False")
diff --git a/evals/eval_framework/modal_run_eval.py b/evals/eval_framework/modal_run_eval.py
new file mode 100644
index 000000000..f04c42954
--- /dev/null
+++ b/evals/eval_framework/modal_run_eval.py
@@ -0,0 +1,116 @@
+import modal
+import os
+import json
+import asyncio
+import datetime
+import logging
+from evals.eval_framework.eval_config import EvalConfig
+from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
+from evals.eval_framework.answer_generation.run_question_answering_module import (
+ run_question_answering,
+)
+from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
+
+logger = logging.getLogger(__name__)
+
+
+def read_and_combine_metrics(eval_params: dict) -> dict:
+ """Read and combine metrics files into a single result dictionary."""
+ try:
+ with open(eval_params["metrics_path"], "r") as f:
+ metrics = json.load(f)
+ with open(eval_params["aggregate_metrics_path"], "r") as f:
+ aggregate_metrics = json.load(f)
+
+ return {
+ "task_getter_type": eval_params["task_getter_type"],
+ "number_of_samples": eval_params["number_of_samples_in_corpus"],
+ "metrics": metrics,
+ "aggregate_metrics": aggregate_metrics,
+ }
+ except (FileNotFoundError, json.JSONDecodeError) as e:
+ logger.error(f"Error reading metrics files: {e}")
+ return None
+
+
+app = modal.App("modal-run-eval")
+
+image = (
+ modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
+ .copy_local_file("pyproject.toml", "pyproject.toml")
+ .copy_local_file("poetry.lock", "poetry.lock")
+ .env(
+ {
+ "ENV": os.getenv("ENV"),
+ "LLM_API_KEY": os.getenv("LLM_API_KEY"),
+ "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
+ }
+ )
+ .poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
+ .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
+)
+
+
+@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
+async def modal_run_eval(eval_params=None):
+ """Runs evaluation pipeline and returns combined metrics results."""
+ if eval_params is None:
+ eval_params = EvalConfig().to_dict()
+
+ logger.info(f"Running evaluation with params: {eval_params}")
+
+ # Run the evaluation pipeline
+ await run_corpus_builder(eval_params)
+ await run_question_answering(eval_params)
+ await run_evaluation(eval_params)
+
+ # Early return if metrics calculation wasn't requested
+ if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
+ logger.info(
+ "Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
+ )
+ return None
+
+ return read_and_combine_metrics(eval_params)
+
+
+@app.local_entrypoint()
+async def main():
+ # List of configurations to run
+ configs = [
+ EvalConfig(
+ task_getter_type="Default",
+ number_of_samples_in_corpus=2,
+ building_corpus_from_scratch=True,
+ answering_questions=True,
+ evaluating_answers=True,
+ calculate_metrics=True,
+ dashboard=False,
+ ),
+ EvalConfig(
+ task_getter_type="Default",
+ number_of_samples_in_corpus=10,
+ building_corpus_from_scratch=True,
+ answering_questions=True,
+ evaluating_answers=True,
+ calculate_metrics=True,
+ dashboard=False,
+ ),
+ ]
+
+ # Run evaluations in parallel with different configurations
+ modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
+ results = await asyncio.gather(*modal_tasks)
+
+ # Filter out None results and save combined results
+ results = [r for r in results if r is not None]
+ if results:
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+ output_file = f"combined_results_{timestamp}.json"
+
+ with open(output_file, "w") as f:
+ json.dump(results, f, indent=2)
+
+ logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
+ else:
+ logger.info("No metrics were collected from any of the evaluation runs")