Feat/cog 1331 modal run eval (#576)

<!-- .github/pull_request_template.md -->

## Description
- Split the metrics dashboard into two modules: a calculator (statistics) and a generator (visualization); a usage sketch follows below
- Added aggregate metrics calculation as a new phase in the evaluation pipeline
- Created a Modal example that runs multiple evaluations in parallel and collects the results into a single combined output
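
A minimal usage sketch of the two new modules (module paths and signatures as introduced in this PR; the file names are the `EvalConfig` defaults):

```python
# Hedged sketch: compute per-metric statistics, then render the dashboard
# from the per-question metrics file and the aggregate file.
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard

metrics_data, metric_details, ci_results = calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)
create_dashboard(
    metrics_path="metrics_output.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="my_benchmark",  # illustrative label
)
```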
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Enhanced metrics reporting with improved visualizations, including
histogram and confidence interval plots.
- Introduced an asynchronous evaluation process that supports parallel
execution and streamlined result aggregation.
- Added new configuration options to control metrics calculation and
aggregated output storage.

- **Refactor**
- Restructured dashboard generation and evaluation workflows into a more
modular, maintainable design.
- Improved error handling and logging for better feedback during
evaluation processes.

- **Bug Fixes**
- Updated test cases to ensure accurate validation of the new dashboard
generation and metrics calculation functionalities.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
Commit bee04cad86 (parent 8874ddad2e), authored by lxobr on 2025-03-03 14:22:32 +01:00 and committed via GitHub.
9 changed files with 429 additions and 151 deletions

View file

@@ -1,104 +1,88 @@
 import unittest
-from unittest.mock import patch
 import json
 import os
-import tempfile
-from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard, bootstrap_ci
-import numpy as np
+
+from evals.eval_framework.analysis.dashboard_generator import (
+    create_distribution_plots,
+    create_ci_plot,
+    generate_details_html,
+    get_dashboard_html_template,
+    create_dashboard,
+)
 
 
-class TestGenerateMetricsDashboard(unittest.TestCase):
+class TestDashboardFunctions(unittest.TestCase):
     def setUp(self):
-        self.test_data = [
+        """Set up test data."""
+        self.metrics_data = {
+            "accuracy": [0.8, 0.85, 0.9, 0.95, 1.0],
+            "f1_score": [0.7, 0.75, 0.8, 0.85, 0.9],
+        }
+        self.ci_data = {
+            "accuracy": (0.9, 0.85, 0.95),
+            "f1_score": (0.8, 0.75, 0.85),
+        }
+        self.detail_data = [
             {
                 "question": "What is AI?",
                 "answer": "Artificial Intelligence",
                 "golden_answer": "Artificial Intelligence",
                 "metrics": {
-                    "accuracy": {"score": 0.9, "reason": "Close enough"},
-                    "relevance": {"score": 0.8},
+                    "accuracy": {"score": 1.0, "reason": "Exact match"},
+                    "f1_score": {"score": 0.9, "reason": "High similarity"},
                 },
-            },
-            {
-                "question": "What is ML?",
-                "answer": "Machine Learning",
-                "golden_answer": "Machine Learning",
-                "metrics": {
-                    "accuracy": {"score": 0.95, "reason": "Exact match"},
-                    "relevance": {"score": 0.85},
-                },
-            },
+            }
         ]
-        self.temp_json = tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8")
-        json.dump(self.test_data, self.temp_json)
-        self.temp_json.close()
-        self.output_file = "test_dashboard.html"
-
-    def tearDown(self):
-        os.remove(self.temp_json.name)
-        if os.path.exists(self.output_file):
-            os.remove(self.output_file)
-
-    def test_generate_metrics_dashboard_valid_json(self):
-        """Test if the function processes valid JSON correctly and creates an output file."""
-        result = generate_metrics_dashboard(
-            self.temp_json.name, self.output_file, benchmark="Test Benchmark"
-        )
-        self.assertTrue(os.path.exists(self.output_file))
-        self.assertEqual(result, self.output_file)
-
-        with open(self.output_file, "r", encoding="utf-8") as f:
-            html_content = f.read()
-        self.assertIn("<title>LLM Evaluation Dashboard Test Benchmark</title>", html_content)
-        self.assertIn("accuracy", html_content)
-        self.assertIn("relevance", html_content)
-
-    @patch("evals.eval_framework.metrics_dashboard.bootstrap_ci", return_value=(0.9, 0.85, 0.95))
-    def test_generate_metrics_dashboard_ci_calculation(self, mock_bootstrap_ci):
-        """Test if bootstrap_ci is called with the correct parameters."""
-        generate_metrics_dashboard(self.temp_json.name, self.output_file)
-        mock_bootstrap_ci.assert_any_call([0.9, 0.95])  # For accuracy
-        mock_bootstrap_ci.assert_any_call([0.8, 0.85])  # For relevance
-
-    @patch("plotly.graph_objects.Figure.to_html", return_value="<div>Plotly Chart</div>")
-    def test_generate_metrics_dashboard_plotly_charts(self, mock_to_html):
-        """Test if Plotly figures are generated correctly."""
-        generate_metrics_dashboard(self.temp_json.name, self.output_file)
-        self.assertGreaterEqual(mock_to_html.call_count, 3)  # 2 metrics + CI chart
-        with open(self.output_file, "r", encoding="utf-8") as f:
-            file_content = f.read()
-        self.assertIn(
-            "<div>Plotly Chart</div>",
-            file_content,
-            "The output file does not contain the expected Plotly chart HTML.",
-        )
-
-
-class TestBootstrapCI(unittest.TestCase):
-    def test_bootstrap_ci_basic(self):
-        scores = [1, 2, 3, 4, 5]
-        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
-        self.assertAlmostEqual(mean, np.mean(scores), places=2)
-        self.assertLessEqual(lower, mean)
-        self.assertGreaterEqual(upper, mean)
-
-    def test_bootstrap_ci_single_value(self):
-        scores = [3, 3, 3, 3, 3]
-        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
-        self.assertEqual(mean, 3)
-        self.assertEqual(lower, 3)
-        self.assertEqual(upper, 3)
-
-    def test_bootstrap_ci_empty_list(self):
-        mean, lower, upper = bootstrap_ci([])
-        self.assertTrue(np.isnan(mean))
-        self.assertTrue(np.isnan(lower))
-        self.assertTrue(np.isnan(upper))
+
+    def test_generate_details_html(self):
+        """Test HTML details generation."""
+        html_output = generate_details_html(self.detail_data)
+
+        self.assertIn("<h3>accuracy Details</h3>", html_output[0])
+        self.assertIn("<th>Question</th>", html_output[1])
+        self.assertIn("Exact match", "".join(html_output))
+
+    def test_get_dashboard_html_template(self):
+        """Test full dashboard HTML generation."""
+        figures = create_distribution_plots(self.metrics_data)
+        ci_plot = create_ci_plot(self.ci_data)
+        dashboard_html = get_dashboard_html_template(
+            figures + [ci_plot], generate_details_html(self.detail_data), "Benchmark 1"
+        )
+
+        self.assertIn("<title>LLM Evaluation Dashboard Benchmark 1</title>", dashboard_html)
+        self.assertIn("<h2>Metrics Distribution</h2>", dashboard_html)
+        self.assertIn("<h2>95% confidence interval for all the metrics</h2>", dashboard_html)
+        self.assertIn("Benchmark 1", dashboard_html)
+
+    def test_create_dashboard(self):
+        """Test the full dashboard generation and file creation."""
+        metrics_path = "test_metrics.json"
+        aggregate_metrics_path = "test_aggregate.json"
+        output_file = "test_dashboard.html"
+
+        with open(metrics_path, "w") as f:
+            json.dump(self.detail_data, f)
+
+        with open(aggregate_metrics_path, "w") as f:
+            json.dump(
+                {
+                    metric: {"mean": v[0], "ci_lower": v[1], "ci_upper": v[2]}
+                    for metric, v in self.ci_data.items()
+                },
+                f,
+            )

+        output = create_dashboard(
+            metrics_path, aggregate_metrics_path, output_file, "Test Benchmark"
+        )
+
+        self.assertEqual(output, output_file)
+        self.assertTrue(os.path.exists(output_file))
+
+        os.remove(metrics_path)
+        os.remove(aggregate_metrics_path)
+        os.remove(output_file)

View file

@@ -22,6 +22,7 @@ async def test_evaluate_answers_em_f1(adapter):
             "question": "What is 2 + 2?",
             "answer": "4",
             "golden_answer": "4",
+            "retrieval_context": "2 + 2 = 4",
         }
     ]
@@ -77,6 +78,7 @@ async def test_none_values_in_answers(adapter):
             "question": None,
             "answer": None,
             "golden_answer": None,
+            "retrieval_context": None,
         }
     ]
     evaluator_metrics = ["EM", "f1"]

View file

@@ -2,6 +2,10 @@ import pytest
 from typing import Optional
 import sys
 from unittest.mock import patch, MagicMock
+import unittest
+import numpy as np
+
+from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci
 
 with patch.dict(
     sys.modules,
@@ -56,3 +60,28 @@ def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_ra
     assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
         f"F1 score failed for '{actual}' vs '{expected}'"
     )
+
+
+class TestBootstrapCI(unittest.TestCase):
+    def test_bootstrap_ci_basic(self):
+        scores = [1, 2, 3, 4, 5]
+        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
+        self.assertAlmostEqual(mean, np.mean(scores), places=2)
+        self.assertLessEqual(lower, mean)
+        self.assertGreaterEqual(upper, mean)
+
+    def test_bootstrap_ci_single_value(self):
+        scores = [3, 3, 3, 3, 3]
+        mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
+        self.assertEqual(mean, 3)
+        self.assertEqual(lower, 3)
+        self.assertEqual(upper, 3)
+
+    def test_bootstrap_ci_empty_list(self):
+        mean, lower, upper = bootstrap_ci([])
+        self.assertTrue(np.isnan(mean))
+        self.assertTrue(np.isnan(lower))
+        self.assertTrue(np.isnan(upper))

View file

@@ -1,50 +1,12 @@
 import json
-from collections import defaultdict
 import plotly.graph_objects as go
-import numpy as np
+from typing import Dict, List, Tuple
+from collections import defaultdict
 
 
-def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
-    means = []
-    n = len(scores)
-    for _ in range(num_samples):
-        sample = np.random.choice(scores, size=n, replace=True)
-        means.append(np.mean(sample))
-    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
-    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
-    return np.mean(scores), lower_bound, upper_bound
-
-
-def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", benchmark=""):
-    try:
-        with open(json_data, "r", encoding="utf-8") as f:
-            data = json.load(f)
-    except FileNotFoundError:
-        raise FileNotFoundError(f"Could not find the file: {json_data}")
-    except json.JSONDecodeError as e:
-        raise ValueError(f"Error decoding JSON from {json_data}: {e}")
-
-    metrics_data = defaultdict(list)
-    metric_details = defaultdict(list)
-    for entry in data:
-        for metric, values in entry["metrics"].items():
-            score = values["score"]
-            metrics_data[metric].append(score)
-            if "reason" in values:
-                metric_details[metric].append(
-                    {
-                        "question": entry["question"],
-                        "answer": entry["answer"],
-                        "golden_answer": entry["golden_answer"],
-                        "reason": values["reason"],
-                        "score": score,
-                    }
-                )
-
+def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
+    """Create distribution histogram plots for each metric."""
     figures = []
     for metric, scores in metrics_data.items():
         fig = go.Figure()
         fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
@@ -57,13 +19,11 @@
             template="seaborn",
         )
         figures.append(fig.to_html(full_html=False))
+    return figures
 
-    ci_results = {}
-    for metric, scores in metrics_data.items():
-        mean_score, lower, upper = bootstrap_ci(scores)
-        ci_results[metric] = (mean_score, lower, upper)
 
-    # Bar chart with confidence intervals
+def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
+    """Create confidence interval bar plot."""
     fig = go.Figure()
     for metric, (mean_score, lower, upper) in ci_results.items():
         fig.add_trace(
@@ -86,9 +46,29 @@
         yaxis_title="Score",
         template="seaborn",
     )
-    figures.append(fig.to_html(full_html=False))
+    return fig.to_html(full_html=False)
+
 
+def generate_details_html(metrics_data: List[Dict]) -> List[str]:
+    """Generate HTML for detailed metric information."""
     details_html = []
+    metric_details = {}
+
+    # Organize metrics by type
+    for entry in metrics_data:
+        for metric, values in entry["metrics"].items():
+            if metric not in metric_details:
+                metric_details[metric] = []
+            metric_details[metric].append(
+                {
+                    "question": entry["question"],
+                    "answer": entry["answer"],
+                    "golden_answer": entry["golden_answer"],
+                    "reason": values.get("reason", ""),
+                    "score": values["score"],
+                }
+            )
+
     for metric, details in metric_details.items():
         details_html.append(f"<h3>{metric} Details</h3>")
         details_html.append("""
@@ -112,8 +92,14 @@
                 f"</tr>"
             )
         details_html.append("</table>")
+    return details_html
 
-    html_template = f"""
+
+def get_dashboard_html_template(
+    figures: List[str], details_html: List[str], benchmark: str = ""
+) -> str:
+    """Generate the complete HTML dashboard template."""
+    return f"""
     <!DOCTYPE html>
     <html>
     <head>
@@ -132,7 +118,7 @@
         <h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>
 
         <h2>Metrics Distribution</h2>
-        {"".join([f'<div class="chart">{fig}</div>' for fig in figures[: len(metrics_data)]])}
+        {"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}
 
         <h2>95% confidence interval for all the metrics</h2>
         <div class="chart">{figures[-1]}</div>
@@ -143,6 +129,44 @@
     </html>
     """
 
+
+def create_dashboard(
+    metrics_path: str,
+    aggregate_metrics_path: str,
+    output_file: str = "dashboard_with_ci.html",
+    benchmark: str = "",
+) -> str:
+    """Create and save the dashboard with all visualizations."""
+    # Read metrics files
+    with open(metrics_path, "r") as f:
+        metrics_data = json.load(f)
+    with open(aggregate_metrics_path, "r") as f:
+        aggregate_data = json.load(f)
+
+    # Extract data for visualizations
+    metrics_by_type = defaultdict(list)
+    for entry in metrics_data:
+        for metric, values in entry["metrics"].items():
+            metrics_by_type[metric].append(values["score"])
+
+    # Generate visualizations
+    distribution_figures = create_distribution_plots(metrics_by_type)
+    ci_plot = create_ci_plot(
+        {
+            metric: (data["mean"], data["ci_lower"], data["ci_upper"])
+            for metric, data in aggregate_data.items()
+        }
+    )
+
+    # Combine all figures
+    figures = distribution_figures + [ci_plot]
+
+    # Generate HTML components
+    details_html = generate_details_html(metrics_data)
+    dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)
+
+    # Write to file
     with open(output_file, "w", encoding="utf-8") as f:
-        f.write(html_template)
+        f.write(dashboard_html)
     return output_file
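
For reference, the same pieces can be assembled by hand, mirroring what the updated test above does. A minimal sketch with illustrative sample data, assuming the signatures shown in this diff:

```python
# Hedged sketch: build each dashboard component individually.
from evals.eval_framework.analysis.dashboard_generator import (
    create_distribution_plots,
    create_ci_plot,
    generate_details_html,
    get_dashboard_html_template,
)

figures = create_distribution_plots({"accuracy": [0.8, 0.9, 1.0]})  # list of histogram HTML snippets
ci_plot = create_ci_plot({"accuracy": (0.9, 0.85, 0.95)})  # (mean, ci_lower, ci_upper) per metric
details = generate_details_html(
    [
        {
            "question": "What is AI?",
            "answer": "Artificial Intelligence",
            "golden_answer": "Artificial Intelligence",
            "metrics": {"accuracy": {"score": 1.0, "reason": "Exact match"}},
        }
    ]
)
html = get_dashboard_html_template(figures + [ci_plot], details, "Benchmark 1")
```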

View file

@@ -0,0 +1,92 @@
import json
from collections import defaultdict
import numpy as np
from typing import Dict, List, Tuple
def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
"""Calculate bootstrap confidence intervals for a list of scores."""
means = []
n = len(scores)
for _ in range(num_samples):
sample = np.random.choice(scores, size=n, replace=True)
means.append(np.mean(sample))
lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
return np.mean(scores), lower_bound, upper_bound
def load_metrics_data(json_file_path: str) -> List[Dict]:
"""Load metrics data from JSON file."""
try:
with open(json_file_path, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {json_file_path}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")
def extract_metrics_and_details(
data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
"""Extract metrics scores and details from evaluation data."""
metrics_data = defaultdict(list)
metric_details = defaultdict(list)
for entry in data:
for metric, values in entry["metrics"].items():
score = values["score"]
metrics_data[metric].append(score)
if "reason" in values:
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values["reason"],
"score": score,
}
)
return metrics_data, metric_details
def save_aggregate_metrics(
metrics_data: Dict[str, List[float]],
ci_results: Dict[str, Tuple[float, float, float]],
output_path: str,
) -> None:
"""Save aggregated metrics and confidence intervals to file."""
aggregate_data = {
metric: {
"scores": scores,
"mean": ci_results[metric][0],
"ci_lower": ci_results[metric][1],
"ci_upper": ci_results[metric][2],
}
for metric, scores in metrics_data.items()
}
with open(output_path, "w", encoding="utf-8") as f:
json.dump(aggregate_data, f, indent=4)
def calculate_metrics_statistics(
json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
"""Calculate metrics statistics and save aggregated results."""
data = load_metrics_data(json_data)
metrics_data, metric_details = extract_metrics_and_details(data)
# Calculate confidence intervals
ci_results = {}
for metric, scores in metrics_data.items():
mean_score, lower, upper = bootstrap_ci(scores)
ci_results[metric] = (mean_score, lower, upper)
# Save aggregate metrics
save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)
return metrics_data, metric_details, ci_results
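
A small illustration of the central helper above; the scores are made-up and `num_samples` is reduced for speed:

```python
# Hedged example: bootstrap_ci resamples the scores with replacement and
# returns (mean, ci_lower, ci_upper) for the requested confidence level.
from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci

mean, lower, upper = bootstrap_ci([0.8, 0.85, 0.9, 0.95, 1.0], num_samples=1000)
print(f"accuracy: mean={mean:.3f}, 95% CI=({lower:.3f}, {upper:.3f})")
```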

View file

@@ -26,6 +26,9 @@ class EvalConfig(BaseSettings):
     ]  # Use only 'correctness' for DirectLLM
     deepeval_model: str = "gpt-4o-mini"
 
+    # Metrics params
+    calculate_metrics: bool = True
+
     # Visualization
     dashboard: bool = True
 
@@ -33,6 +36,7 @@ class EvalConfig(BaseSettings):
     questions_path: str = "questions_output.json"
     answers_path: str = "answers_output.json"
     metrics_path: str = "metrics_output.json"
+    aggregate_metrics_path: str = "aggregate_metrics.json"
     dashboard_path: str = "dashboard.html"
     direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
     direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
@@ -49,10 +53,12 @@ class EvalConfig(BaseSettings):
             "evaluating_answers": self.evaluating_answers,
             "evaluation_engine": self.evaluation_engine,
             "evaluation_metrics": self.evaluation_metrics,
+            "calculate_metrics": self.calculate_metrics,
             "dashboard": self.dashboard,
             "questions_path": self.questions_path,
             "answers_path": self.answers_path,
             "metrics_path": self.metrics_path,
+            "aggregate_metrics_path": self.aggregate_metrics_path,
             "dashboard_path": self.dashboard_path,
             "deepeval_model": self.deepeval_model,
             "task_getter_type": self.task_getter_type,

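The new flags ride along with the existing config flow. A hedged sketch, assuming `to_dict()` as used by the Modal example further down:

```python
# Hedged sketch: enable the aggregate-metrics phase and point it at a custom file.
from evals.eval_framework.eval_config import EvalConfig

params = EvalConfig(
    calculate_metrics=True,
    aggregate_metrics_path="aggregate_metrics.json",
    dashboard=True,
).to_dict()  # the dict consumed by run_evaluation(params)
```
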
View file

@@ -1,7 +1,8 @@
 import logging
 import json
 from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
-from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard
+from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
+from evals.eval_framework.analysis.dashboard_generator import create_dashboard
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.infrastructure.databases.relational.get_relational_engine import (
     get_relational_engine,
@@ -28,32 +29,56 @@ async def create_and_insert_metrics_table(questions_payload):
         await session.commit()
 
 
+async def execute_evaluation(params: dict) -> None:
+    """Execute the evaluation step and save results."""
+    logging.info("Evaluation started...")
+    try:
+        with open(params["answers_path"], "r", encoding="utf-8") as f:
+            answers = json.load(f)
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")
+
+    logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
+    evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
+    metrics = await evaluator.execute(
+        answers=answers, evaluator_metrics=params["evaluation_metrics"]
+    )
+    with open(params["metrics_path"], "w", encoding="utf-8") as f:
+        json.dump(metrics, f, ensure_ascii=False, indent=4)
+
+    await create_and_insert_metrics_table(metrics)
+    logging.info("Evaluation completed")
+
+
 async def run_evaluation(params: dict) -> None:
+    """Run each step of the evaluation pipeline based on configuration flags."""
+    # Step 1: Evaluate answers if requested
     if params.get("evaluating_answers"):
-        logging.info("Evaluation started...")
-        try:
-            with open(params["answers_path"], "r", encoding="utf-8") as f:
-                answers = json.load(f)
-        except FileNotFoundError:
-            raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")
-
-        logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
-        evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
-        metrics = await evaluator.execute(
-            answers=answers, evaluator_metrics=params["evaluation_metrics"]
-        )
-        with open(params["metrics_path"], "w", encoding="utf-8") as f:
-            json.dump(metrics, f, ensure_ascii=False, indent=4)
-
-        await create_and_insert_metrics_table(metrics)
-        logging.info("Evaluation End...")
+        await execute_evaluation(params)
+    else:
+        logging.info("Skipping evaluation as evaluating_answers is False")
 
+    # Step 2: Calculate metrics if requested
+    if params.get("calculate_metrics"):
+        logging.info("Calculating metrics statistics...")
+        calculate_metrics_statistics(
+            json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
+        )
+        logging.info("Metrics calculation completed")
+    else:
+        logging.info("Skipping metrics calculation as calculate_metrics is False")
+
+    # Step 3: Generate dashboard if requested
     if params.get("dashboard"):
-        generate_metrics_dashboard(
-            json_data=params["metrics_path"],
+        logging.info("Generating dashboard...")
+        create_dashboard(
+            metrics_path=params["metrics_path"],
+            aggregate_metrics_path=params["aggregate_metrics_path"],
             output_file=params["dashboard_path"],
             benchmark=params["benchmark"],
         )
+        logging.info(f"Dashboard generated at {params['dashboard_path']}")
+    else:
+        logging.info("Skipping dashboard generation as dashboard is False")
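
Putting the three phases together, a hedged usage sketch that assumes the answers file from the question-answering step already exists:

```python
# Hedged sketch: evaluation, metrics aggregation, and dashboard in one call.
import asyncio

from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation

params = EvalConfig(
    evaluating_answers=True,
    calculate_metrics=True,
    dashboard=True,
).to_dict()
asyncio.run(run_evaluation(params))
```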

View file

@@ -0,0 +1,116 @@
import modal
import os
import json
import asyncio
import datetime
import logging
from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from evals.eval_framework.answer_generation.run_question_answering_module import (
run_question_answering,
)
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
logger = logging.getLogger(__name__)
def read_and_combine_metrics(eval_params: dict) -> dict:
"""Read and combine metrics files into a single result dictionary."""
try:
with open(eval_params["metrics_path"], "r") as f:
metrics = json.load(f)
with open(eval_params["aggregate_metrics_path"], "r") as f:
aggregate_metrics = json.load(f)
return {
"task_getter_type": eval_params["task_getter_type"],
"number_of_samples": eval_params["number_of_samples_in_corpus"],
"metrics": metrics,
"aggregate_metrics": aggregate_metrics,
}
except (FileNotFoundError, json.JSONDecodeError) as e:
logger.error(f"Error reading metrics files: {e}")
return None
app = modal.App("modal-run-eval")
image = (
modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
.copy_local_file("pyproject.toml", "pyproject.toml")
.copy_local_file("poetry.lock", "poetry.lock")
.env(
{
"ENV": os.getenv("ENV"),
"LLM_API_KEY": os.getenv("LLM_API_KEY"),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
}
)
.poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
.pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)
@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
async def modal_run_eval(eval_params=None):
"""Runs evaluation pipeline and returns combined metrics results."""
if eval_params is None:
eval_params = EvalConfig().to_dict()
logger.info(f"Running evaluation with params: {eval_params}")
# Run the evaluation pipeline
await run_corpus_builder(eval_params)
await run_question_answering(eval_params)
await run_evaluation(eval_params)
# Early return if metrics calculation wasn't requested
if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
logger.info(
"Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
)
return None
return read_and_combine_metrics(eval_params)
@app.local_entrypoint()
async def main():
# List of configurations to run
configs = [
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=2,
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=10,
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
),
]
# Run evaluations in parallel with different configurations
modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
results = await asyncio.gather(*modal_tasks)
# Filter out None results and save combined results
results = [r for r in results if r is not None]
if results:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"combined_results_{timestamp}.json"
with open(output_file, "w") as f:
json.dump(results, f, indent=2)
logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
else:
logger.info("No metrics were collected from any of the evaluation runs")
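
Assuming the Modal CLI is installed and authenticated, the entrypoint above would typically be launched with `modal run <path/to/this/script.py>` (path illustrative): `main()` runs locally, while each `modal_run_eval.remote.aio(...)` call executes in its own remote container, so the two configurations run in parallel.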