Feat/cog 1331 modal run eval (#576)

<!-- .github/pull_request_template.md -->

## Description
- Split the metrics dashboard into two modules: a calculator (statistics) and a generator (visualization); see the usage sketch below
- Added aggregate metrics as a new phase of the evaluation pipeline
- Created a Modal example that runs multiple evaluations in parallel and collects the results into a single combined output
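
A minimal usage sketch of the new split (module paths are from this PR; the file names and benchmark label below are illustrative):

```python
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard

# Compute per-metric score lists and bootstrap confidence intervals, then persist them.
calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)

# Render the HTML dashboard from the raw metrics file and the aggregated statistics.
create_dashboard(
    metrics_path="metrics_output.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="HotPotQA",  # illustrative benchmark label
)
```
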
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Enhanced metrics reporting with improved visualizations, including
histogram and confidence interval plots.
- Introduced an asynchronous evaluation process that supports parallel
execution and streamlined result aggregation.
- Added new configuration options to control metrics calculation and
aggregated output storage.

- **Refactor**
- Restructured dashboard generation and evaluation workflows into a more
modular, maintainable design.
- Improved error handling and logging for better feedback during
evaluation processes.

- **Bug Fixes**
- Updated test cases to ensure accurate validation of the new dashboard
generation and metrics calculation functionalities.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
lxobr 2025-03-03 14:22:32 +01:00 committed by GitHub
parent 8874ddad2e
commit bee04cad86
9 changed files with 429 additions and 151 deletions

View file

@ -1,104 +1,88 @@
import unittest
import json
import os

from evals.eval_framework.analysis.dashboard_generator import (
    create_distribution_plots,
    create_ci_plot,
    generate_details_html,
    get_dashboard_html_template,
    create_dashboard,
)


class TestDashboardFunctions(unittest.TestCase):
    def setUp(self):
        """Set up test data."""
        self.metrics_data = {
            "accuracy": [0.8, 0.85, 0.9, 0.95, 1.0],
            "f1_score": [0.7, 0.75, 0.8, 0.85, 0.9],
        }
        self.ci_data = {
            "accuracy": (0.9, 0.85, 0.95),
            "f1_score": (0.8, 0.75, 0.85),
        }
        self.detail_data = [
            {
                "question": "What is AI?",
                "answer": "Artificial Intelligence",
                "golden_answer": "Artificial Intelligence",
                "metrics": {
                    "accuracy": {"score": 1.0, "reason": "Exact match"},
                    "f1_score": {"score": 0.9, "reason": "High similarity"},
                },
            }
        ]

    def test_generate_details_html(self):
        """Test HTML details generation."""
        html_output = generate_details_html(self.detail_data)

        self.assertIn("<h3>accuracy Details</h3>", html_output[0])
        self.assertIn("<th>Question</th>", html_output[1])
        self.assertIn("Exact match", "".join(html_output))

    def test_get_dashboard_html_template(self):
        """Test full dashboard HTML generation."""
        figures = create_distribution_plots(self.metrics_data)
        ci_plot = create_ci_plot(self.ci_data)
        dashboard_html = get_dashboard_html_template(
            figures + [ci_plot], generate_details_html(self.detail_data), "Benchmark 1"
        )

        self.assertIn("<title>LLM Evaluation Dashboard Benchmark 1</title>", dashboard_html)
        self.assertIn("<h2>Metrics Distribution</h2>", dashboard_html)
        self.assertIn("<h2>95% confidence interval for all the metrics</h2>", dashboard_html)
        self.assertIn("Benchmark 1", dashboard_html)

    def test_create_dashboard(self):
        """Test the full dashboard generation and file creation."""
        metrics_path = "test_metrics.json"
        aggregate_metrics_path = "test_aggregate.json"
        output_file = "test_dashboard.html"

        with open(metrics_path, "w") as f:
            json.dump(self.detail_data, f)

        with open(aggregate_metrics_path, "w") as f:
            json.dump(
                {
                    metric: {"mean": v[0], "ci_lower": v[1], "ci_upper": v[2]}
                    for metric, v in self.ci_data.items()
                },
                f,
            )

        output = create_dashboard(
            metrics_path, aggregate_metrics_path, output_file, "Test Benchmark"
        )

        self.assertEqual(output, output_file)
        self.assertTrue(os.path.exists(output_file))

        os.remove(metrics_path)
        os.remove(aggregate_metrics_path)
        os.remove(output_file)

View file

@ -22,6 +22,7 @@ async def test_evaluate_answers_em_f1(adapter):
"question": "What is 2 + 2?",
"answer": "4",
"golden_answer": "4",
"retrieval_context": "2 + 2 = 4",
}
]
@ -77,6 +78,7 @@ async def test_none_values_in_answers(adapter):
"question": None,
"answer": None,
"golden_answer": None,
"retrieval_context": None,
}
]
evaluator_metrics = ["EM", "f1"]

View file

@ -2,6 +2,10 @@ import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock
import unittest
import numpy as np
from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci
with patch.dict(
sys.modules,
@ -56,3 +60,28 @@ def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_ra
assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
f"F1 score failed for '{actual}' vs '{expected}'"
)
class TestBootstrapCI(unittest.TestCase):
def test_bootstrap_ci_basic(self):
scores = [1, 2, 3, 4, 5]
mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
self.assertAlmostEqual(mean, np.mean(scores), places=2)
self.assertLessEqual(lower, mean)
self.assertGreaterEqual(upper, mean)
def test_bootstrap_ci_single_value(self):
scores = [3, 3, 3, 3, 3]
mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95)
self.assertEqual(mean, 3)
self.assertEqual(lower, 3)
self.assertEqual(upper, 3)
def test_bootstrap_ci_empty_list(self):
mean, lower, upper = bootstrap_ci([])
self.assertTrue(np.isnan(mean))
self.assertTrue(np.isnan(lower))
self.assertTrue(np.isnan(upper))

View file

@ -1,50 +1,12 @@
import json
from collections import defaultdict
import plotly.graph_objects as go
from typing import Dict, List, Tuple


def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
    """Create distribution histogram plots for each metric."""
    figures = []
    for metric, scores in metrics_data.items():
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
@ -57,13 +19,11 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
            template="seaborn",
        )
        figures.append(fig.to_html(full_html=False))
    return figures


def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
    """Create confidence interval bar plot."""
    fig = go.Figure()
    for metric, (mean_score, lower, upper) in ci_results.items():
        fig.add_trace(
@ -86,9 +46,29 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
        yaxis_title="Score",
        template="seaborn",
    )
    return fig.to_html(full_html=False)


def generate_details_html(metrics_data: List[Dict]) -> List[str]:
    """Generate HTML for detailed metric information."""
    details_html = []
    metric_details = {}

    # Organize metrics by type
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            if metric not in metric_details:
                metric_details[metric] = []
            metric_details[metric].append(
                {
                    "question": entry["question"],
                    "answer": entry["answer"],
                    "golden_answer": entry["golden_answer"],
                    "reason": values.get("reason", ""),
                    "score": values["score"],
                }
            )

    for metric, details in metric_details.items():
        details_html.append(f"<h3>{metric} Details</h3>")
        details_html.append("""
@ -112,8 +92,14 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
                f"</tr>"
            )
        details_html.append("</table>")
    return details_html


def get_dashboard_html_template(
    figures: List[str], details_html: List[str], benchmark: str = ""
) -> str:
    """Generate the complete HTML dashboard template."""
    return f"""
    <!DOCTYPE html>
    <html>
    <head>
@ -132,7 +118,7 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
        <h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>

        <h2>Metrics Distribution</h2>
        {"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}

        <h2>95% confidence interval for all the metrics</h2>
        <div class="chart">{figures[-1]}</div>
@ -143,6 +129,44 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
    </html>
    """


def create_dashboard(
    metrics_path: str,
    aggregate_metrics_path: str,
    output_file: str = "dashboard_with_ci.html",
    benchmark: str = "",
) -> str:
    """Create and save the dashboard with all visualizations."""
    # Read metrics files
    with open(metrics_path, "r") as f:
        metrics_data = json.load(f)
    with open(aggregate_metrics_path, "r") as f:
        aggregate_data = json.load(f)

    # Extract data for visualizations
    metrics_by_type = defaultdict(list)
    for entry in metrics_data:
        for metric, values in entry["metrics"].items():
            metrics_by_type[metric].append(values["score"])

    # Generate visualizations
    distribution_figures = create_distribution_plots(metrics_by_type)
    ci_plot = create_ci_plot(
        {
            metric: (data["mean"], data["ci_lower"], data["ci_upper"])
            for metric, data in aggregate_data.items()
        }
    )

    # Combine all figures
    figures = distribution_figures + [ci_plot]

    # Generate HTML components
    details_html = generate_details_html(metrics_data)
    dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)

    # Write to file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(dashboard_html)
    return output_file
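
For reference, create_dashboard expects the aggregate metrics file written by the calculator (see save_aggregate_metrics below) to map each metric name to its summary statistics; a sketch with illustrative values:

{
    "accuracy": {"scores": [0.8, 0.9, 1.0], "mean": 0.9, "ci_lower": 0.85, "ci_upper": 0.95},
    "f1_score": {"scores": [0.7, 0.8, 0.9], "mean": 0.8, "ci_lower": 0.75, "ci_upper": 0.85}
}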

View file

@ -0,0 +1,92 @@
import json
from collections import defaultdict
import numpy as np
from typing import Dict, List, Tuple
def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
"""Calculate bootstrap confidence intervals for a list of scores."""
means = []
n = len(scores)
for _ in range(num_samples):
sample = np.random.choice(scores, size=n, replace=True)
means.append(np.mean(sample))
lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
return np.mean(scores), lower_bound, upper_bound
def load_metrics_data(json_file_path: str) -> List[Dict]:
"""Load metrics data from JSON file."""
try:
with open(json_file_path, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {json_file_path}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")
def extract_metrics_and_details(
data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
"""Extract metrics scores and details from evaluation data."""
metrics_data = defaultdict(list)
metric_details = defaultdict(list)
for entry in data:
for metric, values in entry["metrics"].items():
score = values["score"]
metrics_data[metric].append(score)
if "reason" in values:
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values["reason"],
"score": score,
}
)
return metrics_data, metric_details
def save_aggregate_metrics(
metrics_data: Dict[str, List[float]],
ci_results: Dict[str, Tuple[float, float, float]],
output_path: str,
) -> None:
"""Save aggregated metrics and confidence intervals to file."""
aggregate_data = {
metric: {
"scores": scores,
"mean": ci_results[metric][0],
"ci_lower": ci_results[metric][1],
"ci_upper": ci_results[metric][2],
}
for metric, scores in metrics_data.items()
}
with open(output_path, "w", encoding="utf-8") as f:
json.dump(aggregate_data, f, indent=4)
def calculate_metrics_statistics(
json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
"""Calculate metrics statistics and save aggregated results."""
data = load_metrics_data(json_data)
metrics_data, metric_details = extract_metrics_and_details(data)
# Calculate confidence intervals
ci_results = {}
for metric, scores in metrics_data.items():
mean_score, lower, upper = bootstrap_ci(scores)
ci_results[metric] = (mean_score, lower, upper)
# Save aggregate metrics
save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)
return metrics_data, metric_details, ci_results
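
A quick sanity check on bootstrap_ci as defined above: with confidence_level=0.95 the bounds are the 2.5th and 97.5th percentiles of the resampled means, since (1 - 0.95) / 2 * 100 = 2.5 and (1 + 0.95) / 2 * 100 = 97.5. A minimal usage sketch (scores are illustrative; the bounds vary slightly per run because resampling is random):

from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci

mean, lower, upper = bootstrap_ci([0.8, 0.85, 0.9, 0.95, 1.0], num_samples=1000)
# mean is the plain average of the scores; lower <= mean <= upper for the 95% interval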

View file

@ -26,6 +26,9 @@ class EvalConfig(BaseSettings):
] # Use only 'correctness' for DirectLLM
deepeval_model: str = "gpt-4o-mini"
# Metrics params
calculate_metrics: bool = True
# Visualization
dashboard: bool = True
@ -33,6 +36,7 @@ class EvalConfig(BaseSettings):
questions_path: str = "questions_output.json"
answers_path: str = "answers_output.json"
metrics_path: str = "metrics_output.json"
aggregate_metrics_path: str = "aggregate_metrics.json"
dashboard_path: str = "dashboard.html"
direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
@ -49,10 +53,12 @@ class EvalConfig(BaseSettings):
"evaluating_answers": self.evaluating_answers,
"evaluation_engine": self.evaluation_engine,
"evaluation_metrics": self.evaluation_metrics,
"calculate_metrics": self.calculate_metrics,
"dashboard": self.dashboard,
"questions_path": self.questions_path,
"answers_path": self.answers_path,
"metrics_path": self.metrics_path,
"aggregate_metrics_path": self.aggregate_metrics_path,
"dashboard_path": self.dashboard_path,
"deepeval_model": self.deepeval_model,
"task_getter_type": self.task_getter_type,

View file

@ -1,7 +1,8 @@
import logging
import json
from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.get_relational_engine import (
get_relational_engine,
@ -28,8 +29,8 @@ async def create_and_insert_metrics_table(questions_payload):
await session.commit()
async def execute_evaluation(params: dict) -> None:
"""Execute the evaluation step and save results."""
logging.info("Evaluation started...")
try:
with open(params["answers_path"], "r", encoding="utf-8") as f:
@ -48,12 +49,36 @@ async def run_evaluation(params: dict) -> None:
json.dump(metrics, f, ensure_ascii=False, indent=4)
await create_and_insert_metrics_table(metrics)
logging.info("Evaluation completed")
logging.info("Evaluation End...")
async def run_evaluation(params: dict) -> None:
"""Run each step of the evaluation pipeline based on configuration flags."""
# Step 1: Evaluate answers if requested
if params.get("evaluating_answers"):
await execute_evaluation(params)
else:
logging.info("Skipping evaluation as evaluating_answers is False")
# Step 2: Calculate metrics if requested
if params.get("calculate_metrics"):
logging.info("Calculating metrics statistics...")
calculate_metrics_statistics(
json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
)
logging.info("Metrics calculation completed")
else:
logging.info("Skipping metrics calculation as calculate_metrics is False")
# Step 3: Generate dashboard if requested
if params.get("dashboard"):
logging.info("Generating dashboard...")
create_dashboard(
metrics_path=params["metrics_path"],
aggregate_metrics_path=params["aggregate_metrics_path"],
output_file=params["dashboard_path"],
benchmark=params["benchmark"],
)
logging.info(f"Dashboard generated at {params['dashboard_path']}")
else:
logging.info("Skipping dashboard generation as dashboard is False")

View file

@ -0,0 +1,116 @@
import modal
import os
import json
import asyncio
import datetime
import logging
from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from evals.eval_framework.answer_generation.run_question_answering_module import (
run_question_answering,
)
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
logger = logging.getLogger(__name__)
def read_and_combine_metrics(eval_params: dict) -> dict:
"""Read and combine metrics files into a single result dictionary."""
try:
with open(eval_params["metrics_path"], "r") as f:
metrics = json.load(f)
with open(eval_params["aggregate_metrics_path"], "r") as f:
aggregate_metrics = json.load(f)
return {
"task_getter_type": eval_params["task_getter_type"],
"number_of_samples": eval_params["number_of_samples_in_corpus"],
"metrics": metrics,
"aggregate_metrics": aggregate_metrics,
}
except (FileNotFoundError, json.JSONDecodeError) as e:
logger.error(f"Error reading metrics files: {e}")
return None
app = modal.App("modal-run-eval")
image = (
modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
.copy_local_file("pyproject.toml", "pyproject.toml")
.copy_local_file("poetry.lock", "poetry.lock")
.env(
{
"ENV": os.getenv("ENV"),
"LLM_API_KEY": os.getenv("LLM_API_KEY"),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
}
)
.poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
.pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)
@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
async def modal_run_eval(eval_params=None):
"""Runs evaluation pipeline and returns combined metrics results."""
if eval_params is None:
eval_params = EvalConfig().to_dict()
logger.info(f"Running evaluation with params: {eval_params}")
# Run the evaluation pipeline
await run_corpus_builder(eval_params)
await run_question_answering(eval_params)
await run_evaluation(eval_params)
# Early return if metrics calculation wasn't requested
if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
logger.info(
"Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
)
return None
return read_and_combine_metrics(eval_params)
@app.local_entrypoint()
async def main():
# List of configurations to run
configs = [
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=2,
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=10,
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
),
]
# Run evaluations in parallel with different configurations
modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
results = await asyncio.gather(*modal_tasks)
# Filter out None results and save combined results
results = [r for r in results if r is not None]
if results:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"combined_results_{timestamp}.json"
with open(output_file, "w") as f:
json.dump(results, f, indent=2)
logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
else:
logger.info("No metrics were collected from any of the evaluation runs")