diff --git a/cognee/tests/unit/eval_framework/dashboard_test.py b/cognee/tests/unit/eval_framework/dashboard_test.py index 3fda78189..fe4424b9e 100644 --- a/cognee/tests/unit/eval_framework/dashboard_test.py +++ b/cognee/tests/unit/eval_framework/dashboard_test.py @@ -1,104 +1,88 @@ import unittest -from unittest.mock import patch import json import os -import tempfile -from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard, bootstrap_ci -import numpy as np -class TestGenerateMetricsDashboard(unittest.TestCase): +from evals.eval_framework.analysis.dashboard_generator import ( + create_distribution_plots, + create_ci_plot, + generate_details_html, + get_dashboard_html_template, + create_dashboard, +) + + +class TestDashboardFunctions(unittest.TestCase): def setUp(self): - self.test_data = [ + """Set up test data.""" + self.metrics_data = { + "accuracy": [0.8, 0.85, 0.9, 0.95, 1.0], + "f1_score": [0.7, 0.75, 0.8, 0.85, 0.9], + } + + self.ci_data = { + "accuracy": (0.9, 0.85, 0.95), + "f1_score": (0.8, 0.75, 0.85), + } + + self.detail_data = [ { "question": "What is AI?", "answer": "Artificial Intelligence", "golden_answer": "Artificial Intelligence", "metrics": { - "accuracy": {"score": 0.9, "reason": "Close enough"}, - "relevance": {"score": 0.8}, + "accuracy": {"score": 1.0, "reason": "Exact match"}, + "f1_score": {"score": 0.9, "reason": "High similarity"}, }, - }, - { - "question": "What is ML?", - "answer": "Machine Learning", - "golden_answer": "Machine Learning", - "metrics": { - "accuracy": {"score": 0.95, "reason": "Exact match"}, - "relevance": {"score": 0.85}, - }, - }, + } ] - self.temp_json = tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8") - json.dump(self.test_data, self.temp_json) - self.temp_json.close() - self.output_file = "test_dashboard.html" + def test_generate_details_html(self): + """Test HTML details generation.""" + html_output = generate_details_html(self.detail_data) - def tearDown(self): - os.remove(self.temp_json.name) - if os.path.exists(self.output_file): - os.remove(self.output_file) + self.assertIn("

<h2>accuracy Details</h2>
", html_output[0]) + self.assertIn("Question", html_output[1]) + self.assertIn("Exact match", "".join(html_output)) - def test_generate_metrics_dashboard_valid_json(self): - """Test if the function processes valid JSON correctly and creates an output file.""" - result = generate_metrics_dashboard( - self.temp_json.name, self.output_file, benchmark="Test Benchmark" + def test_get_dashboard_html_template(self): + """Test full dashboard HTML generation.""" + figures = create_distribution_plots(self.metrics_data) + ci_plot = create_ci_plot(self.ci_data) + dashboard_html = get_dashboard_html_template( + figures + [ci_plot], generate_details_html(self.detail_data), "Benchmark 1" ) - self.assertTrue(os.path.exists(self.output_file)) - self.assertEqual(result, self.output_file) + self.assertIn("LLM Evaluation Dashboard Benchmark 1", dashboard_html) + self.assertIn("

<h2>Metrics Distribution</h2>
", dashboard_html) + self.assertIn("

<h2>95% confidence interval for all the metrics</h2>
", dashboard_html) + self.assertIn("Benchmark 1", dashboard_html) - with open(self.output_file, "r", encoding="utf-8") as f: - html_content = f.read() - self.assertIn("LLM Evaluation Dashboard Test Benchmark", html_content) - self.assertIn("accuracy", html_content) - self.assertIn("relevance", html_content) + def test_create_dashboard(self): + """Test the full dashboard generation and file creation.""" + metrics_path = "test_metrics.json" + aggregate_metrics_path = "test_aggregate.json" + output_file = "test_dashboard.html" - @patch("evals.eval_framework.metrics_dashboard.bootstrap_ci", return_value=(0.9, 0.85, 0.95)) - def test_generate_metrics_dashboard_ci_calculation(self, mock_bootstrap_ci): - """Test if bootstrap_ci is called with the correct parameters.""" - generate_metrics_dashboard(self.temp_json.name, self.output_file) + with open(metrics_path, "w") as f: + json.dump(self.detail_data, f) - mock_bootstrap_ci.assert_any_call([0.9, 0.95]) # For accuracy - mock_bootstrap_ci.assert_any_call([0.8, 0.85]) # For relevance - - @patch("plotly.graph_objects.Figure.to_html", return_value="
<div>Plotly Chart</div>
") - def test_generate_metrics_dashboard_plotly_charts(self, mock_to_html): - """Test if Plotly figures are generated correctly.""" - generate_metrics_dashboard(self.temp_json.name, self.output_file) - - self.assertGreaterEqual(mock_to_html.call_count, 3) # 2 metrics + CI chart - - with open(self.output_file, "r", encoding="utf-8") as f: - file_content = f.read() - self.assertIn( - "
<div>Plotly Chart</div>
", - file_content, - "The output file does not contain the expected Plotly chart HTML.", + with open(aggregate_metrics_path, "w") as f: + json.dump( + { + metric: {"mean": v[0], "ci_lower": v[1], "ci_upper": v[2]} + for metric, v in self.ci_data.items() + }, + f, ) + output = create_dashboard( + metrics_path, aggregate_metrics_path, output_file, "Test Benchmark" + ) -class TestBootstrapCI(unittest.TestCase): - def test_bootstrap_ci_basic(self): - scores = [1, 2, 3, 4, 5] - mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95) + self.assertEqual(output, output_file) + self.assertTrue(os.path.exists(output_file)) - self.assertAlmostEqual(mean, np.mean(scores), places=2) - self.assertLessEqual(lower, mean) - self.assertGreaterEqual(upper, mean) - - def test_bootstrap_ci_single_value(self): - scores = [3, 3, 3, 3, 3] - mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95) - - self.assertEqual(mean, 3) - self.assertEqual(lower, 3) - self.assertEqual(upper, 3) - - def test_bootstrap_ci_empty_list(self): - mean, lower, upper = bootstrap_ci([]) - - self.assertTrue(np.isnan(mean)) - self.assertTrue(np.isnan(lower)) - self.assertTrue(np.isnan(upper)) + os.remove(metrics_path) + os.remove(aggregate_metrics_path) + os.remove(output_file) diff --git a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py index d4226cc66..3b0a0a19d 100644 --- a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py +++ b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py @@ -22,6 +22,7 @@ async def test_evaluate_answers_em_f1(adapter): "question": "What is 2 + 2?", "answer": "4", "golden_answer": "4", + "retrieval_context": "2 + 2 = 4", } ] @@ -77,6 +78,7 @@ async def test_none_values_in_answers(adapter): "question": None, "answer": None, "golden_answer": None, + "retrieval_context": None, } ] evaluator_metrics = ["EM", "f1"] diff --git a/cognee/tests/unit/eval_framework/metrics_test.py b/cognee/tests/unit/eval_framework/metrics_test.py index c67e845b5..719995229 100644 --- a/cognee/tests/unit/eval_framework/metrics_test.py +++ b/cognee/tests/unit/eval_framework/metrics_test.py @@ -2,6 +2,10 @@ import pytest from typing import Optional import sys from unittest.mock import patch, MagicMock +import unittest +import numpy as np +from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci + with patch.dict( sys.modules, @@ -56,3 +60,28 @@ def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_ra assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], ( f"F1 score failed for '{actual}' vs '{expected}'" ) + + +class TestBootstrapCI(unittest.TestCase): + def test_bootstrap_ci_basic(self): + scores = [1, 2, 3, 4, 5] + mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95) + + self.assertAlmostEqual(mean, np.mean(scores), places=2) + self.assertLessEqual(lower, mean) + self.assertGreaterEqual(upper, mean) + + def test_bootstrap_ci_single_value(self): + scores = [3, 3, 3, 3, 3] + mean, lower, upper = bootstrap_ci(scores, num_samples=1000, confidence_level=0.95) + + self.assertEqual(mean, 3) + self.assertEqual(lower, 3) + self.assertEqual(upper, 3) + + def test_bootstrap_ci_empty_list(self): + mean, lower, upper = bootstrap_ci([]) + + self.assertTrue(np.isnan(mean)) + self.assertTrue(np.isnan(lower)) + self.assertTrue(np.isnan(upper)) diff --git a/evals/eval_framework/analysis/__init__.py 
b/evals/eval_framework/analysis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/evals/eval_framework/metrics_dashboard.py b/evals/eval_framework/analysis/dashboard_generator.py similarity index 57% rename from evals/eval_framework/metrics_dashboard.py rename to evals/eval_framework/analysis/dashboard_generator.py index 739f3f605..2c917740a 100644 --- a/evals/eval_framework/metrics_dashboard.py +++ b/evals/eval_framework/analysis/dashboard_generator.py @@ -1,50 +1,12 @@ import json -from collections import defaultdict import plotly.graph_objects as go -import numpy as np +from typing import Dict, List, Tuple +from collections import defaultdict -def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95): - means = [] - n = len(scores) - for _ in range(num_samples): - sample = np.random.choice(scores, size=n, replace=True) - means.append(np.mean(sample)) - - lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100) - upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100) - return np.mean(scores), lower_bound, upper_bound - - -def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", benchmark=""): - try: - with open(json_data, "r", encoding="utf-8") as f: - data = json.load(f) - except FileNotFoundError: - raise FileNotFoundError(f"Could not find the file: {json_data}") - except json.JSONDecodeError as e: - raise ValueError(f"Error decoding JSON from {json_data}: {e}") - - metrics_data = defaultdict(list) - metric_details = defaultdict(list) - - for entry in data: - for metric, values in entry["metrics"].items(): - score = values["score"] - metrics_data[metric].append(score) - if "reason" in values: - metric_details[metric].append( - { - "question": entry["question"], - "answer": entry["answer"], - "golden_answer": entry["golden_answer"], - "reason": values["reason"], - "score": score, - } - ) - +def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]: + """Create distribution histogram plots for each metric.""" figures = [] - for metric, scores in metrics_data.items(): fig = go.Figure() fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4")) @@ -57,13 +19,11 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", template="seaborn", ) figures.append(fig.to_html(full_html=False)) + return figures - ci_results = {} - for metric, scores in metrics_data.items(): - mean_score, lower, upper = bootstrap_ci(scores) - ci_results[metric] = (mean_score, lower, upper) - # Bar chart with confidence intervals +def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str: + """Create confidence interval bar plot.""" fig = go.Figure() for metric, (mean_score, lower, upper) in ci_results.items(): fig.add_trace( @@ -86,9 +46,29 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", yaxis_title="Score", template="seaborn", ) - figures.append(fig.to_html(full_html=False)) + return fig.to_html(full_html=False) + +def generate_details_html(metrics_data: List[Dict]) -> List[str]: + """Generate HTML for detailed metric information.""" details_html = [] + metric_details = {} + + # Organize metrics by type + for entry in metrics_data: + for metric, values in entry["metrics"].items(): + if metric not in metric_details: + metric_details[metric] = [] + metric_details[metric].append( + { + "question": entry["question"], + "answer": entry["answer"], + "golden_answer": entry["golden_answer"], + "reason": 
values.get("reason", ""), + "score": values["score"], + } + ) + for metric, details in metric_details.items(): details_html.append(f"

<h2>{metric} Details</h2>
") details_html.append(""" @@ -112,8 +92,14 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", f"" ) details_html.append("") + return details_html - html_template = f""" + +def get_dashboard_html_template( + figures: List[str], details_html: List[str], benchmark: str = "" +) -> str: + """Generate the complete HTML dashboard template.""" + return f""" @@ -132,7 +118,7 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",

<h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>
<h2>Metrics Distribution</h2>
- {"".join([f'<div>{fig}</div>' for fig in figures[: len(metrics_data)]])}
+ {"".join([f'<div>{fig}</div>' for fig in figures[:-1]])}
<h2>95% confidence interval for all the metrics</h2>
{figures[-1]}
@@ -143,6 +129,44 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", """ + +def create_dashboard( + metrics_path: str, + aggregate_metrics_path: str, + output_file: str = "dashboard_with_ci.html", + benchmark: str = "", +) -> str: + """Create and save the dashboard with all visualizations.""" + # Read metrics files + with open(metrics_path, "r") as f: + metrics_data = json.load(f) + with open(aggregate_metrics_path, "r") as f: + aggregate_data = json.load(f) + + # Extract data for visualizations + metrics_by_type = defaultdict(list) + for entry in metrics_data: + for metric, values in entry["metrics"].items(): + metrics_by_type[metric].append(values["score"]) + + # Generate visualizations + distribution_figures = create_distribution_plots(metrics_by_type) + ci_plot = create_ci_plot( + { + metric: (data["mean"], data["ci_lower"], data["ci_upper"]) + for metric, data in aggregate_data.items() + } + ) + + # Combine all figures + figures = distribution_figures + [ci_plot] + + # Generate HTML components + details_html = generate_details_html(metrics_data) + dashboard_html = get_dashboard_html_template(figures, details_html, benchmark) + + # Write to file with open(output_file, "w", encoding="utf-8") as f: - f.write(html_template) + f.write(dashboard_html) + return output_file diff --git a/evals/eval_framework/analysis/metrics_calculator.py b/evals/eval_framework/analysis/metrics_calculator.py new file mode 100644 index 000000000..ca1960748 --- /dev/null +++ b/evals/eval_framework/analysis/metrics_calculator.py @@ -0,0 +1,92 @@ +import json +from collections import defaultdict +import numpy as np +from typing import Dict, List, Tuple + + +def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95): + """Calculate bootstrap confidence intervals for a list of scores.""" + means = [] + n = len(scores) + for _ in range(num_samples): + sample = np.random.choice(scores, size=n, replace=True) + means.append(np.mean(sample)) + + lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100) + upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100) + return np.mean(scores), lower_bound, upper_bound + + +def load_metrics_data(json_file_path: str) -> List[Dict]: + """Load metrics data from JSON file.""" + try: + with open(json_file_path, "r", encoding="utf-8") as f: + return json.load(f) + except FileNotFoundError: + raise FileNotFoundError(f"Could not find the file: {json_file_path}") + except json.JSONDecodeError as e: + raise ValueError(f"Error decoding JSON from {json_file_path}: {e}") + + +def extract_metrics_and_details( + data: List[Dict], +) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]: + """Extract metrics scores and details from evaluation data.""" + metrics_data = defaultdict(list) + metric_details = defaultdict(list) + + for entry in data: + for metric, values in entry["metrics"].items(): + score = values["score"] + metrics_data[metric].append(score) + if "reason" in values: + metric_details[metric].append( + { + "question": entry["question"], + "answer": entry["answer"], + "golden_answer": entry["golden_answer"], + "reason": values["reason"], + "score": score, + } + ) + + return metrics_data, metric_details + + +def save_aggregate_metrics( + metrics_data: Dict[str, List[float]], + ci_results: Dict[str, Tuple[float, float, float]], + output_path: str, +) -> None: + """Save aggregated metrics and confidence intervals to file.""" + aggregate_data = { + metric: { + "scores": scores, + "mean": ci_results[metric][0], + "ci_lower": 
ci_results[metric][1], + "ci_upper": ci_results[metric][2], + } + for metric, scores in metrics_data.items() + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(aggregate_data, f, indent=4) + + +def calculate_metrics_statistics( + json_data: str, aggregate_output_path: str +) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]: + """Calculate metrics statistics and save aggregated results.""" + data = load_metrics_data(json_data) + metrics_data, metric_details = extract_metrics_and_details(data) + + # Calculate confidence intervals + ci_results = {} + for metric, scores in metrics_data.items(): + mean_score, lower, upper = bootstrap_ci(scores) + ci_results[metric] = (mean_score, lower, upper) + + # Save aggregate metrics + save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path) + + return metrics_data, metric_details, ci_results diff --git a/evals/eval_framework/eval_config.py b/evals/eval_framework/eval_config.py index f1d65341a..1ac72a105 100644 --- a/evals/eval_framework/eval_config.py +++ b/evals/eval_framework/eval_config.py @@ -26,6 +26,9 @@ class EvalConfig(BaseSettings): ] # Use only 'correctness' for DirectLLM deepeval_model: str = "gpt-4o-mini" + # Metrics params + calculate_metrics: bool = True + # Visualization dashboard: bool = True @@ -33,6 +36,7 @@ class EvalConfig(BaseSettings): questions_path: str = "questions_output.json" answers_path: str = "answers_output.json" metrics_path: str = "metrics_output.json" + aggregate_metrics_path: str = "aggregate_metrics.json" dashboard_path: str = "dashboard.html" direct_llm_system_prompt: str = "direct_llm_eval_system.txt" direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt" @@ -49,10 +53,12 @@ class EvalConfig(BaseSettings): "evaluating_answers": self.evaluating_answers, "evaluation_engine": self.evaluation_engine, "evaluation_metrics": self.evaluation_metrics, + "calculate_metrics": self.calculate_metrics, "dashboard": self.dashboard, "questions_path": self.questions_path, "answers_path": self.answers_path, "metrics_path": self.metrics_path, + "aggregate_metrics_path": self.aggregate_metrics_path, "dashboard_path": self.dashboard_path, "deepeval_model": self.deepeval_model, "task_getter_type": self.task_getter_type, diff --git a/evals/eval_framework/evaluation/run_evaluation_module.py b/evals/eval_framework/evaluation/run_evaluation_module.py index 351d253f6..76a7c5c56 100644 --- a/evals/eval_framework/evaluation/run_evaluation_module.py +++ b/evals/eval_framework/evaluation/run_evaluation_module.py @@ -1,7 +1,8 @@ import logging import json from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor -from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard +from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics +from evals.eval_framework.analysis.dashboard_generator import create_dashboard from cognee.infrastructure.files.storage import LocalStorage from cognee.infrastructure.databases.relational.get_relational_engine import ( get_relational_engine, @@ -28,32 +29,56 @@ async def create_and_insert_metrics_table(questions_payload): await session.commit() +async def execute_evaluation(params: dict) -> None: + """Execute the evaluation step and save results.""" + logging.info("Evaluation started...") + try: + with open(params["answers_path"], "r", encoding="utf-8") as f: + answers = json.load(f) + except FileNotFoundError: + raise FileNotFoundError(f"Could not find the file: 
{params['answers_path']}") + except json.JSONDecodeError as e: + raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}") + + logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}") + evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"]) + metrics = await evaluator.execute( + answers=answers, evaluator_metrics=params["evaluation_metrics"] + ) + with open(params["metrics_path"], "w", encoding="utf-8") as f: + json.dump(metrics, f, ensure_ascii=False, indent=4) + + await create_and_insert_metrics_table(metrics) + logging.info("Evaluation completed") + + async def run_evaluation(params: dict) -> None: + """Run each step of the evaluation pipeline based on configuration flags.""" + # Step 1: Evaluate answers if requested if params.get("evaluating_answers"): - logging.info("Evaluation started...") - try: - with open(params["answers_path"], "r", encoding="utf-8") as f: - answers = json.load(f) - except FileNotFoundError: - raise FileNotFoundError(f"Could not find the file: {params['answers_path']}") - except json.JSONDecodeError as e: - raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}") + await execute_evaluation(params) + else: + logging.info("Skipping evaluation as evaluating_answers is False") - logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}") - evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"]) - metrics = await evaluator.execute( - answers=answers, evaluator_metrics=params["evaluation_metrics"] + # Step 2: Calculate metrics if requested + if params.get("calculate_metrics"): + logging.info("Calculating metrics statistics...") + calculate_metrics_statistics( + json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"] ) - with open(params["metrics_path"], "w", encoding="utf-8") as f: - json.dump(metrics, f, ensure_ascii=False, indent=4) - - await create_and_insert_metrics_table(metrics) - - logging.info("Evaluation End...") + logging.info("Metrics calculation completed") + else: + logging.info("Skipping metrics calculation as calculate_metrics is False") + # Step 3: Generate dashboard if requested if params.get("dashboard"): - generate_metrics_dashboard( - json_data=params["metrics_path"], + logging.info("Generating dashboard...") + create_dashboard( + metrics_path=params["metrics_path"], + aggregate_metrics_path=params["aggregate_metrics_path"], output_file=params["dashboard_path"], benchmark=params["benchmark"], ) + logging.info(f"Dashboard generated at {params['dashboard_path']}") + else: + logging.info("Skipping dashboard generation as dashboard is False") diff --git a/evals/eval_framework/modal_run_eval.py b/evals/eval_framework/modal_run_eval.py new file mode 100644 index 000000000..f04c42954 --- /dev/null +++ b/evals/eval_framework/modal_run_eval.py @@ -0,0 +1,116 @@ +import modal +import os +import json +import asyncio +import datetime +import logging +from evals.eval_framework.eval_config import EvalConfig +from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder +from evals.eval_framework.answer_generation.run_question_answering_module import ( + run_question_answering, +) +from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation + +logger = logging.getLogger(__name__) + + +def read_and_combine_metrics(eval_params: dict) -> dict: + """Read and combine metrics files into a single result dictionary.""" + try: + with open(eval_params["metrics_path"], "r") as f: + 
metrics = json.load(f) + with open(eval_params["aggregate_metrics_path"], "r") as f: + aggregate_metrics = json.load(f) + + return { + "task_getter_type": eval_params["task_getter_type"], + "number_of_samples": eval_params["number_of_samples_in_corpus"], + "metrics": metrics, + "aggregate_metrics": aggregate_metrics, + } + except (FileNotFoundError, json.JSONDecodeError) as e: + logger.error(f"Error reading metrics files: {e}") + return None + + +app = modal.App("modal-run-eval") + +image = ( + modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False) + .copy_local_file("pyproject.toml", "pyproject.toml") + .copy_local_file("poetry.lock", "poetry.lock") + .env( + { + "ENV": os.getenv("ENV"), + "LLM_API_KEY": os.getenv("LLM_API_KEY"), + "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"), + } + ) + .poetry_install_from_file(poetry_pyproject_toml="pyproject.toml") + .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly") +) + + +@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1) +async def modal_run_eval(eval_params=None): + """Runs evaluation pipeline and returns combined metrics results.""" + if eval_params is None: + eval_params = EvalConfig().to_dict() + + logger.info(f"Running evaluation with params: {eval_params}") + + # Run the evaluation pipeline + await run_corpus_builder(eval_params) + await run_question_answering(eval_params) + await run_evaluation(eval_params) + + # Early return if metrics calculation wasn't requested + if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"): + logger.info( + "Skipping metrics collection as either evaluating_answers or calculate_metrics is False" + ) + return None + + return read_and_combine_metrics(eval_params) + + +@app.local_entrypoint() +async def main(): + # List of configurations to run + configs = [ + EvalConfig( + task_getter_type="Default", + number_of_samples_in_corpus=2, + building_corpus_from_scratch=True, + answering_questions=True, + evaluating_answers=True, + calculate_metrics=True, + dashboard=False, + ), + EvalConfig( + task_getter_type="Default", + number_of_samples_in_corpus=10, + building_corpus_from_scratch=True, + answering_questions=True, + evaluating_answers=True, + calculate_metrics=True, + dashboard=False, + ), + ] + + # Run evaluations in parallel with different configurations + modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs] + results = await asyncio.gather(*modal_tasks) + + # Filter out None results and save combined results + results = [r for r in results if r is not None] + if results: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = f"combined_results_{timestamp}.json" + + with open(output_file, "w") as f: + json.dump(results, f, indent=2) + + logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}") + else: + logger.info("No metrics were collected from any of the evaluation runs")
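
Usage note: a minimal sketch of driving the refactored analysis modules from this diff directly, outside the Modal pipeline. The file names mirror the EvalConfig defaults above; the benchmark label is an arbitrary placeholder.

# Illustrative sketch only; paths follow EvalConfig defaults, benchmark label is a placeholder.
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard

# Compute per-metric scores, bootstrap confidence intervals, and write the aggregate file.
metrics_data, metric_details, ci_results = calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)

# Render the HTML dashboard from the raw metrics plus the aggregate file.
create_dashboard(
    metrics_path="metrics_output.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="ExampleBenchmark",
)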