From 4c3c811c1e902167474e229067c33c8896f1fcd1 Mon Sep 17 00:00:00 2001
From: alekszievr <44192193+alekszievr@users.noreply.github.com>
Date: Thu, 27 Feb 2025 13:24:47 +0100
Subject: [PATCH] test: eval_framework/evaluation unit tests [cog-1234] (#575)

## Description

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin

## Summary by CodeRabbit

- **Tests**
  - Added a suite of tests to validate evaluation logic under various scenarios, including handling of valid inputs and error conditions.
  - Introduced comprehensive tests verifying the accuracy of evaluation metrics, ensuring reliable scoring and error management.
  - Created a new test suite for the `DeepEvalAdapter`, covering correctness, unsupported metrics, and error handling.
  - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`, parameterized for various test cases.
---
 .../eval_framework/deepeval_adapter_test.py   | 89 +++++++++++++++++++
 .../tests/unit/eval_framework/metrics_test.py | 58 ++++++++++++
 2 files changed, 147 insertions(+)
 create mode 100644 cognee/tests/unit/eval_framework/deepeval_adapter_test.py
 create mode 100644 cognee/tests/unit/eval_framework/metrics_test.py

diff --git a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
new file mode 100644
index 000000000..d4226cc66
--- /dev/null
+++ b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
@@ -0,0 +1,89 @@
+import pytest
+from unittest.mock import patch, MagicMock
+from evals.eval_framework.eval_config import EvalConfig
+import sys
+
+with patch.dict(
+    sys.modules,
+    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
+):
+    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+
+
+@pytest.fixture
+def adapter():
+    return DeepEvalAdapter()
+
+
+@pytest.mark.asyncio
+async def test_evaluate_answers_em_f1(adapter):
+    answers = [
+        {
+            "question": "What is 2 + 2?",
+            "answer": "4",
+            "golden_answer": "4",
+        }
+    ]
+
+    evaluator_metrics = ["EM", "f1"]
+
+    results = await adapter.evaluate_answers(answers, evaluator_metrics)
+
+    assert len(results) == 1
+    assert "metrics" in results[0]
+    assert "EM" in results[0]["metrics"]
+    assert "f1" in results[0]["metrics"]
+
+
+@pytest.mark.asyncio
+async def test_unsupported_metric(adapter):
+    answers = [
+        {
+            "question": "What is 2 + 2?",
+            "answer": "4",
+            "golden_answer": "4",
+        }
+    ]
+    evaluator_metrics = ["unsupported_metric"]
+
+    with pytest.raises(ValueError, match="Unsupported metric: unsupported_metric"):
+        await adapter.evaluate_answers(answers, evaluator_metrics)
+
+
+@pytest.mark.asyncio
+async def test_empty_answers_list(adapter):
+    results = await adapter.evaluate_answers([], ["EM", "f1"])
+    assert results == []
+
+
+@pytest.mark.asyncio
+async def test_missing_fields_in_answer(adapter):
+    answers = [
+        {
+            "question": "What is the capital of France?",
+            "answer": "Paris",
+        }
+    ]
+    evaluator_metrics = ["EM", "f1"]
+
+    with pytest.raises(KeyError):
+        await adapter.evaluate_answers(answers, evaluator_metrics)
+
+
+@pytest.mark.asyncio
+async def test_none_values_in_answers(adapter):
+    answers = [
+        {
+            "question": None,
+            "answer": None,
+            "golden_answer": None,
+        }
+    ]
+    evaluator_metrics = ["EM", "f1"]
+
+    results = await adapter.evaluate_answers(answers, evaluator_metrics)
+
+    assert len(results) == 1
+    assert "metrics" in results[0]
+    assert "EM" in results[0]["metrics"]
+    assert "f1" in results[0]["metrics"]
diff --git a/cognee/tests/unit/eval_framework/metrics_test.py b/cognee/tests/unit/eval_framework/metrics_test.py
new file mode 100644
index 000000000..c67e845b5
--- /dev/null
+++ b/cognee/tests/unit/eval_framework/metrics_test.py
@@ -0,0 +1,58 @@
+import pytest
+from typing import Optional
+import sys
+from unittest.mock import patch, MagicMock
+
+with patch.dict(
+    sys.modules,
+    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
+):
+    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
+    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+
+
+class MockTestCase:
+    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
+        self.actual_output = actual_output
+        self.expected_output = expected_output
+
+
+@pytest.fixture
+def metrics():
+    return {
+        "exact_match": ExactMatchMetric(),
+        "f1": F1ScoreMetric(),
+    }
+
+
+@pytest.mark.parametrize(
+    "actual, expected, expected_exact_score, expected_f1_range",
+    [
+        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
+        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
+        ("Hello  World", "Hello World", 0.0, (0.0, 1.0)),
+        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
+        ("", "Hello World", 0.0, (0.0, 0.0)),
+        ("Hello World", "", 0.0, (0.0, 0.0)),
+        ("", "", 1.0, (1.0, 1.0)),
+        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
+        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
+        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
+        ("123", "123", 1.0, (1.0, 1.0)),
+        ("123", "456", 0.0, (0.0, 0.0)),
+        ("Café", "café", 1.0, (1.0, 1.0)),
+        ("Café", "Cafe", 0.0, (0.0, 0.0)),
+    ],
+)
+def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
+    test_case = MockTestCase(actual, expected)
+
+    exact_match_score = metrics["exact_match"].measure(test_case)
+    assert exact_match_score == expected_exact_score, (
+        f"Exact match failed for '{actual}' vs '{expected}'"
+    )
+
+    f1_score = metrics["f1"].measure(test_case)
+    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
+        f"F1 score failed for '{actual}' vs '{expected}'"
+    )
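
Note: the `F1ScoreMetric` implementation itself is outside this diff, so the `expected_f1_range` values above are the only contract visible here. The sketch below is a hypothetical token-level F1 that satisfies every range asserted in `metrics_test.py`; the name `token_f1` and its normalization rules (lowercasing, whitespace tokenization) are assumptions, not the actual code in `evals/eval_framework/evaluation/metrics/f1.py`.

```python
# Hypothetical sketch only -- not the shipped F1ScoreMetric.
from collections import Counter


def token_f1(actual: str, expected: str) -> float:
    # Lowercase and split on whitespace, so case and surrounding spaces are ignored.
    actual_tokens = actual.lower().split()
    expected_tokens = expected.lower().split()
    if not actual_tokens or not expected_tokens:
        # Two empty answers count as a perfect match; one empty side scores 0.0.
        return float(actual_tokens == expected_tokens)
    # Multiset token overlap, then the harmonic mean of precision and recall.
    overlap = sum((Counter(actual_tokens) & Counter(expected_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(actual_tokens)
    recall = overlap / len(expected_tokens)
    return 2 * precision * recall / (precision + recall)
```

For example, `token_f1("Hello World", "Goodbye World")` yields 0.5, which falls inside the `(0.0, 1.0)` range asserted for that case, while the empty-vs-nonempty cases yield 0.0 as the `(0.0, 0.0)` ranges require.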