test: eval_framework/evaluation unit tests [cog-1234] (#575)

<!-- .github/pull_request_template.md -->

## Description
Adds unit tests for the evaluation framework: the `DeepEvalAdapter.evaluate_answers` flow and the `ExactMatchMetric`/`F1ScoreMetric` scoring classes.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **Tests**
  - Added a suite of tests to validate evaluation logic under various scenarios, including handling of valid inputs and error conditions.
  - Introduced comprehensive tests verifying the accuracy of evaluation metrics, ensuring reliable scoring and error management.
  - Created a new test suite for the `DeepEvalAdapter`, covering correctness, unsupported metrics, and error handling.
  - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`, parameterized for various test cases.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
alekszievr 2025-02-27 13:24:47 +01:00 committed by GitHub
parent c9aee6fbf4
commit 4c3c811c1e
2 changed files with 147 additions and 0 deletions

@@ -0,0 +1,89 @@
import pytest
from unittest.mock import patch, MagicMock
from evals.eval_framework.eval_config import EvalConfig
import sys

# Mock out deepeval before importing the adapter so the real dependency is not required for unit tests.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter


@pytest.fixture
def adapter():
    return DeepEvalAdapter()


@pytest.mark.asyncio
async def test_evaluate_answers_em_f1(adapter):
    answers = [
        {
            "question": "What is 2 + 2?",
            "answer": "4",
            "golden_answer": "4",
        }
    ]
    evaluator_metrics = ["EM", "f1"]

    results = await adapter.evaluate_answers(answers, evaluator_metrics)

    assert len(results) == 1
    assert "metrics" in results[0]
    assert "EM" in results[0]["metrics"]
    assert "f1" in results[0]["metrics"]


@pytest.mark.asyncio
async def test_unsupported_metric(adapter):
    answers = [
        {
            "question": "What is 2 + 2?",
            "answer": "4",
            "golden_answer": "4",
        }
    ]
    evaluator_metrics = ["unsupported_metric"]

    with pytest.raises(ValueError, match="Unsupported metric: unsupported_metric"):
        await adapter.evaluate_answers(answers, evaluator_metrics)


@pytest.mark.asyncio
async def test_empty_answers_list(adapter):
    results = await adapter.evaluate_answers([], ["EM", "f1"])
    assert results == []


@pytest.mark.asyncio
async def test_missing_fields_in_answer(adapter):
    answers = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris",
        }
    ]
    evaluator_metrics = ["EM", "f1"]

    with pytest.raises(KeyError):
        await adapter.evaluate_answers(answers, evaluator_metrics)


@pytest.mark.asyncio
async def test_none_values_in_answers(adapter):
    answers = [
        {
            "question": None,
            "answer": None,
            "golden_answer": None,
        }
    ]
    evaluator_metrics = ["EM", "f1"]

    results = await adapter.evaluate_answers(answers, evaluator_metrics)

    assert len(results) == 1
    assert "metrics" in results[0]
    assert "EM" in results[0]["metrics"]
    assert "f1" in results[0]["metrics"]

@@ -0,0 +1,58 @@
import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock

# Mock out deepeval so the metric modules can be imported without the real dependency installed.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric


class MockTestCase:
    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
        self.actual_output = actual_output
        self.expected_output = expected_output


@pytest.fixture
def metrics():
    return {
        "exact_match": ExactMatchMetric(),
        "f1": F1ScoreMetric(),
    }


@pytest.mark.parametrize(
    "actual, expected, expected_exact_score, expected_f1_range",
    [
        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
        ("Hello World", "Hello World", 0.0, (0.0, 1.0)),
        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
        ("", "Hello World", 0.0, (0.0, 0.0)),
        ("Hello World", "", 0.0, (0.0, 0.0)),
        ("", "", 1.0, (1.0, 1.0)),
        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
        ("123", "123", 1.0, (1.0, 1.0)),
        ("123", "456", 0.0, (0.0, 0.0)),
        ("Café", "café", 1.0, (1.0, 1.0)),
        ("Café", "Cafe", 0.0, (0.0, 0.0)),
    ],
)
def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
    test_case = MockTestCase(actual, expected)

    exact_match_score = metrics["exact_match"].measure(test_case)
    assert exact_match_score == expected_exact_score, (
        f"Exact match failed for '{actual}' vs '{expected}'"
    )

    f1_score = metrics["f1"].measure(test_case)
    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
        f"F1 score failed for '{actual}' vs '{expected}'"
    )
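
The expected F1 ranges in the parameterized cases are consistent with a plain token-overlap F1: lowercase normalization, whitespace tokenization, zero when exactly one side is empty, and a score of 1.0 when both sides are empty. The helper below is a sketch under those assumptions; it is not necessarily how `F1ScoreMetric.measure` is actually implemented.

# Token-level F1 sketch consistent with the parameterized expectations above;
# an illustrative assumption, not necessarily the shipped F1ScoreMetric logic.
from collections import Counter


def token_f1(actual: str, expected: str) -> float:
    actual_tokens = actual.strip().lower().split()
    expected_tokens = expected.strip().lower().split()

    if not actual_tokens and not expected_tokens:
        return 1.0  # ("", "") is expected to score 1.0
    if not actual_tokens or not expected_tokens:
        return 0.0

    # Shared tokens counted with multiplicity, then harmonic mean of precision and recall.
    overlap = sum((Counter(actual_tokens) & Counter(expected_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(actual_tokens)
    recall = overlap / len(expected_tokens)
    return 2 * precision * recall / (precision + recall)


assert token_f1("Hello World", "hello world") == 1.0   # case-insensitive match
assert token_f1("Café", "Cafe") == 0.0                 # no accent folding
assert 0.0 < token_f1("Hello", "Hello World") < 1.0    # partial overlap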