From 4c3c811c1e902167474e229067c33c8896f1fcd1 Mon Sep 17 00:00:00 2001
From: alekszievr <44192193+alekszievr@users.noreply.github.com>
Date: Thu, 27 Feb 2025 13:24:47 +0100
Subject: [PATCH] test: eval_framework/evaluation unit tests [cog-1234] (#575)

## Description

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin

## Summary by CodeRabbit

- **Tests**
  - Added a suite of tests to validate evaluation logic under various scenarios, including handling of valid inputs and error conditions.
  - Introduced comprehensive tests verifying the accuracy of evaluation metrics, ensuring reliable scoring and error management.
  - Created a new test suite for the `DeepEvalAdapter`, covering correctness, unsupported metrics, and error handling.
  - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`, parameterized for various test cases.
---
 .../eval_framework/deepeval_adapter_test.py   | 89 +++++++++++++++++++
 .../tests/unit/eval_framework/metrics_test.py | 58 ++++++++++++
 2 files changed, 147 insertions(+)
 create mode 100644 cognee/tests/unit/eval_framework/deepeval_adapter_test.py
 create mode 100644 cognee/tests/unit/eval_framework/metrics_test.py

diff --git a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
new file mode 100644
index 000000000..d4226cc66
--- /dev/null
+++ b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
@@ -0,0 +1,89 @@
+import pytest
+from unittest.mock import patch, MagicMock
+from evals.eval_framework.eval_config import EvalConfig
+import sys
+
+with patch.dict(
+    sys.modules,
+    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
+):
+    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+
+
+@pytest.fixture
+def adapter():
+    return DeepEvalAdapter()
+
+
+@pytest.mark.asyncio
+async def test_evaluate_answers_em_f1(adapter):
+    answers = [
+        {
+            "question": "What is 2 + 2?",
+            "answer": "4",
+            "golden_answer": "4",
+        }
+    ]
+
+    evaluator_metrics = ["EM", "f1"]
+
+    results = await adapter.evaluate_answers(answers, evaluator_metrics)
+
+    assert len(results) == 1
+    assert "metrics" in results[0]
+    assert "EM" in results[0]["metrics"]
+    assert "f1" in results[0]["metrics"]
+
+
+@pytest.mark.asyncio
+async def test_unsupported_metric(adapter):
+    answers = [
+        {
+            "question": "What is 2 + 2?",
+            "answer": "4",
+            "golden_answer": "4",
+        }
+    ]
+    evaluator_metrics = ["unsupported_metric"]
+
+    with pytest.raises(ValueError, match="Unsupported metric: unsupported_metric"):
+        await adapter.evaluate_answers(answers, evaluator_metrics)
+
+
+@pytest.mark.asyncio
+async def test_empty_answers_list(adapter):
+    results = await adapter.evaluate_answers([], ["EM", "f1"])
+    assert results == []
+
+
+@pytest.mark.asyncio
+async def test_missing_fields_in_answer(adapter):
+    answers = [
+        {
+            "question": "What is the capital of France?",
+            "answer": "Paris",
+        }
+    ]
+    evaluator_metrics = ["EM", "f1"]
+
+    with pytest.raises(KeyError):
+        await adapter.evaluate_answers(answers, evaluator_metrics)
+
+
+@pytest.mark.asyncio
+async def test_none_values_in_answers(adapter):
+    answers = [
+        {
+            "question": None,
+            "answer": None,
+            "golden_answer": None,
+        }
+    ]
+    evaluator_metrics = ["EM", "f1"]
+
+    results = await adapter.evaluate_answers(answers, evaluator_metrics)
+
+    assert len(results) == 1
+    assert "metrics" in results[0]
+    assert "EM" in results[0]["metrics"]
+    assert "f1" in results[0]["metrics"]
diff --git a/cognee/tests/unit/eval_framework/metrics_test.py b/cognee/tests/unit/eval_framework/metrics_test.py
new file mode 100644
index 000000000..c67e845b5
--- /dev/null
+++ b/cognee/tests/unit/eval_framework/metrics_test.py
@@ -0,0 +1,58 @@
+import pytest
+from typing import Optional
+import sys
+from unittest.mock import patch, MagicMock
+
+with patch.dict(
+    sys.modules,
+    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
+):
+    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
+    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+
+
+class MockTestCase:
+    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
+        self.actual_output = actual_output
+        self.expected_output = expected_output
+
+
+@pytest.fixture
+def metrics():
+    return {
+        "exact_match": ExactMatchMetric(),
+        "f1": F1ScoreMetric(),
+    }
+
+
+@pytest.mark.parametrize(
+    "actual, expected, expected_exact_score, expected_f1_range",
+    [
+        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
+        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
+        ("Hello  World", "Hello World", 0.0, (0.0, 1.0)),
+        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
+        ("", "Hello World", 0.0, (0.0, 0.0)),
+        ("Hello World", "", 0.0, (0.0, 0.0)),
+        ("", "", 1.0, (1.0, 1.0)),
+        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
+        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
+        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
+        ("123", "123", 1.0, (1.0, 1.0)),
+        ("123", "456", 0.0, (0.0, 0.0)),
+        ("Café", "café", 1.0, (1.0, 1.0)),
+        ("Café", "Cafe", 0.0, (0.0, 0.0)),
+    ],
+)
+def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
+    test_case = MockTestCase(actual, expected)
+
+    exact_match_score = metrics["exact_match"].measure(test_case)
+    assert exact_match_score == expected_exact_score, (
+        f"Exact match failed for '{actual}' vs '{expected}'"
+    )
+
+    f1_score = metrics["f1"].measure(test_case)
+    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
+        f"F1 score failed for '{actual}' vs '{expected}'"
+    )
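
Note: the `F1ScoreMetric` implementation itself is outside this diff, so the `expected_f1_range` values above are the only contract visible here. The sketch below is a hypothetical token-level F1 that satisfies every range asserted in `metrics_test.py`; the name `token_f1` and its normalization rules (lowercasing, whitespace tokenization) are assumptions, not the actual code in `evals/eval_framework/evaluation/metrics/f1.py`.

```python
# Hypothetical sketch only -- not the shipped F1ScoreMetric.
from collections import Counter


def token_f1(actual: str, expected: str) -> float:
    # Lowercase and split on whitespace, so case and surrounding spaces are ignored.
    actual_tokens = actual.lower().split()
    expected_tokens = expected.lower().split()
    if not actual_tokens or not expected_tokens:
        # Two empty answers count as a perfect match; one empty side scores 0.0.
        return float(actual_tokens == expected_tokens)
    # Multiset token overlap, then the harmonic mean of precision and recall.
    overlap = sum((Counter(actual_tokens) & Counter(expected_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(actual_tokens)
    recall = overlap / len(expected_tokens)
    return 2 * precision * recall / (precision + recall)
```

For example, `token_f1("Hello World", "Goodbye World")` yields 0.5, which falls inside the `(0.0, 1.0)` range asserted for that case, while the empty-vs-nonempty cases yield 0.0 as the `(0.0, 0.0)` ranges require.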