test: eval_framework/evaluation unit tests [cog-1234] (#575)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **Tests** - Added a suite of tests to validate evaluation logic under various scenarios, including handling of valid inputs and error conditions. - Introduced comprehensive tests verifying the accuracy of evaluation metrics, ensuring reliable scoring and error management. - Created a new test suite for the `DeepEvalAdapter`, covering correctness, unsupported metrics, and error handling. - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`, parameterized for various test cases. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
parent
c9aee6fbf4
commit
4c3c811c1e
2 changed files with 147 additions and 0 deletions
89
cognee/tests/unit/eval_framework/deepeval_adapter_test.py
Normal file
89
cognee/tests/unit/eval_framework/deepeval_adapter_test.py
Normal file
|
|
@ -0,0 +1,89 @@
|
||||||
|
import pytest
from unittest.mock import patch, MagicMock
# NOTE(review): EvalConfig is imported but never referenced in this module;
# confirm it is needed (e.g. for import-time side effects) or drop it.
from evals.eval_framework.eval_config import EvalConfig
import sys

# Stub the third-party `deepeval` package (and the submodules the adapter
# imports from) in sys.modules so DeepEvalAdapter can be imported without
# deepeval being installed.  The patch only needs to be active while the
# import statement below executes.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def adapter():
    """Provide a fresh DeepEvalAdapter instance for each test."""
    deep_eval_adapter = DeepEvalAdapter()
    return deep_eval_adapter
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_evaluate_answers_em_f1(adapter):
    """A single correct answer yields one result carrying both EM and f1 scores."""
    qa_items = [
        {"question": "What is 2 + 2?", "answer": "4", "golden_answer": "4"}
    ]
    requested_metrics = ["EM", "f1"]

    evaluated = await adapter.evaluate_answers(qa_items, requested_metrics)

    assert len(evaluated) == 1
    first_result = evaluated[0]
    assert "metrics" in first_result
    # Every requested metric must appear in the per-answer metrics mapping.
    for metric_name in requested_metrics:
        assert metric_name in first_result["metrics"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_unsupported_metric(adapter):
    """Requesting an unknown metric name must raise a descriptive ValueError."""
    qa_items = [
        {"question": "What is 2 + 2?", "answer": "4", "golden_answer": "4"}
    ]

    with pytest.raises(ValueError, match="Unsupported metric: unsupported_metric"):
        await adapter.evaluate_answers(qa_items, ["unsupported_metric"])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_empty_answers_list(adapter):
    """An empty answers list evaluates to an empty results list."""
    evaluated = await adapter.evaluate_answers([], ["EM", "f1"])
    assert evaluated == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_missing_fields_in_answer(adapter):
    """An answer dict lacking the golden answer surfaces as a KeyError."""
    incomplete_items = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris",
        }
    ]

    with pytest.raises(KeyError):
        await adapter.evaluate_answers(incomplete_items, ["EM", "f1"])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_none_values_in_answers(adapter):
    """None question/answer/golden_answer values are tolerated and still scored."""
    qa_items = [
        {"question": None, "answer": None, "golden_answer": None}
    ]
    requested_metrics = ["EM", "f1"]

    evaluated = await adapter.evaluate_answers(qa_items, requested_metrics)

    assert len(evaluated) == 1
    result = evaluated[0]
    assert "metrics" in result
    # Both requested metrics must be present even for degenerate inputs.
    for metric_name in requested_metrics:
        assert metric_name in result["metrics"]
|
||||||
58
cognee/tests/unit/eval_framework/metrics_test.py
Normal file
58
cognee/tests/unit/eval_framework/metrics_test.py
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock

# Stub the third-party `deepeval` package (and the submodule the metric
# classes import from) in sys.modules so the metric implementations can be
# imported without deepeval being installed; the patch only needs to be
# active while the imports below execute.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
|
||||||
|
|
||||||
|
|
||||||
|
class MockTestCase:
    """Lightweight stand-in for a deepeval test case.

    Carries only the two attributes the metrics under test read:
    ``actual_output`` and ``expected_output``.
    """

    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
        self.actual_output, self.expected_output = actual_output, expected_output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def metrics():
    """Map metric keys to freshly constructed metric instances."""
    metric_instances = {}
    metric_instances["exact_match"] = ExactMatchMetric()
    metric_instances["f1"] = F1ScoreMetric()
    return metric_instances
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "actual, expected, expected_exact_score, expected_f1_range",
    [
        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
        # FIX: this case previously repeated the identical pair from the first
        # row but expected an exact-match score of 0.0 — contradictory, so one
        # of the two cases had to fail regardless of the implementation
        # (the distinguishing character was likely lost in transcription).
        # Use a pair that genuinely differs but shares tokens instead.
        ("Hello there World", "Hello World", 0.0, (0.0, 1.0)),
        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
        ("", "Hello World", 0.0, (0.0, 0.0)),
        ("Hello World", "", 0.0, (0.0, 0.0)),
        ("", "", 1.0, (1.0, 1.0)),
        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
        ("123", "123", 1.0, (1.0, 1.0)),
        ("123", "456", 0.0, (0.0, 0.0)),
        ("Café", "café", 1.0, (1.0, 1.0)),
        ("Café", "Cafe", 0.0, (0.0, 0.0)),
    ],
)
def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
    """Exercise ExactMatchMetric and F1ScoreMetric across edge cases.

    ``expected_exact_score`` pins the exact-match result; ``expected_f1_range``
    is an inclusive (low, high) band, since the F1 value for partial overlaps
    depends on the metric's tokenization.
    """
    test_case = MockTestCase(actual, expected)

    exact_match_score = metrics["exact_match"].measure(test_case)
    assert exact_match_score == expected_exact_score, (
        f"Exact match failed for '{actual}' vs '{expected}'"
    )

    f1_score = metrics["f1"].measure(test_case)
    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
        f"F1 score failed for '{actual}' vs '{expected}'"
    )
|
||||||
Loading…
Add table
Reference in a new issue