test: eval_framework/evaluation unit tests [cog-1234] (#575)

<!-- .github/pull_request_template.md -->

## Description
Adds unit tests for the evaluation framework: the `DeepEvalAdapter.evaluate_answers` flow and the `ExactMatchMetric`/`F1ScoreMetric` scoring classes.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **Tests**
  - Added a suite of tests to validate evaluation logic under various scenarios, including handling of valid inputs and error conditions.
  - Introduced comprehensive tests verifying the accuracy of evaluation metrics, ensuring reliable scoring and error management.
  - Created a new test suite for the `DeepEvalAdapter`, covering correctness, unsupported metrics, and error handling.
  - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`, parameterized for various test cases.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
alekszievr 2025-02-27 13:24:47 +01:00 committed by GitHub
parent c9aee6fbf4
commit 4c3c811c1e
2 changed files with 147 additions and 0 deletions

@@ -0,0 +1,89 @@
import pytest
from unittest.mock import patch, MagicMock
from evals.eval_framework.eval_config import EvalConfig
import sys

# Mock out deepeval before importing the adapter so the real dependency is not required for unit tests.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter


@pytest.fixture
def adapter():
    return DeepEvalAdapter()


@pytest.mark.asyncio
async def test_evaluate_answers_em_f1(adapter):
    answers = [
        {
            "question": "What is 2 + 2?",
            "answer": "4",
            "golden_answer": "4",
        }
    ]
    evaluator_metrics = ["EM", "f1"]

    results = await adapter.evaluate_answers(answers, evaluator_metrics)

    assert len(results) == 1
    assert "metrics" in results[0]
    assert "EM" in results[0]["metrics"]
    assert "f1" in results[0]["metrics"]


@pytest.mark.asyncio
async def test_unsupported_metric(adapter):
    answers = [
        {
            "question": "What is 2 + 2?",
            "answer": "4",
            "golden_answer": "4",
        }
    ]
    evaluator_metrics = ["unsupported_metric"]

    with pytest.raises(ValueError, match="Unsupported metric: unsupported_metric"):
        await adapter.evaluate_answers(answers, evaluator_metrics)


@pytest.mark.asyncio
async def test_empty_answers_list(adapter):
    results = await adapter.evaluate_answers([], ["EM", "f1"])
    assert results == []


@pytest.mark.asyncio
async def test_missing_fields_in_answer(adapter):
    answers = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris",
        }
    ]
    evaluator_metrics = ["EM", "f1"]

    with pytest.raises(KeyError):
        await adapter.evaluate_answers(answers, evaluator_metrics)


@pytest.mark.asyncio
async def test_none_values_in_answers(adapter):
    answers = [
        {
            "question": None,
            "answer": None,
            "golden_answer": None,
        }
    ]
    evaluator_metrics = ["EM", "f1"]

    results = await adapter.evaluate_answers(answers, evaluator_metrics)

    assert len(results) == 1
    assert "metrics" in results[0]
    assert "EM" in results[0]["metrics"]
    assert "f1" in results[0]["metrics"]

@@ -0,0 +1,58 @@
import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock

# Mock out deepeval so the metric modules can be imported without the real dependency installed.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric


class MockTestCase:
    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
        self.actual_output = actual_output
        self.expected_output = expected_output


@pytest.fixture
def metrics():
    return {
        "exact_match": ExactMatchMetric(),
        "f1": F1ScoreMetric(),
    }


@pytest.mark.parametrize(
    "actual, expected, expected_exact_score, expected_f1_range",
    [
        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
        ("Hello World", "Hello World", 0.0, (0.0, 1.0)),
        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
        ("", "Hello World", 0.0, (0.0, 0.0)),
        ("Hello World", "", 0.0, (0.0, 0.0)),
        ("", "", 1.0, (1.0, 1.0)),
        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
        ("123", "123", 1.0, (1.0, 1.0)),
        ("123", "456", 0.0, (0.0, 0.0)),
        ("Café", "café", 1.0, (1.0, 1.0)),
        ("Café", "Cafe", 0.0, (0.0, 0.0)),
    ],
)
def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
    test_case = MockTestCase(actual, expected)

    exact_match_score = metrics["exact_match"].measure(test_case)
    assert exact_match_score == expected_exact_score, (
        f"Exact match failed for '{actual}' vs '{expected}'"
    )

    f1_score = metrics["f1"].measure(test_case)
    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
        f"F1 score failed for '{actual}' vs '{expected}'"
    )
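
The expected F1 ranges in the parameterized cases are consistent with a plain token-overlap F1: lowercase normalization, whitespace tokenization, zero when exactly one side is empty, and a score of 1.0 when both sides are empty. The helper below is a sketch under those assumptions; it is not necessarily how `F1ScoreMetric.measure` is actually implemented.

# Token-level F1 sketch consistent with the parameterized expectations above;
# an illustrative assumption, not necessarily the shipped F1ScoreMetric logic.
from collections import Counter


def token_f1(actual: str, expected: str) -> float:
    actual_tokens = actual.strip().lower().split()
    expected_tokens = expected.strip().lower().split()

    if not actual_tokens and not expected_tokens:
        return 1.0  # ("", "") is expected to score 1.0
    if not actual_tokens or not expected_tokens:
        return 0.0

    # Shared tokens counted with multiplicity, then harmonic mean of precision and recall.
    overlap = sum((Counter(actual_tokens) & Counter(expected_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(actual_tokens)
    recall = overlap / len(expected_tokens)
    return 2 * precision * recall / (precision + recall)


assert token_f1("Hello World", "hello world") == 1.0   # case-insensitive match
assert token_f1("Café", "Cafe") == 0.0                 # no accent folding
assert 0.0 < token_f1("Hello", "Hello World") < 1.0    # partial overlap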