<!-- .github/pull_request_template.md -->

## Description

<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **Tests**
  - Added a suite of tests to validate evaluation logic under various scenarios, including handling of valid inputs and error conditions.
  - Introduced comprehensive tests verifying the accuracy of evaluation metrics, ensuring reliable scoring and error management.
  - Created a new test suite for the `DeepEvalAdapter`, covering correctness, unsupported metrics, and error handling (a hedged sketch of such a test follows below).
  - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`, parameterized for various test cases (the full test file is reproduced below).

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
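The `DeepEvalAdapter` suite itself is not shown in this excerpt. As a rough illustration of the "unsupported metrics" case the summary mentions, a test might look like the sketch below; the import path, constructor, method name, and exception type are all assumptions, not the adapter's confirmed API:

```python
# Hedged sketch only: DeepEvalAdapter's real module path, method signature,
# and error type are assumptions here, not confirmed by this PR.
import pytest


def test_unsupported_metric_raises():
    # Assumed import path, mirroring the metrics' package layout.
    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter

    adapter = DeepEvalAdapter()
    # Requesting a metric the adapter does not recognize should fail loudly;
    # ValueError is a guess at the concrete exception type.
    with pytest.raises(ValueError):
        adapter.evaluate("question", "answer", "golden answer", ["no_such_metric"])
```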
The new test file (58 lines of Python):
```python
import sys
from typing import Optional
from unittest.mock import MagicMock, patch

import pytest

# Stub out deepeval before importing the metrics so the tests run even
# when the optional deepeval dependency is not installed.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric


class MockTestCase:
    """Minimal stand-in for a deepeval test case."""

    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
        self.actual_output = actual_output
        self.expected_output = expected_output


@pytest.fixture
def metrics():
    return {
        "exact_match": ExactMatchMetric(),
        "f1": F1ScoreMetric(),
    }


@pytest.mark.parametrize(
    "actual, expected, expected_exact_score, expected_f1_range",
    [
        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
        ("Hello World", "hello world", 1.0, (1.0, 1.0)),  # case-insensitive
        ("Hello  World", "Hello World", 0.0, (0.0, 1.0)),  # internal double space
        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),  # outer whitespace stripped
        ("", "Hello World", 0.0, (0.0, 0.0)),
        ("Hello World", "", 0.0, (0.0, 0.0)),
        ("", "", 1.0, (1.0, 1.0)),
        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),  # partial token overlap
        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
        ("123", "123", 1.0, (1.0, 1.0)),
        ("123", "456", 0.0, (0.0, 0.0)),
        ("Café", "café", 1.0, (1.0, 1.0)),  # non-ASCII, case-insensitive
        ("Café", "Cafe", 0.0, (0.0, 0.0)),  # accents are significant
    ],
)
def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
    test_case = MockTestCase(actual, expected)

    exact_match_score = metrics["exact_match"].measure(test_case)
    assert exact_match_score == expected_exact_score, (
        f"Exact match failed for '{actual}' vs '{expected}'"
    )

    f1_score = metrics["f1"].measure(test_case)
    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
        f"F1 score failed for '{actual}' vs '{expected}'"
    )
```
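For context on the `expected_f1_range` bounds above: a token-overlap F1 compares whitespace-split tokens via precision and recall, so partially overlapping strings land strictly between 0 and 1. The sketch below illustrates that computation; it is not `F1ScoreMetric`'s actual implementation, which may normalize differently:

```python
# Illustrative token-level F1; F1ScoreMetric's real logic may differ.
def token_f1(actual: str, expected: str) -> float:
    a, e = actual.lower().split(), expected.lower().split()
    if not a and not e:
        return 1.0  # two empty strings count as a perfect match
    # Count tokens shared between the two outputs (with multiplicity).
    common = sum(min(a.count(tok), e.count(tok)) for tok in set(a))
    if common == 0:
        return 0.0
    precision, recall = common / len(a), common / len(e)
    return 2 * precision * recall / (precision + recall)
```

Under this definition, `token_f1("Hello", "Hello World")` has precision 1.0 and recall 0.5, giving F1 ≈ 0.67, which is why that row asserts a range of (0.0, 1.0) rather than a point value.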