test: eval_framework/evaluation unit tests [cog-1234] (#575)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **Tests** - Added a suite of tests to validate evaluation logic under various scenarios, including handling of valid inputs and error conditions. - Introduced comprehensive tests verifying the accuracy of evaluation metrics, ensuring reliable scoring and error management. - Created a new test suite for the `DeepEvalAdapter`, covering correctness, unsupported metrics, and error handling. - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`, parameterized for various test cases. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
parent
c9aee6fbf4
commit
4c3c811c1e
2 changed files with 147 additions and 0 deletions
89
cognee/tests/unit/eval_framework/deepeval_adapter_test.py
Normal file
89
cognee/tests/unit/eval_framework/deepeval_adapter_test.py
Normal file
|
|
@ -0,0 +1,89 @@
|
||||||
|
import pytest
from unittest.mock import patch, MagicMock
# NOTE(review): EvalConfig is imported but never referenced in this module;
# confirm it is needed (e.g. for import-time side effects) or drop it.
from evals.eval_framework.eval_config import EvalConfig
import sys

# Stub the third-party `deepeval` package (and the submodules the adapter
# imports from) in sys.modules so DeepEvalAdapter can be imported without
# deepeval being installed.  The patch only needs to be active while the
# import statement below executes.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def adapter():
    """Provide a fresh DeepEvalAdapter instance for each test."""
    deep_eval_adapter = DeepEvalAdapter()
    return deep_eval_adapter
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_evaluate_answers_em_f1(adapter):
    """A single correct answer yields one result carrying both EM and f1 scores."""
    qa_items = [
        {"question": "What is 2 + 2?", "answer": "4", "golden_answer": "4"}
    ]
    requested_metrics = ["EM", "f1"]

    evaluated = await adapter.evaluate_answers(qa_items, requested_metrics)

    assert len(evaluated) == 1
    first_result = evaluated[0]
    assert "metrics" in first_result
    # Every requested metric must appear in the per-answer metrics mapping.
    for metric_name in requested_metrics:
        assert metric_name in first_result["metrics"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_unsupported_metric(adapter):
    """Requesting an unknown metric name must raise a descriptive ValueError."""
    qa_items = [
        {"question": "What is 2 + 2?", "answer": "4", "golden_answer": "4"}
    ]

    with pytest.raises(ValueError, match="Unsupported metric: unsupported_metric"):
        await adapter.evaluate_answers(qa_items, ["unsupported_metric"])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_empty_answers_list(adapter):
    """An empty answers list evaluates to an empty results list."""
    evaluated = await adapter.evaluate_answers([], ["EM", "f1"])
    assert evaluated == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_missing_fields_in_answer(adapter):
    """An answer dict lacking the golden answer surfaces as a KeyError."""
    incomplete_items = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris",
        }
    ]

    with pytest.raises(KeyError):
        await adapter.evaluate_answers(incomplete_items, ["EM", "f1"])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_none_values_in_answers(adapter):
    """None question/answer/golden_answer values are tolerated and still scored."""
    qa_items = [
        {"question": None, "answer": None, "golden_answer": None}
    ]
    requested_metrics = ["EM", "f1"]

    evaluated = await adapter.evaluate_answers(qa_items, requested_metrics)

    assert len(evaluated) == 1
    result = evaluated[0]
    assert "metrics" in result
    # Both requested metrics must be present even for degenerate inputs.
    for metric_name in requested_metrics:
        assert metric_name in result["metrics"]
|
||||||
58
cognee/tests/unit/eval_framework/metrics_test.py
Normal file
58
cognee/tests/unit/eval_framework/metrics_test.py
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock

# Stub the third-party `deepeval` package (and the submodule the metric
# classes import from) in sys.modules so the metric implementations can be
# imported without deepeval being installed; the patch only needs to be
# active while the imports below execute.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
|
||||||
|
|
||||||
|
|
||||||
|
class MockTestCase:
    """Lightweight stand-in for a deepeval test case.

    Carries only the two attributes the metrics under test read:
    ``actual_output`` and ``expected_output``.
    """

    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
        self.actual_output, self.expected_output = actual_output, expected_output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def metrics():
    """Map metric keys to freshly constructed metric instances."""
    metric_instances = {}
    metric_instances["exact_match"] = ExactMatchMetric()
    metric_instances["f1"] = F1ScoreMetric()
    return metric_instances
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "actual, expected, expected_exact_score, expected_f1_range",
    [
        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
        # FIX: this case previously repeated the identical pair from the first
        # row but expected an exact-match score of 0.0 — contradictory, so one
        # of the two cases had to fail regardless of the implementation
        # (the distinguishing character was likely lost in transcription).
        # Use a pair that genuinely differs but shares tokens instead.
        ("Hello there World", "Hello World", 0.0, (0.0, 1.0)),
        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
        ("", "Hello World", 0.0, (0.0, 0.0)),
        ("Hello World", "", 0.0, (0.0, 0.0)),
        ("", "", 1.0, (1.0, 1.0)),
        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
        ("123", "123", 1.0, (1.0, 1.0)),
        ("123", "456", 0.0, (0.0, 0.0)),
        ("Café", "café", 1.0, (1.0, 1.0)),
        ("Café", "Cafe", 0.0, (0.0, 0.0)),
    ],
)
def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
    """Exercise ExactMatchMetric and F1ScoreMetric across edge cases.

    ``expected_exact_score`` pins the exact-match result; ``expected_f1_range``
    is an inclusive (low, high) band, since the F1 value for partial overlaps
    depends on the metric's tokenization.
    """
    test_case = MockTestCase(actual, expected)

    exact_match_score = metrics["exact_match"].measure(test_case)
    assert exact_match_score == expected_exact_score, (
        f"Exact match failed for '{actual}' vs '{expected}'"
    )

    f1_score = metrics["f1"].measure(test_case)
    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
        f"F1 score failed for '{actual}' vs '{expected}'"
    )
|
||||||
Loading…
Add table
Reference in a new issue