test: eval_framework/evaluation unit tests [cog-1234] (#575)
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **Tests**
  - Added a suite of tests to validate evaluation logic under various scenarios, including handling of valid inputs and error conditions.
  - Introduced comprehensive tests verifying the accuracy of evaluation metrics, ensuring reliable scoring and error management.
  - Created a new test suite for the `DeepEvalAdapter`, covering correctness, unsupported metrics, and error handling.
  - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`, parameterized for various test cases.
Parent: c9aee6fbf4
Commit: 4c3c811c1e
2 changed files with 147 additions and 0 deletions
### `cognee/tests/unit/eval_framework/deepeval_adapter_test.py` (new file, 89 lines)
```python
import pytest
from unittest.mock import patch, MagicMock
from evals.eval_framework.eval_config import EvalConfig
import sys

with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter


@pytest.fixture
def adapter():
    return DeepEvalAdapter()


@pytest.mark.asyncio
async def test_evaluate_answers_em_f1(adapter):
    answers = [
        {
            "question": "What is 2 + 2?",
            "answer": "4",
            "golden_answer": "4",
        }
    ]

    evaluator_metrics = ["EM", "f1"]

    results = await adapter.evaluate_answers(answers, evaluator_metrics)

    assert len(results) == 1
    assert "metrics" in results[0]
    assert "EM" in results[0]["metrics"]
    assert "f1" in results[0]["metrics"]


@pytest.mark.asyncio
async def test_unsupported_metric(adapter):
    answers = [
        {
            "question": "What is 2 + 2?",
            "answer": "4",
            "golden_answer": "4",
        }
    ]
    evaluator_metrics = ["unsupported_metric"]

    with pytest.raises(ValueError, match="Unsupported metric: unsupported_metric"):
        await adapter.evaluate_answers(answers, evaluator_metrics)


@pytest.mark.asyncio
async def test_empty_answers_list(adapter):
    results = await adapter.evaluate_answers([], ["EM", "f1"])
    assert results == []


@pytest.mark.asyncio
async def test_missing_fields_in_answer(adapter):
    answers = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris",
        }
    ]
    evaluator_metrics = ["EM", "f1"]

    with pytest.raises(KeyError):
        await adapter.evaluate_answers(answers, evaluator_metrics)


@pytest.mark.asyncio
async def test_none_values_in_answers(adapter):
    answers = [
        {
            "question": None,
            "answer": None,
            "golden_answer": None,
        }
    ]
    evaluator_metrics = ["EM", "f1"]

    results = await adapter.evaluate_answers(answers, evaluator_metrics)

    assert len(results) == 1
    assert "metrics" in results[0]
    assert "EM" in results[0]["metrics"]
    assert "f1" in results[0]["metrics"]
```
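Taken together, these tests pin down the `evaluate_answers` contract without importing the real `deepeval` package (the `patch.dict(sys.modules, ...)` block swaps it for a `MagicMock` at import time): the call is async, takes a list of answer dicts plus a list of metric names, returns one result per answer with a `metrics` mapping, raises `ValueError` for unknown metric names, and lets a missing `question`/`answer`/`golden_answer` key surface as a `KeyError`. The sketch below is only an illustration of that contract; the class name, metric registry, and placeholder scoring are assumptions, not the actual `DeepEvalAdapter` implementation.

```python
from typing import Any, Dict, List


class SketchEvalAdapter:
    """Illustrative stand-in for the contract exercised above, not the real DeepEvalAdapter."""

    # Hypothetical registry; the real adapter maps these names to metric implementations.
    SUPPORTED_METRICS = {"EM", "f1"}

    async def evaluate_answers(
        self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
    ) -> List[Dict[str, Any]]:
        # Unknown metric names fail fast with the message the test matches on.
        for metric in evaluator_metrics:
            if metric not in self.SUPPORTED_METRICS:
                raise ValueError(f"Unsupported metric: {metric}")

        results = []
        for answer in answers:
            # Plain key access: a missing field raises KeyError, as the tests expect.
            question = answer["question"]
            actual = answer["answer"]
            golden = answer["golden_answer"]
            # Placeholder scoring so the example is self-contained; the real adapter
            # delegates each metric to its deepeval-backed implementation.
            scores = {name: {"score": float(actual == golden)} for name in evaluator_metrics}
            results.append(
                {
                    "question": question,
                    "answer": actual,
                    "golden_answer": golden,
                    "metrics": scores,
                }
            )
        return results
```

In the real adapter the per-metric scores come from deepeval-backed metric classes; the sketch only mirrors the shape of the result that the assertions above rely on.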
### `cognee/tests/unit/eval_framework/metrics_test.py` (new file, 58 lines)
```python
import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock

with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric


class MockTestCase:
    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
        self.actual_output = actual_output
        self.expected_output = expected_output


@pytest.fixture
def metrics():
    return {
        "exact_match": ExactMatchMetric(),
        "f1": F1ScoreMetric(),
    }


@pytest.mark.parametrize(
    "actual, expected, expected_exact_score, expected_f1_range",
    [
        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
        ("Hello  World", "Hello World", 0.0, (0.0, 1.0)),  # extra internal space: exact match fails, token F1 may not
        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
        ("", "Hello World", 0.0, (0.0, 0.0)),
        ("Hello World", "", 0.0, (0.0, 0.0)),
        ("", "", 1.0, (1.0, 1.0)),
        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
        ("123", "123", 1.0, (1.0, 1.0)),
        ("123", "456", 0.0, (0.0, 0.0)),
        ("Café", "café", 1.0, (1.0, 1.0)),
        ("Café", "Cafe", 0.0, (0.0, 0.0)),
    ],
)
def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
    test_case = MockTestCase(actual, expected)

    exact_match_score = metrics["exact_match"].measure(test_case)
    assert exact_match_score == expected_exact_score, (
        f"Exact match failed for '{actual}' vs '{expected}'"
    )

    f1_score = metrics["f1"].measure(test_case)
    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
        f"F1 score failed for '{actual}' vs '{expected}'"
    )
```
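The parameter table reads like SQuAD-style QA scoring: exact match compares trimmed, lowercased strings, and F1 is computed over lowercased whitespace tokens, with two empty answers counting as a perfect match and `"Café"` vs `"Cafe"` counting as a miss. The real `ExactMatchMetric` and `F1ScoreMetric` live under `evals/eval_framework/evaluation/metrics`; the sketch below is only an assumed reference implementation consistent with these expected values, not the project's code.

```python
from collections import Counter


class SketchExactMatchMetric:
    """Illustrative only: case-insensitive, whitespace-trimmed string equality."""

    def measure(self, test_case) -> float:
        actual = (test_case.actual_output or "").strip().lower()
        expected = (test_case.expected_output or "").strip().lower()
        return 1.0 if actual == expected else 0.0


class SketchF1ScoreMetric:
    """Illustrative only: token-level F1 over lowercased whitespace tokens."""

    def measure(self, test_case) -> float:
        actual = (test_case.actual_output or "").lower().split()
        expected = (test_case.expected_output or "").lower().split()
        if not actual and not expected:
            return 1.0  # convention: two empty answers count as a perfect match
        common = Counter(actual) & Counter(expected)
        overlap = sum(common.values())
        if overlap == 0:
            return 0.0
        precision = overlap / len(actual)
        recall = overlap / len(expected)
        return 2 * precision * recall / (precision + recall)
```

Any implementation with those normalization choices would satisfy the parameterized cases; the tests deliberately express F1 as a range so partial token overlap (e.g. `"Hello"` vs `"Hello World"`) only has to land somewhere in [0, 1].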