cognee/cognee/tests/unit/eval_framework/metrics_test.py
alekszievr 4c3c811c1e
test: eval_framework/evaluation unit tests [cog-1234] (#575)
<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Tests**
  - Added a suite of tests to validate evaluation logic under various
    scenarios, including handling of valid inputs and error conditions.
  - Introduced comprehensive tests verifying the accuracy of evaluation
    metrics, ensuring reliable scoring and error management.
  - Created a new test suite for the `DeepEvalAdapter`, covering
    correctness, unsupported metrics, and error handling.
  - Added unit tests for `ExactMatchMetric` and `F1ScoreMetric`,
    parameterized for various test cases.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-02-27 13:24:47 +01:00

58 lines
2 KiB
Python

import pytest
from typing import Optional
import sys
from unittest.mock import patch, MagicMock
# Stub out `deepeval` (and its `test_case` submodule) in sys.modules before
# importing the metric classes — presumably the metric modules import
# deepeval at module load, and the real package should not be required
# (or pulled in) for these unit tests. TODO(review): confirm the metric
# modules actually import deepeval at import time.
with patch.dict(
    sys.modules,
    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
):
    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
class MockTestCase:
    """Minimal stand-in for a deepeval test case.

    Carries only the two attributes the metrics under test read:
    ``actual_output`` and ``expected_output``.
    """

    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]) -> None:
        # Store both fields in one unpacking assignment.
        self.actual_output, self.expected_output = actual_output, expected_output
@pytest.fixture
def metrics():
    """Return a fresh instance of each metric under test, keyed by name."""
    instances = {}
    instances["exact_match"] = ExactMatchMetric()
    instances["f1"] = F1ScoreMetric()
    return instances
@pytest.mark.parametrize(
    "actual, expected, expected_exact_score, expected_f1_range",
    [
        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
        # Fixed: this row previously duplicated the identical-strings input
        # above while asserting an exact-match score of 0.0, which contradicts
        # the 1.0 expectation for the same input and would always fail.
        # Use a partial-overlap pair instead (shared "Hello" token -> F1 in (0, 1)).
        ("Hello World", "Hello Universe", 0.0, (0.0, 1.0)),
        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
        ("", "Hello World", 0.0, (0.0, 0.0)),
        ("Hello World", "", 0.0, (0.0, 0.0)),
        ("", "", 1.0, (1.0, 1.0)),
        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
        ("123", "123", 1.0, (1.0, 1.0)),
        ("123", "456", 0.0, (0.0, 0.0)),
        ("Café", "café", 1.0, (1.0, 1.0)),
        ("Café", "Cafe", 0.0, (0.0, 0.0)),
    ],
)
def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
    """Check ExactMatchMetric and F1ScoreMetric against paired outputs.

    Args:
        metrics: fixture providing {"exact_match": ..., "f1": ...} instances.
        actual: the model's answer string.
        expected: the reference answer string.
        expected_exact_score: exact score the exact-match metric must return.
        expected_f1_range: inclusive (low, high) bounds for the F1 score.
    """
    test_case = MockTestCase(actual, expected)

    exact_match_score = metrics["exact_match"].measure(test_case)
    assert exact_match_score == expected_exact_score, (
        f"Exact match failed for '{actual}' vs '{expected}'"
    )

    # F1 is only bounded, not pinned: tokenization details decide the exact value.
    f1_score = metrics["f1"].measure(test_case)
    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
        f"F1 score failed for '{actual}' vs '{expected}'"
    )