diff --git a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
new file mode 100644
index 000000000..d4226cc66
--- /dev/null
+++ b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
@@ -0,0 +1,89 @@
+import pytest
+from unittest.mock import patch, MagicMock
+from evals.eval_framework.eval_config import EvalConfig
+import sys
+
+with patch.dict(
+    sys.modules,
+    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
+):
+    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+
+
+@pytest.fixture
+def adapter():
+    return DeepEvalAdapter()
+
+
+@pytest.mark.asyncio
+async def test_evaluate_answers_em_f1(adapter):
+    answers = [
+        {
+            "question": "What is 2 + 2?",
+            "answer": "4",
+            "golden_answer": "4",
+        }
+    ]
+
+    evaluator_metrics = ["EM", "f1"]
+
+    results = await adapter.evaluate_answers(answers, evaluator_metrics)
+
+    assert len(results) == 1
+    assert "metrics" in results[0]
+    assert "EM" in results[0]["metrics"]
+    assert "f1" in results[0]["metrics"]
+
+
+@pytest.mark.asyncio
+async def test_unsupported_metric(adapter):
+    answers = [
+        {
+            "question": "What is 2 + 2?",
+            "answer": "4",
+            "golden_answer": "4",
+        }
+    ]
+    evaluator_metrics = ["unsupported_metric"]
+
+    with pytest.raises(ValueError, match="Unsupported metric: unsupported_metric"):
+        await adapter.evaluate_answers(answers, evaluator_metrics)
+
+
+@pytest.mark.asyncio
+async def test_empty_answers_list(adapter):
+    results = await adapter.evaluate_answers([], ["EM", "f1"])
+    assert results == []
+
+
+@pytest.mark.asyncio
+async def test_missing_fields_in_answer(adapter):
+    answers = [
+        {
+            "question": "What is the capital of France?",
+            "answer": "Paris",
+        }
+    ]
+    evaluator_metrics = ["EM", "f1"]
+
+    with pytest.raises(KeyError):
+        await adapter.evaluate_answers(answers, evaluator_metrics)
+
+
+@pytest.mark.asyncio
+async def test_none_values_in_answers(adapter):
+    answers = [
+        {
+            "question": None,
+            "answer": None,
+            "golden_answer": None,
+        }
+    ]
+    evaluator_metrics = ["EM", "f1"]
+
+    results = await adapter.evaluate_answers(answers, evaluator_metrics)
+
+    assert len(results) == 1
+    assert "metrics" in results[0]
+    assert "EM" in results[0]["metrics"]
+    assert "f1" in results[0]["metrics"]
diff --git a/cognee/tests/unit/eval_framework/metrics_test.py b/cognee/tests/unit/eval_framework/metrics_test.py
new file mode 100644
index 000000000..c67e845b5
--- /dev/null
+++ b/cognee/tests/unit/eval_framework/metrics_test.py
@@ -0,0 +1,58 @@
+import pytest
+from typing import Optional
+import sys
+from unittest.mock import patch, MagicMock
+
+with patch.dict(
+    sys.modules,
+    {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
+):
+    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
+    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+
+
+class MockTestCase:
+    def __init__(self, actual_output: Optional[str], expected_output: Optional[str]):
+        self.actual_output = actual_output
+        self.expected_output = expected_output
+
+
+@pytest.fixture
+def metrics():
+    return {
+        "exact_match": ExactMatchMetric(),
+        "f1": F1ScoreMetric(),
+    }
+
+
+@pytest.mark.parametrize(
+    "actual, expected, expected_exact_score, expected_f1_range",
+    [
+        ("Hello World", "Hello World", 1.0, (1.0, 1.0)),
+        ("Hello World", "hello world", 1.0, (1.0, 1.0)),
+        ("Hello World", "Hello  World", 0.0, (0.0, 1.0)),
+        (" Hello World ", "Hello World", 1.0, (1.0, 1.0)),
+        ("", "Hello World", 0.0, (0.0, 0.0)),
+        ("Hello World", "", 0.0, (0.0, 0.0)),
+        ("", "", 1.0, (1.0, 1.0)),
+        ("Hello World", "Goodbye World", 0.0, (0.0, 1.0)),
+        ("Hello", "Hello World", 0.0, (0.0, 1.0)),
+        ("Hello, World!", "hello, world!", 1.0, (1.0, 1.0)),
+        ("123", "123", 1.0, (1.0, 1.0)),
+        ("123", "456", 0.0, (0.0, 0.0)),
+        ("Café", "café", 1.0, (1.0, 1.0)),
+        ("Café", "Cafe", 0.0, (0.0, 0.0)),
+    ],
+)
+def test_metrics(metrics, actual, expected, expected_exact_score, expected_f1_range):
+    test_case = MockTestCase(actual, expected)
+
+    exact_match_score = metrics["exact_match"].measure(test_case)
+    assert exact_match_score == expected_exact_score, (
+        f"Exact match failed for '{actual}' vs '{expected}'"
+    )
+
+    f1_score = metrics["f1"].measure(test_case)
+    assert expected_f1_range[0] <= f1_score <= expected_f1_range[1], (
+        f"F1 score failed for '{actual}' vs '{expected}'"
+    )