<!-- .github/pull_request_template.md -->

This PR adds the evaluation framework development for cognee.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Expanded evaluation framework now integrates asynchronous corpus building, question answering, and performance evaluation with adaptive benchmarks for improved metrics (correctness, exact match, and F1 score).
- **Infrastructure**
  - Added database integration for persistent storage of questions, answers, and metrics.
  - Launched an interactive metrics dashboard featuring advanced visualizations.
  - Introduced an automated testing workflow for continuous quality assurance.
- **Documentation**
  - Updated guidelines for generating concise, clear answers.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
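The release notes list exact match and F1 score among the evaluation metrics. The PR's own `ExactMatchMetric` and `F1ScoreMetric` implementations are not visible in this view, so the snippet below is only a minimal, self-contained sketch of the standard QA definitions such metrics usually follow (normalized string equality, and token-overlap F1); the function names and the normalization rule are illustrative assumptions, not cognee code.

```python
# Illustrative sketch only: the usual exact-match and token-level F1 formulas
# for QA evaluation. NOT the repository's ExactMatchMetric / F1ScoreMetric.
from collections import Counter


def normalize(text: str) -> str:
    # Assumed normalization: lowercase and collapse whitespace.
    return " ".join(text.lower().split())


def exact_match(prediction: str, golden: str) -> float:
    return float(normalize(prediction) == normalize(golden))


def f1_score(prediction: str, golden: str) -> float:
    pred_tokens = normalize(prediction).split()
    gold_tokens = normalize(golden).split()
    if not pred_tokens or not gold_tokens:
        return float(pred_tokens == gold_tokens)
    common = Counter(pred_tokens) & Counter(gold_tokens)
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
```

Exact match is all-or-nothing, while F1 gives partial credit for overlapping tokens, which is why the two are typically reported side by side.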
The PR includes the following evaluation adapter (Python, 60 lines, 2.5 KiB):
```python
from typing import Any, Dict, List

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric


class DeepEvalAdapter(BaseEvalAdapter):
    def __init__(self):
        # Registry of supported metrics, keyed by the names callers pass in.
        self.g_eval_metrics = {
            "correctness": self.g_eval_correctness(),
            "EM": ExactMatchMetric(),
            "f1": F1ScoreMetric(),
        }

    async def evaluate_answers(
        self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
    ) -> List[Dict[str, Any]]:
        # evaluator_metrics lists the metrics to evaluate dynamically;
        # reject unknown metric names before doing any work.
        for metric in evaluator_metrics:
            if metric not in self.g_eval_metrics:
                raise ValueError(f"Unsupported metric: {metric}")

        results = []
        for answer in answers:
            test_case = LLMTestCase(
                input=answer["question"],
                actual_output=answer["answer"],
                expected_output=answer["golden_answer"],
            )
            metric_results = {}
            for metric in evaluator_metrics:
                metric_to_calculate = self.g_eval_metrics[metric]
                metric_to_calculate.measure(test_case)
                metric_results[metric] = {
                    "score": metric_to_calculate.score,
                    "reason": metric_to_calculate.reason,
                }
            results.append({**answer, "metrics": metric_results})

        return results

    def g_eval_correctness(self):
        return GEval(
            name="Correctness",
            criteria="Determine whether the actual output is factually correct based on the expected output.",
            model=EvalConfig().to_dict()["deepeval_model"],
            evaluation_steps=[
                "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
                "You should also heavily penalize omission of detail",
                "Vague language, or contradicting OPINIONS, are OK",
            ],
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
            ],
        )
```
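For reviewers who want to try the adapter locally, here is a hedged usage sketch based only on the code above: `evaluate_answers` is async, expects each answer dict to carry `question`, `answer`, and `golden_answer` keys, and accepts metric names that must be keys of `g_eval_metrics` (`"correctness"`, `"EM"`, `"f1"`). The module path in the import and the sample data are assumptions for illustration, not taken from the PR.

```python
import asyncio

# Module path assumed from the package layout implied by the imports above.
from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter

# Illustrative input: each dict must provide "question", "answer", and
# "golden_answer", matching the keys read in evaluate_answers().
answers = [
    {
        "question": "Which metrics does the evaluation framework report?",
        "answer": "Correctness, exact match, and F1 score.",
        "golden_answer": "Correctness, exact match, and F1 score.",
    }
]


async def main():
    adapter = DeepEvalAdapter()
    # Metric names must be keys of adapter.g_eval_metrics.
    results = await adapter.evaluate_answers(answers, evaluator_metrics=["EM", "f1"])
    for result in results:
        print(result["metrics"])


if __name__ == "__main__":
    asyncio.run(main())
```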