# cognee/evals/deepeval_metrics.py
from deepeval.metrics import BaseMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from evals.official_hotpot_metrics import exact_match_score, f1_score

correctness_metric = GEval(
    name="Correctness",
    model="gpt-4o-mini",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    evaluation_steps=[
        "Determine whether the actual output is factually correct based on the expected output."
    ],
)
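
# Note: GEval is deepeval's LLM-as-a-judge metric. Calling
# correctness_metric.measure(test_case) should prompt the configured model
# ("gpt-4o-mini") with the evaluation steps above and store a 0-1 score and a
# pass/fail flag on the metric instance (this requires OpenAI credentials).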


class f1_score_metric(BaseMetric):
    """F1 score taken directly from the official hotpot benchmark
    implementation and wrapped into a deepeval metric."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        f1, precision, recall = f1_score(
            prediction=test_case.actual_output,
            ground_truth=test_case.expected_output,
        )
        self.score = f1
        self.success = self.score >= self.threshold
        return self.score

    # Reusing regular measure as async F1 score is not implemented
    async def a_measure(self, test_case: LLMTestCase):
        return self.measure(test_case)

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Official hotpot F1 score"


class em_score_metric(BaseMetric):
    """Exact Match score taken directly from the official hotpot benchmark
    implementation and wrapped into a deepeval metric."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        self.score = exact_match_score(
            prediction=test_case.actual_output,
            ground_truth=test_case.expected_output,
        )
        self.success = self.score >= self.threshold
        return self.score

    # Reusing regular measure as async EM score is not implemented
    async def a_measure(self, test_case: LLMTestCase):
        return self.measure(test_case)

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Official hotpot EM score"