fix: deepeval retry (#918)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->
- Implemented retries with exponential backoff when deepeval's metric evaluation fails
- Updated metric aggregation to skip None scores from failed evaluations

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Author: lxobr
Date: 2025-06-09 15:15:09 +02:00 (committed via GitHub)
Parent: 82c0279f45
Commit: 3da893c131
2 changed files with 46 additions and 17 deletions


```diff
@@ -38,17 +38,19 @@ def extract_metrics_and_details(
     for entry in data:
         for metric, values in entry["metrics"].items():
             score = values["score"]
-            metrics_data[metric].append(score)
-            if "reason" in values:
-                metric_details[metric].append(
-                    {
-                        "question": entry["question"],
-                        "answer": entry["answer"],
-                        "golden_answer": entry["golden_answer"],
-                        "reason": values["reason"],
-                        "score": score,
-                    }
-                )
+            # Skip None scores from failed evaluations
+            if score is not None:
+                metrics_data[metric].append(score)
+                if "reason" in values:
+                    metric_details[metric].append(
+                        {
+                            "question": entry["question"],
+                            "answer": entry["answer"],
+                            "golden_answer": entry["golden_answer"],
+                            "reason": values["reason"],
+                            "score": score,
+                        }
+                    )
     return metrics_data, metric_details
```
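
As a quick illustration of the aggregation change, here is a minimal, self-contained sketch of the skip-None behaviour (the sample entries and metric name are invented; the real function also collects per-question details):

```python
from collections import defaultdict

# Invented evaluation output: the second entry's score is None because
# every retry failed for that metric.
data = [
    {"question": "Q1", "answer": "A1", "golden_answer": "G1",
     "metrics": {"correctness": {"score": 0.9, "reason": "close to golden answer"}}},
    {"question": "Q2", "answer": "A2", "golden_answer": "G2",
     "metrics": {"correctness": {"score": None, "reason": None}}},
]

metrics_data = defaultdict(list)
for entry in data:
    for metric, values in entry["metrics"].items():
        score = values["score"]
        if score is not None:  # failed evaluations no longer pollute the aggregate
            metrics_data[metric].append(score)

print(dict(metrics_data))  # {'correctness': [0.9]}
```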


```diff
@@ -7,10 +7,15 @@ from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
 from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric
 from typing import Any, Dict, List
 from deepeval.metrics import ContextualRelevancyMetric
+import time
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
 
 
 class DeepEvalAdapter(BaseEvalAdapter):
     def __init__(self):
+        self.n_retries = 5
         self.g_eval_metrics = {
             "correctness": self.g_eval_correctness(),
             "EM": ExactMatchMetric(),
```
```diff
@@ -19,6 +24,33 @@ class DeepEvalAdapter(BaseEvalAdapter):
             "context_coverage": ContextCoverageMetric(),
         }
 
+    def _calculate_metric(self, metric: str, test_case: LLMTestCase) -> Dict[str, Any]:
+        """Calculate a single metric for a test case with retry logic."""
+        metric_to_calculate = self.g_eval_metrics[metric]
+
+        for attempt in range(self.n_retries):
+            try:
+                metric_to_calculate.measure(test_case)
+                return {
+                    "score": metric_to_calculate.score,
+                    "reason": metric_to_calculate.reason,
+                }
+            except Exception as e:
+                logger.warning(
+                    f"Attempt {attempt + 1}/{self.n_retries} failed for metric '{metric}': {e}"
+                )
+                if attempt < self.n_retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logger.error(
+                        f"All {self.n_retries} attempts failed for metric '{metric}'. Returning None values."
+                    )
+                    return {
+                        "score": None,
+                        "reason": None,
+                    }
+
     async def evaluate_answers(
         self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
     ) -> List[Dict[str, Any]]:
```
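
The retry loop added above is a standard retry-with-exponential-backoff pattern. As a standalone sketch (not part of this PR; the helper name and defaults are illustrative), the same idea looks like this:

```python
import logging
import time

logger = logging.getLogger(__name__)


def call_with_retries(fn, n_retries: int = 5):
    """Call fn() up to n_retries times, sleeping 2**attempt seconds between
    failures (1s, 2s, 4s, ...). Returns fn()'s result, or None if all attempts fail."""
    for attempt in range(n_retries):
        try:
            return fn()
        except Exception as e:
            logger.warning("Attempt %d/%d failed: %s", attempt + 1, n_retries, e)
            if attempt < n_retries - 1:
                time.sleep(2**attempt)  # back off before the next attempt
    logger.error("All %d attempts failed; returning None.", n_retries)
    return None
```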
```diff
@@ -40,12 +72,7 @@ class DeepEvalAdapter(BaseEvalAdapter):
             )
             metric_results = {}
             for metric in evaluator_metrics:
-                metric_to_calculate = self.g_eval_metrics[metric]
-                metric_to_calculate.measure(test_case)
-                metric_results[metric] = {
-                    "score": metric_to_calculate.score,
-                    "reason": metric_to_calculate.reason,
-                }
+                metric_results[metric] = self._calculate_metric(metric, test_case)
 
             results.append({**answer, "metrics": metric_results})
         return results
```
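
Putting the two changes together: when all retries for a metric fail, the corresponding entry in the adapter's output keeps the metric key but carries None values, and the aggregation in the first file then skips it. A hypothetical results entry might look like this (question, answers and scores are made up):

```python
# Hypothetical entry appended to `results` by evaluate_answers when "EM"
# succeeded but "correctness" failed on all retries (values are illustrative).
result_entry = {
    "question": "What does cognee do?",
    "answer": "It builds knowledge graphs from your data.",
    "golden_answer": "Cognee turns raw data into a knowledge graph.",
    "metrics": {
        "EM": {"score": 0.0, "reason": "not an exact match"},
        "correctness": {"score": None, "reason": None},  # all retries failed
    },
}
```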