From 3da893c131264d8a32d75fc8f07823170126287e Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Mon, 9 Jun 2025 15:15:09 +0200
Subject: [PATCH] fix: deepeval retry (#918)

## Description

- Implemented retries when deepeval's evaluation fails
- Updated metric aggregation to ignore Nones

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
---
 .../analysis/metrics_calculator.py           | 24 ++++++------
 .../evaluation/deep_eval_adapter.py          | 39 ++++++++++++++++---
 2 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/cognee/eval_framework/analysis/metrics_calculator.py b/cognee/eval_framework/analysis/metrics_calculator.py
index ca1960748..aae7d5684 100644
--- a/cognee/eval_framework/analysis/metrics_calculator.py
+++ b/cognee/eval_framework/analysis/metrics_calculator.py
@@ -38,17 +38,19 @@ def extract_metrics_and_details(
     for entry in data:
         for metric, values in entry["metrics"].items():
             score = values["score"]
-            metrics_data[metric].append(score)
-            if "reason" in values:
-                metric_details[metric].append(
-                    {
-                        "question": entry["question"],
-                        "answer": entry["answer"],
-                        "golden_answer": entry["golden_answer"],
-                        "reason": values["reason"],
-                        "score": score,
-                    }
-                )
+            # Skip None scores from failed evaluations
+            if score is not None:
+                metrics_data[metric].append(score)
+                if "reason" in values:
+                    metric_details[metric].append(
+                        {
+                            "question": entry["question"],
+                            "answer": entry["answer"],
+                            "golden_answer": entry["golden_answer"],
+                            "reason": values["reason"],
+                            "score": score,
+                        }
+                    )
 
     return metrics_data, metric_details
 
diff --git a/cognee/eval_framework/evaluation/deep_eval_adapter.py b/cognee/eval_framework/evaluation/deep_eval_adapter.py
index ab727479e..540cc0632 100644
--- a/cognee/eval_framework/evaluation/deep_eval_adapter.py
+++ b/cognee/eval_framework/evaluation/deep_eval_adapter.py
@@ -7,10 +7,15 @@ from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
 from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric
 from typing import Any, Dict, List
 from deepeval.metrics import ContextualRelevancyMetric
+import time
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
 
 
 class DeepEvalAdapter(BaseEvalAdapter):
     def __init__(self):
+        self.n_retries = 5
         self.g_eval_metrics = {
             "correctness": self.g_eval_correctness(),
             "EM": ExactMatchMetric(),
@@ -19,6 +24,33 @@ class DeepEvalAdapter(BaseEvalAdapter):
             "context_coverage": ContextCoverageMetric(),
         }
 
+    def _calculate_metric(self, metric: str, test_case: LLMTestCase) -> Dict[str, Any]:
+        """Calculate a single metric for a test case with retry logic."""
+        metric_to_calculate = self.g_eval_metrics[metric]
+
+        for attempt in range(self.n_retries):
+            try:
+                metric_to_calculate.measure(test_case)
+                return {
+                    "score": metric_to_calculate.score,
+                    "reason": metric_to_calculate.reason,
+                }
+            except Exception as e:
+                logger.warning(
+                    f"Attempt {attempt + 1}/{self.n_retries} failed for metric '{metric}': {e}"
+                )
+                if attempt < self.n_retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logger.error(
+                        f"All {self.n_retries} attempts failed for metric '{metric}'. Returning None values."
+                    )
+
+        return {
+            "score": None,
+            "reason": None,
+        }
+
     async def evaluate_answers(
         self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
     ) -> List[Dict[str, Any]]:
@@ -40,12 +72,7 @@ class DeepEvalAdapter(BaseEvalAdapter):
             )
             metric_results = {}
             for metric in evaluator_metrics:
-                metric_to_calculate = self.g_eval_metrics[metric]
-                metric_to_calculate.measure(test_case)
-                metric_results[metric] = {
-                    "score": metric_to_calculate.score,
-                    "reason": metric_to_calculate.reason,
-                }
+                metric_results[metric] = self._calculate_metric(metric, test_case)
             results.append({**answer, "metrics": metric_results})
 
         return results
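
---

Reviewer note: below is a minimal, self-contained sketch (not part of the patch; the question, answer, and scores are hypothetical sample data) of how a metric that exhausts all retries surfaces as `None` in the adapter output and is then skipped by the aggregation step, mirroring the guard added to `extract_metrics_and_details`:

```python
# Illustrative only: hypothetical sample data mimicking the post-patch behavior.
from collections import defaultdict

answers = [
    {
        "question": "What is the capital of France?",
        "answer": "Paris",
        "golden_answer": "Paris",
        "metrics": {
            "EM": {"score": 1.0, "reason": "Exact match"},
            # "correctness" exhausted all retries, so the adapter returned None values
            "correctness": {"score": None, "reason": None},
        },
    },
]

metrics_data = defaultdict(list)
for entry in answers:
    for metric, values in entry["metrics"].items():
        # Same guard as in extract_metrics_and_details: skip scores from failed evaluations
        if values["score"] is not None:
            metrics_data[metric].append(values["score"])

# "correctness" contributes no scores, so only "EM" is aggregated
print({metric: sum(scores) / len(scores) for metric, scores in metrics_data.items()})
# {'EM': 1.0}
```

The failed metric is simply excluded from the aggregate rather than dragging it toward zero, which is the intent of skipping `None` scores.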