From ca2cbfab918d80b49dc15739b8b1d1e6e22a827f Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Sat, 1 Mar 2025 19:50:20 +0100
Subject: [PATCH] feat: add direct llm eval adapter (#591)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

• Created DirectLLMEvalAdapter - a lightweight alternative to DeepEval for answer evaluation
• Added evaluation prompt files defining the scoring criteria and output format
• Made the adapter selectable via evaluation_engine = "DirectLLM" in the config; it supports only the "correctness" metric

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced a new evaluation method that compares model responses against a reference answer using structured prompt templates. This approach enables automated scoring (ranging from 0 to 1) along with brief justifications.

- **Enhancements**
  - Updated the configuration to clearly distinguish between evaluation options, providing end-users with a more transparent and reliable assessment process.
---
 .../llm/prompts/direct_llm_eval_prompt.txt    |  3 +
 .../llm/prompts/direct_llm_eval_system.txt    | 10 ++++
 evals/eval_framework/eval_config.py           | 12 +++-
 .../evaluation/direct_llm_eval_adapter.py     | 59 +++++++++++++++++++
 .../evaluation/evaluator_adapters.py          |  2 +
 5 files changed, 84 insertions(+), 2 deletions(-)
 create mode 100644 cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt
 create mode 100644 cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt
 create mode 100644 evals/eval_framework/evaluation/direct_llm_eval_adapter.py

diff --git a/cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt b/cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt
new file mode 100644
index 000000000..0571129a2
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt
@@ -0,0 +1,3 @@
+Question: {{question}}
+Provided Answer: {{answer}}
+Golden Answer: {{golden_answer}}
diff --git a/cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt b/cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt
new file mode 100644
index 000000000..d1c8db56c
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt
@@ -0,0 +1,10 @@
+You are helping a reasonable person evaluate and score answers.
+• Compare the provided answer to the golden answer based on common-sense meaning and understanding.
+• Focus on the meaning, not the exact wording or structure.
+• If the answer is correct, don't penalize it for being too short or too long.
+• Extra details are fine as long as the correct answer is included.
+• Score should be between 0 and 1.
+
+Provide:
+1. A numerical score
+2. A brief explanation justifying the score
diff --git a/evals/eval_framework/eval_config.py b/evals/eval_framework/eval_config.py
index 1b70ccff5..f1d65341a 100644
--- a/evals/eval_framework/eval_config.py
+++ b/evals/eval_framework/eval_config.py
@@ -18,8 +18,12 @@ class EvalConfig(BaseSettings):
 
     # Evaluation params
     evaluating_answers: bool = True
-    evaluation_engine: str = "DeepEval"
-    evaluation_metrics: List[str] = ["correctness", "EM", "f1"]
+    evaluation_engine: str = "DeepEval"  # Options: 'DeepEval' (uses deepeval_model), 'DirectLLM' (uses default llm from .env)
+    evaluation_metrics: List[str] = [
+        "correctness",
+        "EM",
+        "f1",
+    ]  # Use only 'correctness' for DirectLLM
     deepeval_model: str = "gpt-4o-mini"
 
     # Visualization
@@ -30,6 +34,8 @@ class EvalConfig(BaseSettings):
     answers_path: str = "answers_output.json"
     metrics_path: str = "metrics_output.json"
     dashboard_path: str = "dashboard.html"
+    direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
+    direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
 
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
@@ -50,6 +56,8 @@ class EvalConfig(BaseSettings):
             "dashboard_path": self.dashboard_path,
             "deepeval_model": self.deepeval_model,
             "task_getter_type": self.task_getter_type,
+            "direct_llm_system_prompt": self.direct_llm_system_prompt,
+            "direct_llm_eval_prompt": self.direct_llm_eval_prompt,
         }
 
 
diff --git a/evals/eval_framework/evaluation/direct_llm_eval_adapter.py b/evals/eval_framework/evaluation/direct_llm_eval_adapter.py
new file mode 100644
index 000000000..b911f88b0
--- /dev/null
+++ b/evals/eval_framework/evaluation/direct_llm_eval_adapter.py
@@ -0,0 +1,59 @@
+from typing import Any, Dict, List
+from pydantic import BaseModel
+from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
+from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
+from evals.eval_framework.eval_config import EvalConfig
+
+
+class CorrectnessEvaluation(BaseModel):
+    """Response model containing evaluation score and explanation."""
+
+    score: float
+    explanation: str
+
+
+class DirectLLMEvalAdapter(BaseEvalAdapter):
+    def __init__(self):
+        """Initialize adapter with prompt paths from config."""
+        config = EvalConfig()
+        self.system_prompt_path = config.direct_llm_system_prompt
+        self.eval_prompt_path = config.direct_llm_eval_prompt
+        self.llm_client = get_llm_client()
+
+    async def evaluate_correctness(
+        self, question: str, answer: str, golden_answer: str
+    ) -> Dict[str, Any]:
+        args = {"question": question, "answer": answer, "golden_answer": golden_answer}
+
+        user_prompt = render_prompt(self.eval_prompt_path, args)
+        system_prompt = read_query_prompt(self.system_prompt_path)
+
+        evaluation = await self.llm_client.acreate_structured_output(
+            text_input=user_prompt,
+            system_prompt=system_prompt,
+            response_model=CorrectnessEvaluation,
+        )
+
+        return {"score": evaluation.score, "reason": evaluation.explanation}
+
+    async def evaluate_answers(
+        self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
+    ) -> List[Dict[str, Any]]:
+        """Evaluate a list of answers using specified metrics."""
+        if not answers or not evaluator_metrics:
+            return []
+
+        if "correctness" not in evaluator_metrics:
+            return [{"metrics": {}, **answer} for answer in answers]
+
+        results = []
+        for answer in answers:
+            correctness = await self.evaluate_correctness(
+                question=answer["question"],
+                answer=answer["answer"],
+                golden_answer=answer["golden_answer"],
+            )
+            results.append({**answer, "metrics": {"correctness": correctness}})
+
+        return results
diff --git a/evals/eval_framework/evaluation/evaluator_adapters.py b/evals/eval_framework/evaluation/evaluator_adapters.py
index 1b78dee28..28b5462aa 100644
--- a/evals/eval_framework/evaluation/evaluator_adapters.py
+++ b/evals/eval_framework/evaluation/evaluator_adapters.py
@@ -1,10 +1,12 @@
 from enum import Enum
 from typing import Type
 from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter
 
 
 class EvaluatorAdapter(Enum):
     DEEPEVAL = ("DeepEval", DeepEvalAdapter)
+    DIRECT_LLM = ("DirectLLM", DirectLLMEvalAdapter)
 
     def __new__(cls, adapter_name: str, adapter_class: Type):
         obj = object.__new__(cls)
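
## Usage sketch (reviewer note, not part of the patch)

A minimal sketch of how the new adapter might be driven directly, assuming the repository layout above and LLM credentials in `.env` (the `DirectLLMEvalAdapter.__init__` in this patch calls `get_llm_client()` immediately). The sample `answers` payload and the `asyncio` driver below are illustrative only, not taken from the codebase.

```python
# Hypothetical driver script; assumes it runs from a repo checkout with .env configured.
import asyncio

from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter


async def main() -> None:
    adapter = DirectLLMEvalAdapter()

    # Illustrative payload; the keys mirror what evaluate_answers() reads.
    answers = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris is the capital of France.",
            "golden_answer": "Paris",
        }
    ]

    # Only "correctness" is scored; if it is missing from the metrics list,
    # the adapter returns the answers with empty "metrics" dicts.
    results = await adapter.evaluate_answers(answers, ["correctness"])
    for result in results:
        correctness = result["metrics"]["correctness"]
        print(correctness["score"], correctness["reason"])


if __name__ == "__main__":
    asyncio.run(main())
```

When going through the evaluation framework instead, the same path is reached by setting `evaluation_engine = "DirectLLM"` and `evaluation_metrics = ["correctness"]` in `EvalConfig` (or the corresponding `.env` entries), which resolves to `DirectLLMEvalAdapter` via the `DIRECT_LLM` entry added to `EvaluatorAdapter`.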