From ca2cbfab918d80b49dc15739b8b1d1e6e22a827f Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Sat, 1 Mar 2025 19:50:20 +0100
Subject: [PATCH] feat: add direct llm eval adapter (#591)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

• Created DirectLLMEvalAdapter - a lightweight alternative to DeepEval for answer evaluation
• Added evaluation prompt files defining the scoring criteria and output format
• Made the adapter selectable via evaluation_engine = "DirectLLM" in the config; it supports only the "correctness" metric

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced a new evaluation method that compares model responses against a reference answer using structured prompt templates. This approach enables automated scoring (ranging from 0 to 1) along with brief justifications.

- **Enhancements**
  - Updated the configuration to clearly distinguish between evaluation options, providing end-users with a more transparent and reliable assessment process.
---
 .../llm/prompts/direct_llm_eval_prompt.txt    |  3 +
 .../llm/prompts/direct_llm_eval_system.txt    | 10 ++++
 evals/eval_framework/eval_config.py           | 12 +++-
 .../evaluation/direct_llm_eval_adapter.py     | 59 +++++++++++++++++++
 .../evaluation/evaluator_adapters.py          |  2 +
 5 files changed, 84 insertions(+), 2 deletions(-)
 create mode 100644 cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt
 create mode 100644 cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt
 create mode 100644 evals/eval_framework/evaluation/direct_llm_eval_adapter.py

diff --git a/cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt b/cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt
new file mode 100644
index 000000000..0571129a2
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt
@@ -0,0 +1,3 @@
+Question: {{question}}
+Provided Answer: {{answer}}
+Golden Answer: {{golden_answer}}
diff --git a/cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt b/cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt
new file mode 100644
index 000000000..d1c8db56c
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt
@@ -0,0 +1,10 @@
+You are helping a reasonable person evaluate and score answers.
+• Compare the provided answer to the golden answer based on common-sense meaning and understanding.
+• Focus on the meaning, not the exact wording or structure.
+• If the answer is correct, don't penalize it for being too short or too long.
+• Extra details are fine as long as the correct answer is included.
+• Score should be between 0 and 1.
+
+Provide:
+1. A numerical score
+2. A brief explanation justifying the score
diff --git a/evals/eval_framework/eval_config.py b/evals/eval_framework/eval_config.py
index 1b70ccff5..f1d65341a 100644
--- a/evals/eval_framework/eval_config.py
+++ b/evals/eval_framework/eval_config.py
@@ -18,8 +18,12 @@ class EvalConfig(BaseSettings):
 
     # Evaluation params
     evaluating_answers: bool = True
-    evaluation_engine: str = "DeepEval"
-    evaluation_metrics: List[str] = ["correctness", "EM", "f1"]
+    evaluation_engine: str = "DeepEval"  # Options: 'DeepEval' (uses deepeval_model), 'DirectLLM' (uses default llm from .env)
+    evaluation_metrics: List[str] = [
+        "correctness",
+        "EM",
+        "f1",
+    ]  # Use only 'correctness' for DirectLLM
     deepeval_model: str = "gpt-4o-mini"
 
     # Visualization
@@ -30,6 +34,8 @@ class EvalConfig(BaseSettings):
     answers_path: str = "answers_output.json"
     metrics_path: str = "metrics_output.json"
     dashboard_path: str = "dashboard.html"
+    direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
+    direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
 
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
@@ -50,6 +56,8 @@ class EvalConfig(BaseSettings):
             "dashboard_path": self.dashboard_path,
             "deepeval_model": self.deepeval_model,
             "task_getter_type": self.task_getter_type,
+            "direct_llm_system_prompt": self.direct_llm_system_prompt,
+            "direct_llm_eval_prompt": self.direct_llm_eval_prompt,
         }
 
 
diff --git a/evals/eval_framework/evaluation/direct_llm_eval_adapter.py b/evals/eval_framework/evaluation/direct_llm_eval_adapter.py
new file mode 100644
index 000000000..b911f88b0
--- /dev/null
+++ b/evals/eval_framework/evaluation/direct_llm_eval_adapter.py
@@ -0,0 +1,59 @@
+from typing import Any, Dict, List
+from pydantic import BaseModel
+from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
+from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
+from evals.eval_framework.eval_config import EvalConfig
+
+
+class CorrectnessEvaluation(BaseModel):
+    """Response model containing evaluation score and explanation."""
+
+    score: float
+    explanation: str
+
+
+class DirectLLMEvalAdapter(BaseEvalAdapter):
+    def __init__(self):
+        """Initialize adapter with prompt paths from config."""
+        config = EvalConfig()
+        self.system_prompt_path = config.direct_llm_system_prompt
+        self.eval_prompt_path = config.direct_llm_eval_prompt
+        self.llm_client = get_llm_client()
+
+    async def evaluate_correctness(
+        self, question: str, answer: str, golden_answer: str
+    ) -> Dict[str, Any]:
+        args = {"question": question, "answer": answer, "golden_answer": golden_answer}
+
+        user_prompt = render_prompt(self.eval_prompt_path, args)
+        system_prompt = read_query_prompt(self.system_prompt_path)
+
+        evaluation = await self.llm_client.acreate_structured_output(
+            text_input=user_prompt,
+            system_prompt=system_prompt,
+            response_model=CorrectnessEvaluation,
+        )
+
+        return {"score": evaluation.score, "reason": evaluation.explanation}
+
+    async def evaluate_answers(
+        self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
+    ) -> List[Dict[str, Any]]:
+        """Evaluate a list of answers using specified metrics."""
+        if not answers or not evaluator_metrics:
+            return []
+
+        if "correctness" not in evaluator_metrics:
+            return [{"metrics": {}, **answer} for answer in answers]
+
+        results = []
+        for answer in answers:
+            correctness = await self.evaluate_correctness(
+                question=answer["question"],
+                answer=answer["answer"],
+                golden_answer=answer["golden_answer"],
+            )
+            results.append({**answer, "metrics": {"correctness": correctness}})
+
+        return results
diff --git a/evals/eval_framework/evaluation/evaluator_adapters.py b/evals/eval_framework/evaluation/evaluator_adapters.py
index 1b78dee28..28b5462aa 100644
--- a/evals/eval_framework/evaluation/evaluator_adapters.py
+++ b/evals/eval_framework/evaluation/evaluator_adapters.py
@@ -1,10 +1,12 @@
 from enum import Enum
 from typing import Type
 from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter
 
 
 class EvaluatorAdapter(Enum):
     DEEPEVAL = ("DeepEval", DeepEvalAdapter)
+    DIRECT_LLM = ("DirectLLM", DirectLLMEvalAdapter)
 
     def __new__(cls, adapter_name: str, adapter_class: Type):
         obj = object.__new__(cls)
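
## Usage sketch (reviewer note, not part of the patch)

A minimal sketch of how the new adapter might be driven directly, assuming the repository layout above and LLM credentials in `.env` (the `DirectLLMEvalAdapter.__init__` in this patch calls `get_llm_client()` immediately). The sample `answers` payload and the `asyncio` driver below are illustrative only, not taken from the codebase.

```python
# Hypothetical driver script; assumes it runs from a repo checkout with .env configured.
import asyncio

from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter


async def main() -> None:
    adapter = DirectLLMEvalAdapter()

    # Illustrative payload; the keys mirror what evaluate_answers() reads.
    answers = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris is the capital of France.",
            "golden_answer": "Paris",
        }
    ]

    # Only "correctness" is scored; if it is missing from the metrics list,
    # the adapter returns the answers with empty "metrics" dicts.
    results = await adapter.evaluate_answers(answers, ["correctness"])
    for result in results:
        correctness = result["metrics"]["correctness"]
        print(correctness["score"], correctness["reason"])


if __name__ == "__main__":
    asyncio.run(main())
```

When going through the evaluation framework instead, the same path is reached by setting `evaluation_engine = "DirectLLM"` and `evaluation_metrics = ["correctness"]` in `EvalConfig` (or the corresponding `.env` entries), which resolves to `DirectLLMEvalAdapter` via the `DIRECT_LLM` entry added to `EvaluatorAdapter`.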