feat: add direct llm eval adapter (#591)
## Description

- Created `DirectLLMEvalAdapter`, a lightweight alternative to DeepEval for answer evaluation
- Added evaluation prompt files defining the scoring criteria and output format
- Made the adapter selectable via `evaluation_engine = "DirectLLM"` in the config; it supports the "correctness" metric only

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced a new evaluation method that compares model responses against a reference answer using structured prompt templates, enabling automated scoring (from 0 to 1) along with brief justifications.
- **Enhancements**
  - Updated the configuration to clearly distinguish between evaluation options, giving end users a more transparent and reliable assessment process.
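As a quick orientation before the file-by-file diff: switching to the new adapter only requires the config change described above. A minimal sketch (the field values are taken from the `EvalConfig` diff below; nothing else is assumed):

```python
from evals.eval_framework.eval_config import EvalConfig

# Select the direct LLM evaluator; per the config comments below, it uses the
# default LLM from .env (not deepeval_model) and supports only "correctness".
config = EvalConfig(
    evaluation_engine="DirectLLM",
    evaluation_metrics=["correctness"],
)

print(config.evaluation_engine)       # DirectLLM
print(config.direct_llm_eval_prompt)  # direct_llm_eval_prompt.txt
```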
Commit ca2cbfab91 (parent c496bb485c): 5 changed files with 84 additions and 2 deletions.
cognee/infrastructure/llm/prompts/direct_llm_eval_prompt.txt (new file, 3 lines)

@@ -0,0 +1,3 @@
+Question: {{question}}
+Provided Answer: {{answer}}
+Golden Answer: {{golden_answer}}
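To make the template concrete: the adapter introduced later in this PR renders it with `render_prompt`, substituting the `{{...}}` placeholders. A sketch with made-up values (the question and answers are purely illustrative):

```python
from cognee.infrastructure.llm.prompts import render_prompt

# Hypothetical inputs; real values come from the answers passed to the adapter.
args = {
    "question": "What is the capital of France?",
    "answer": "The capital city of France is Paris.",
    "golden_answer": "Paris",
}

user_prompt = render_prompt("direct_llm_eval_prompt.txt", args)
# Assuming straightforward placeholder substitution, user_prompt becomes:
#   Question: What is the capital of France?
#   Provided Answer: The capital city of France is Paris.
#   Golden Answer: Paris
```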
cognee/infrastructure/llm/prompts/direct_llm_eval_system.txt (new file, 10 lines)

@@ -0,0 +1,10 @@
+You are helping a reasonable person evaluate and score answers
+• Compare the provided answer to the golden answer based on common-sense meaning and understanding.
+• Focus on the meaning, not the exact wording or structure.
+• If the answer is correct, don't penalize it for being too short or too long.
+• Extra details are fine as long as the correct answer is included.
+• Score should be between 0 and 1.
+
+Provide:
+1. A numerical score
+2. A brief explanation justifying the score
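The adapter pairs this system prompt with a structured-output call, parsing the reply into the `CorrectnessEvaluation` model defined further down in this diff and returning it as a score/reason pair. An illustrative (not actual) result for the example above:

```python
from pydantic import BaseModel


class CorrectnessEvaluation(BaseModel):
    """Local copy of the response model added in direct_llm_eval_adapter.py below."""

    score: float
    explanation: str


# Illustrative only: the real score and wording depend on the evaluating LLM.
evaluation = CorrectnessEvaluation(
    score=1.0,
    explanation="The provided answer names Paris, which matches the golden answer.",
)
print({"score": evaluation.score, "reason": evaluation.explanation})
```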
evals/eval_framework/eval_config.py

@@ -18,8 +18,12 @@ class EvalConfig(BaseSettings):

     # Evaluation params
     evaluating_answers: bool = True
-    evaluation_engine: str = "DeepEval"
-    evaluation_metrics: List[str] = ["correctness", "EM", "f1"]
+    evaluation_engine: str = "DeepEval"  # Options: 'DeepEval' (uses deepeval_model), 'DirectLLM' (uses default llm from .env)
+    evaluation_metrics: List[str] = [
+        "correctness",
+        "EM",
+        "f1",
+    ]  # Use only 'correctness' for DirectLLM
     deepeval_model: str = "gpt-4o-mini"

     # Visualization
@@ -30,6 +34,8 @@ class EvalConfig(BaseSettings):
     answers_path: str = "answers_output.json"
     metrics_path: str = "metrics_output.json"
     dashboard_path: str = "dashboard.html"
+    direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
+    direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"

     model_config = SettingsConfigDict(env_file=".env", extra="allow")
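Because `EvalConfig` is a pydantic `BaseSettings` with `env_file=".env"`, these fields can also be set through the environment instead of code. A sketch under the assumption that no `env_prefix` is configured, so field names map directly to upper-case variable names and list-valued fields are given as JSON:

```python
import os

from evals.eval_framework.eval_config import EvalConfig

# Assumed env-var names; pydantic-settings matches them to the fields above.
os.environ["EVALUATION_ENGINE"] = "DirectLLM"
os.environ["EVALUATION_METRICS"] = '["correctness"]'  # complex fields are parsed as JSON

config = EvalConfig()
print(config.evaluation_engine)   # DirectLLM
print(config.evaluation_metrics)  # ['correctness']
```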
@@ -50,6 +56,8 @@ class EvalConfig(BaseSettings):
             "dashboard_path": self.dashboard_path,
             "deepeval_model": self.deepeval_model,
             "task_getter_type": self.task_getter_type,
+            "direct_llm_system_prompt": self.direct_llm_system_prompt,
+            "direct_llm_eval_prompt": self.direct_llm_eval_prompt,
         }
evals/eval_framework/evaluation/direct_llm_eval_adapter.py (new file, 59 lines)

@@ -0,0 +1,59 @@
+from typing import Any, Dict, List
+from pydantic import BaseModel
+from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
+from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
+from evals.eval_framework.eval_config import EvalConfig
+
+
+class CorrectnessEvaluation(BaseModel):
+    """Response model containing evaluation score and explanation."""
+
+    score: float
+    explanation: str
+
+
+class DirectLLMEvalAdapter(BaseEvalAdapter):
+    def __init__(self):
+        """Initialize adapter with prompt paths from config."""
+        config = EvalConfig()
+        self.system_prompt_path = config.direct_llm_system_prompt
+        self.eval_prompt_path = config.direct_llm_eval_prompt
+        self.llm_client = get_llm_client()
+
+    async def evaluate_correctness(
+        self, question: str, answer: str, golden_answer: str
+    ) -> Dict[str, Any]:
+        args = {"question": question, "answer": answer, "golden_answer": golden_answer}
+
+        user_prompt = render_prompt(self.eval_prompt_path, args)
+        system_prompt = read_query_prompt(self.system_prompt_path)
+
+        evaluation = await self.llm_client.acreate_structured_output(
+            text_input=user_prompt,
+            system_prompt=system_prompt,
+            response_model=CorrectnessEvaluation,
+        )
+
+        return {"score": evaluation.score, "reason": evaluation.explanation}
+
+    async def evaluate_answers(
+        self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
+    ) -> List[Dict[str, Any]]:
+        """Evaluate a list of answers using specified metrics."""
+        if not answers or not evaluator_metrics:
+            return []
+
+        if "correctness" not in evaluator_metrics:
+            return [{"metrics": {}, **answer} for answer in answers]
+
+        results = []
+        for answer in answers:
+            correctness = await self.evaluate_correctness(
+                question=answer["question"],
+                answer=answer["answer"],
+                golden_answer=answer["golden_answer"],
+            )
+            results.append({**answer, "metrics": {"correctness": correctness}})
+
+        return results
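A usage sketch for the new adapter (the sample answer entry and the asyncio wrapper are illustrative, not part of the PR):

```python
import asyncio

from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter


async def main():
    adapter = DirectLLMEvalAdapter()

    # Each entry must provide question, answer, and golden_answer keys.
    answers = [
        {
            "question": "What is the capital of France?",
            "answer": "The capital city of France is Paris.",
            "golden_answer": "Paris",
        }
    ]

    results = await adapter.evaluate_answers(answers, evaluator_metrics=["correctness"])
    # Each result keeps the original fields and gains
    # {"metrics": {"correctness": {"score": <float>, "reason": <str>}}}.
    print(results[0]["metrics"]["correctness"])


if __name__ == "__main__":
    asyncio.run(main())
```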
@@ -1,10 +1,12 @@
 from enum import Enum
 from typing import Type
 from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter


 class EvaluatorAdapter(Enum):
     DEEPEVAL = ("DeepEval", DeepEvalAdapter)
+    DIRECT_LLM = ("DirectLLM", DirectLLMEvalAdapter)

     def __new__(cls, adapter_name: str, adapter_class: Type):
         obj = object.__new__(cls)
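The final hunk (the evaluator adapter registry; its file path is not visible in this excerpt) is cut off right after `obj = object.__new__(cls)`. The repository's exact implementation isn't shown, but this name-plus-class enum pattern is typically completed along the following lines; the sketch uses a stand-in adapter class so it runs on its own:

```python
from enum import Enum
from typing import Type


class _StubAdapter:
    """Stand-in for DeepEvalAdapter / DirectLLMEvalAdapter in this sketch."""


class EvaluatorAdapter(Enum):
    DEEPEVAL = ("DeepEval", _StubAdapter)
    DIRECT_LLM = ("DirectLLM", _StubAdapter)

    def __new__(cls, adapter_name: str, adapter_class: Type):
        obj = object.__new__(cls)
        # Use the string name as the enum value so EvaluatorAdapter("DirectLLM")
        # resolves a config string, and keep the adapter class for instantiation.
        obj._value_ = adapter_name
        obj.adapter_class = adapter_class
        return obj


# Resolving the config string to an adapter instance:
adapter = EvaluatorAdapter("DirectLLM").adapter_class()
print(type(adapter).__name__)  # _StubAdapter
```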