## Description

- Created `DirectLLMEvalAdapter`, a lightweight alternative to DeepEval for answer evaluation.
- Added evaluation prompt files defining the scoring criteria and output format.
- Made the adapter selectable via `evaluation_engine = "DirectLLM"` in the config; it currently supports only the "correctness" metric.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced a new evaluation method that compares model responses against a reference answer using structured prompt templates, enabling automated scoring (from 0 to 1) along with brief justifications.
- **Enhancements**
  - Updated the configuration to clearly distinguish between evaluation options, giving end users a more transparent and reliable assessment process.
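For context, a minimal sketch of how the new engine might be selected, assuming `EvalConfig` is a Pydantic settings model that accepts field overrides as keyword arguments (only the `evaluation_engine = "DirectLLM"` value is taken from this PR; everything else here is illustrative):

```python
from evals.eval_framework.eval_config import EvalConfig

# Select the new lightweight adapter instead of DeepEval.
# DirectLLMEvalAdapter supports only the "correctness" metric.
config = EvalConfig(evaluation_engine="DirectLLM")
```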
New adapter module (Python, 59 lines, 2.2 KiB):
from typing import Any, Dict, List

from pydantic import BaseModel

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
from evals.eval_framework.eval_config import EvalConfig


class CorrectnessEvaluation(BaseModel):
    """Response model containing evaluation score and explanation."""

    score: float
    explanation: str


class DirectLLMEvalAdapter(BaseEvalAdapter):
    def __init__(self):
        """Initialize adapter with prompt paths from config."""
        config = EvalConfig()
        self.system_prompt_path = config.direct_llm_system_prompt
        self.eval_prompt_path = config.direct_llm_eval_prompt
        self.llm_client = get_llm_client()

    async def evaluate_correctness(
        self, question: str, answer: str, golden_answer: str
    ) -> Dict[str, Any]:
        """Score a single answer against its golden answer via a structured LLM call."""
        args = {"question": question, "answer": answer, "golden_answer": golden_answer}

        # Render the evaluation prompt with the question/answer pair and load the system prompt.
        user_prompt = render_prompt(self.eval_prompt_path, args)
        system_prompt = read_query_prompt(self.system_prompt_path)

        # Ask the LLM for a structured score and explanation.
        evaluation = await self.llm_client.acreate_structured_output(
            text_input=user_prompt,
            system_prompt=system_prompt,
            response_model=CorrectnessEvaluation,
        )

        return {"score": evaluation.score, "reason": evaluation.explanation}

    async def evaluate_answers(
        self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
    ) -> List[Dict[str, Any]]:
        """Evaluate a list of answers using specified metrics."""
        if not answers or not evaluator_metrics:
            return []

        # Only "correctness" is supported; otherwise pass answers through with empty metrics.
        if "correctness" not in evaluator_metrics:
            return [{"metrics": {}, **answer} for answer in answers]

        results = []
        for answer in answers:
            correctness = await self.evaluate_correctness(
                question=answer["question"],
                answer=answer["answer"],
                golden_answer=answer["golden_answer"],
            )
            results.append({**answer, "metrics": {"correctness": correctness}})

        return results
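For reference, a minimal usage sketch. The answer dict keys (`question`, `answer`, `golden_answer`) and the shape of the returned `metrics` entry follow the adapter code above; running this requires a configured LLM client and the prompt files referenced by `EvalConfig`:

```python
import asyncio

# Assumes LLM credentials and prompt paths are already configured.
adapter = DirectLLMEvalAdapter()

answers = [
    {
        "question": "What is the capital of France?",
        "answer": "Paris",
        "golden_answer": "Paris",
    }
]

results = asyncio.run(adapter.evaluate_answers(answers, evaluator_metrics=["correctness"]))

# Each result carries a "metrics" entry, e.g.
# {"correctness": {"score": 1.0, "reason": "..."}}
print(results[0]["metrics"]["correctness"])
```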