* QA eval dataset as argument, with hotpot and 2wikimultihop as options; JSON schema validation for datasets (an illustrative sketch follows this list)
* Load dataset file by filename, outsource utilities
* Restructure metric selection
* Add comprehensiveness, diversity and empowerment metrics
* Add promptfoo as an option
* Refactor RAG solution in evals
* LLM-as-a-judge metrics implemented in a uniform way
* Use requests.get instead of wget
* Clean up promptfoo config template
* Minor fixes
* Get promptfoo path instead of hardcoding
* Minor fixes
* Add LLM-as-a-judge prompts
* Minor refactor and logger usage
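The first item above mentions JSON schema validation for the QA datasets. The snippet below is only a hedged sketch of that idea, not the project's actual code: the QA_DATASET_SCHEMA fields and the load_qa_dataset helper are hypothetical, and it assumes the third-party jsonschema package is installed.

import json
from jsonschema import validate, ValidationError  # third-party: pip install jsonschema

# Hypothetical schema: each entry has a question, an answer, and optional context passages.
QA_DATASET_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "required": ["question", "answer"],
        "properties": {
            "question": {"type": "string"},
            "answer": {"type": "string"},
            "context": {"type": "array", "items": {"type": "string"}},
        },
    },
}


def load_qa_dataset(filename: str) -> list[dict]:
    """Load a QA eval dataset from disk and reject files that do not match the schema."""
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    try:
        validate(instance=data, schema=QA_DATASET_SCHEMA)
    except ValidationError as err:
        raise ValueError(f"{filename} is not a valid QA dataset: {err.message}") from err
    return data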
from evals.deepeval_metrics import (
    correctness_metric,
    comprehensiveness_metric,
    diversity_metric,
    empowerment_metric,
    directness_metric,
    f1_score_metric,
    em_score_metric,
)
from evals.promptfoo_metrics import PromptfooMetric
from deepeval.metrics import AnswerRelevancyMetric
import deepeval.metrics
from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts


# Metric shipped with deepeval, registered as a class and instantiated on lookup.
native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric}

# Custom metrics implemented in evals.deepeval_metrics.
custom_deepeval_metrics = {
    "Correctness": correctness_metric,
    "Comprehensiveness": comprehensiveness_metric,
    "Diversity": diversity_metric,
    "Empowerment": empowerment_metric,
    "Directness": directness_metric,
    "F1": f1_score_metric,
    "EM": em_score_metric,
}

# LLM-as-a-judge metrics evaluated through promptfoo, one per judge prompt.
promptfoo_metrics = {
    "promptfoo.correctness": PromptfooMetric(llm_judge_prompts["correctness"]),
    "promptfoo.comprehensiveness": PromptfooMetric(llm_judge_prompts["comprehensiveness"]),
    "promptfoo.diversity": PromptfooMetric(llm_judge_prompts["diversity"]),
    "promptfoo.empowerment": PromptfooMetric(llm_judge_prompts["empowerment"]),
    "promptfoo.directness": PromptfooMetric(llm_judge_prompts["directness"]),
}

qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics


def get_metric(metric_name: str):
    """Resolve a metric by name from the qa_metrics registry, falling back to any
    metric class exported by deepeval.metrics; classes are instantiated before return."""
    if metric_name in qa_metrics:
        metric = qa_metrics[metric_name]
    else:
        try:
            metric_cls = getattr(deepeval.metrics, metric_name)
            metric = metric_cls()
        except AttributeError:
            raise Exception(f"Metric {metric_name} not supported")

    if isinstance(metric, type):
        metric = metric()

    return metric
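For reference, a minimal usage sketch of get_metric. The import path evals.qa_metric_utils used here is an assumption (substitute whatever path this file actually has in the repo); FaithfulnessMetric is a class deepeval exports, so it resolves through the getattr fallback rather than the registry.

# Hypothetical import path; adjust to the real location of this module.
from evals.qa_metric_utils import get_metric

correctness = get_metric("Correctness")            # custom deepeval metric
relevancy = get_metric("AnswerRelevancy")          # native deepeval class, instantiated on lookup
judge = get_metric("promptfoo.comprehensiveness")  # promptfoo LLM-as-a-judge metric

# Unknown names fall back to attribute lookup on deepeval.metrics,
# so other metric classes deepeval exports also work.
faithfulness = get_metric("FaithfulnessMetric")

The trailing isinstance(metric, type) check lets the registries mix classes and ready-made instances: classes are instantiated on lookup, while instances are returned as-is.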