Run eval on a set of parameters and save them as png and json (#443)
* QA eval dataset as argument, with hotpot and 2wikimultihop as options. Json schema validation for datasets. * Load dataset file by filename, outsource utilities * restructure metric selection * Add comprehensiveness, diversity and empowerment metrics * add promptfoo as an option * refactor RAG solution in eval * LLM as a judge metrics implemented in a uniform way * Use requests.get instead of wget * clean up promptfoo config template * minor fixes * get promptfoo path instead of hardcoding * minor fixes * Add LLM as a judge prompts * Support 4 different rag options in eval * Minor refactor and logger usage * Run eval on a set of parameters and save results as json and png * script for running all param combinations * bugfix in simple rag * potential fix: single asyncio run * temp fix: exclude insights * Remove insights, have single asyncio run, refactor --------- Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
This commit is contained in:
parent
6c6ba3270c
commit
8ec1e48ff6
7 changed files with 237 additions and 38 deletions
|
|
@ -8,7 +8,7 @@ import logging
|
||||||
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||||
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
|
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
|
||||||
from evals.qa_dataset_utils import load_qa_dataset
|
from evals.qa_dataset_utils import load_qa_dataset
|
||||||
from evals.qa_metrics_utils import get_metric
|
from evals.qa_metrics_utils import get_metrics
|
||||||
from evals.qa_context_provider_utils import qa_context_providers
|
from evals.qa_context_provider_utils import qa_context_providers
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -34,7 +34,7 @@ async def answer_qa_instance(instance, context_provider):
|
||||||
return answer_prediction
|
return answer_prediction
|
||||||
|
|
||||||
|
|
||||||
async def deepeval_answers(instances, answers, eval_metric):
|
async def deepeval_answers(instances, answers, eval_metrics):
|
||||||
test_cases = []
|
test_cases = []
|
||||||
|
|
||||||
for instance, answer in zip(instances, answers):
|
for instance, answer in zip(instances, answers):
|
||||||
|
|
@ -44,37 +44,54 @@ async def deepeval_answers(instances, answers, eval_metric):
|
||||||
test_cases.append(test_case)
|
test_cases.append(test_case)
|
||||||
|
|
||||||
eval_set = EvaluationDataset(test_cases)
|
eval_set = EvaluationDataset(test_cases)
|
||||||
eval_results = eval_set.evaluate([eval_metric])
|
eval_results = eval_set.evaluate(eval_metrics)
|
||||||
|
|
||||||
return eval_results
|
return eval_results
|
||||||
|
|
||||||
|
|
||||||
async def deepeval_on_instances(instances, context_provider, eval_metric):
|
async def deepeval_on_instances(instances, context_provider, eval_metrics):
|
||||||
answers = []
|
answers = []
|
||||||
for instance in tqdm(instances, desc="Getting answers"):
|
for instance in tqdm(instances, desc="Getting answers"):
|
||||||
answer = await answer_qa_instance(instance, context_provider)
|
answer = await answer_qa_instance(instance, context_provider)
|
||||||
answers.append(answer)
|
answers.append(answer)
|
||||||
|
|
||||||
eval_results = await deepeval_answers(instances, answers, eval_metric)
|
eval_results = await deepeval_answers(instances, answers, eval_metrics)
|
||||||
avg_score = statistics.mean(
|
score_lists_dict = {}
|
||||||
[result.metrics_data[0].score for result in eval_results.test_results]
|
for instance_result in eval_results.test_results:
|
||||||
)
|
for metric_result in instance_result.metrics_data:
|
||||||
|
if metric_result.name not in score_lists_dict:
|
||||||
|
score_lists_dict[metric_result.name] = []
|
||||||
|
score_lists_dict[metric_result.name].append(metric_result.score)
|
||||||
|
|
||||||
return avg_score
|
avg_scores = {
|
||||||
|
metric_name: statistics.mean(scorelist)
|
||||||
|
for metric_name, scorelist in score_lists_dict.items()
|
||||||
|
}
|
||||||
|
|
||||||
|
return avg_scores
|
||||||
|
|
||||||
|
|
||||||
async def eval_on_QA_dataset(
|
async def eval_on_QA_dataset(
|
||||||
dataset_name_or_filename: str, context_provider_name, num_samples, eval_metric_name
|
dataset_name_or_filename: str, context_provider_name, num_samples, metric_name_list
|
||||||
):
|
):
|
||||||
dataset = load_qa_dataset(dataset_name_or_filename)
|
dataset = load_qa_dataset(dataset_name_or_filename)
|
||||||
context_provider = qa_context_providers[context_provider_name]
|
context_provider = qa_context_providers[context_provider_name]
|
||||||
eval_metric = get_metric(eval_metric_name)
|
eval_metrics = get_metrics(metric_name_list)
|
||||||
instances = dataset if not num_samples else dataset[:num_samples]
|
instances = dataset if not num_samples else dataset[:num_samples]
|
||||||
|
|
||||||
if eval_metric_name.startswith("promptfoo"):
|
if "promptfoo_metrics" in eval_metrics:
|
||||||
return await eval_metric.measure(instances, context_provider)
|
promptfoo_results = await eval_metrics["promptfoo_metrics"].measure(
|
||||||
|
instances, context_provider
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
return await deepeval_on_instances(instances, context_provider, eval_metric)
|
promptfoo_results = {}
|
||||||
|
deepeval_results = await deepeval_on_instances(
|
||||||
|
instances, context_provider, eval_metrics["deepeval_metrics"]
|
||||||
|
)
|
||||||
|
|
||||||
|
results = promptfoo_results | deepeval_results
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
@ -89,11 +106,11 @@ if __name__ == "__main__":
|
||||||
help="RAG option to use for providing context",
|
help="RAG option to use for providing context",
|
||||||
)
|
)
|
||||||
parser.add_argument("--num_samples", type=int, default=500)
|
parser.add_argument("--num_samples", type=int, default=500)
|
||||||
parser.add_argument("--metric_name", type=str, default="Correctness")
|
parser.add_argument("--metrics", type=str, nargs="+", default=["Correctness"])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
avg_score = asyncio.run(
|
avg_scores = asyncio.run(
|
||||||
eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metric_name)
|
eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metrics)
|
||||||
)
|
)
|
||||||
logger.info(f"Average {args.metric_name}: {avg_score}")
|
logger.info(f"{avg_scores}")
|
||||||
|
|
|
||||||
|
|
@ -3,19 +3,42 @@ import os
|
||||||
import yaml
|
import yaml
|
||||||
import json
|
import json
|
||||||
import shutil
|
import shutil
|
||||||
|
from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_promptfoo_metric(metric_name: str):
    """Tell whether *metric_name* has the form ``promptfoo.<judge prompt key>``.

    The name must consist of exactly two dot-separated parts: the literal
    prefix ``promptfoo`` and a suffix that is a key of ``llm_judge_prompts``.

    Returns:
        ``True`` when both conditions hold, ``False`` otherwise.
    """
    parts = metric_name.split(".")
    # Anything other than exactly one dot cannot be "<prefix>.<suffix>".
    if len(parts) != 2:
        return False
    prefix, suffix = parts
    return prefix == "promptfoo" and suffix in llm_judge_prompts
|
||||||
|
|
||||||
|
|
||||||
class PromptfooMetric:
|
class PromptfooMetric:
|
||||||
def __init__(self, judge_prompt):
|
def __init__(self, metric_name_list):
|
||||||
promptfoo_path = shutil.which("promptfoo")
|
promptfoo_path = shutil.which("promptfoo")
|
||||||
self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path)
|
self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path)
|
||||||
self.judge_prompt = judge_prompt
|
self.prompts = {}
|
||||||
|
for metric_name in metric_name_list:
|
||||||
|
if is_valid_promptfoo_metric(metric_name):
|
||||||
|
self.prompts[metric_name] = llm_judge_prompts[metric_name.split(".")[1]]
|
||||||
|
else:
|
||||||
|
raise Exception(f"{metric_name} is not a valid promptfoo metric")
|
||||||
|
|
||||||
async def measure(self, instances, context_provider):
|
async def measure(self, instances, context_provider):
|
||||||
with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file:
|
with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file:
|
||||||
config = yaml.safe_load(file)
|
config = yaml.safe_load(file)
|
||||||
|
|
||||||
config["defaultTest"] = [{"assert": {"type": "llm_rubric", "value": self.judge_prompt}}]
|
config["defaultTest"] = {
|
||||||
|
"assert": [
|
||||||
|
{"type": "llm-rubric", "value": prompt, "name": metric_name}
|
||||||
|
for metric_name, prompt in self.prompts.items()
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
# Fill config file with test cases
|
# Fill config file with test cases
|
||||||
tests = []
|
tests = []
|
||||||
|
|
@ -48,6 +71,9 @@ class PromptfooMetric:
|
||||||
with open(file_path, "r") as file:
|
with open(file_path, "r") as file:
|
||||||
results = json.load(file)
|
results = json.load(file)
|
||||||
|
|
||||||
self.score = results["results"]["prompts"][0]["metrics"]["score"]
|
scores = {}
|
||||||
|
|
||||||
return self.score
|
for result in results["results"]["results"][0]["gradingResult"]["componentResults"]:
|
||||||
|
scores[result["assertion"]["name"]] = result["score"]
|
||||||
|
|
||||||
|
return scores
|
||||||
|
|
|
||||||
|
|
@ -21,9 +21,11 @@ async def cognify_instance(instance: dict):
|
||||||
async def get_context_with_cognee(instance: dict) -> str:
|
async def get_context_with_cognee(instance: dict) -> str:
|
||||||
await cognify_instance(instance)
|
await cognify_instance(instance)
|
||||||
|
|
||||||
insights = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"])
|
# TODO: Fix insights
|
||||||
|
# insights = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"])
|
||||||
summaries = await cognee.search(SearchType.SUMMARIES, query_text=instance["question"])
|
summaries = await cognee.search(SearchType.SUMMARIES, query_text=instance["question"])
|
||||||
search_results = insights + summaries
|
# search_results = insights + summaries
|
||||||
|
search_results = summaries
|
||||||
|
|
||||||
search_results_str = "\n".join([context_item["text"] for context_item in search_results])
|
search_results_str = "\n".join([context_item["text"] for context_item in search_results])
|
||||||
|
|
||||||
|
|
@ -31,7 +33,11 @@ async def get_context_with_cognee(instance: dict) -> str:
|
||||||
|
|
||||||
|
|
||||||
async def get_context_with_simple_rag(instance: dict) -> str:
|
async def get_context_with_simple_rag(instance: dict) -> str:
|
||||||
await cognify_instance(instance)
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
for title, sentences in instance["context"]:
|
||||||
|
await cognee.add("\n".join(sentences), dataset_name="QA")
|
||||||
|
|
||||||
vector_engine = get_vector_engine()
|
vector_engine = get_vector_engine()
|
||||||
found_chunks = await vector_engine.search("document_chunk_text", instance["question"], limit=5)
|
found_chunks = await vector_engine.search("document_chunk_text", instance["question"], limit=5)
|
||||||
|
|
|
||||||
18
evals/qa_eval_parameters.json
Normal file
18
evals/qa_eval_parameters.json
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
{
|
||||||
|
"dataset": [
|
||||||
|
"hotpotqa"
|
||||||
|
],
|
||||||
|
"rag_option": [
|
||||||
|
"no_rag",
|
||||||
|
"cognee",
|
||||||
|
"simple_rag",
|
||||||
|
"brute_force"
|
||||||
|
],
|
||||||
|
"num_samples": [
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"metric_names": [
|
||||||
|
"Correctness",
|
||||||
|
"Comprehensiveness"
|
||||||
|
]
|
||||||
|
}
|
||||||
60
evals/qa_eval_utils.py
Normal file
60
evals/qa_eval_utils.py
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
import itertools
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from jsonschema import ValidationError, validate
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def _array_of(item_schema):
    """Return a JSON Schema fragment for an array whose items match *item_schema*."""
    return {"type": "array", "items": item_schema}


# JSON Schema for an eval parameter set: every key maps to a list of candidate
# values. All four keys are required and no other keys are accepted.
paramset_json_schema = {
    "type": "object",
    "properties": {
        "dataset": _array_of({"type": "string"}),
        "rag_option": _array_of({"type": "string"}),
        "num_samples": _array_of({"type": "integer", "minimum": 1}),
        "metric_names": _array_of({"type": "string"}),
    },
    "required": ["dataset", "rag_option", "num_samples", "metric_names"],
    "additionalProperties": False,
}
|
||||||
|
|
||||||
|
|
||||||
|
def save_table_as_image(df, image_path):
    """Render *df* as a matplotlib table and write the figure to *image_path*.

    The frame's values become the cells, its columns the column labels and
    its index the row labels; ``df.index.name`` is used as the figure title.
    """
    figure = plt.figure(figsize=(10, 6))
    axes = figure.gca()
    # Only the table should be visible, so hide the axes themselves.
    axes.axis("tight")
    axes.axis("off")
    axes.table(cellText=df.values, colLabels=df.columns, rowLabels=df.index, loc="center")
    axes.set_title(f"{df.index.name}")
    figure.savefig(image_path, bbox_inches="tight")
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(figure)
|
||||||
|
|
||||||
|
|
||||||
|
def save_results_as_image(results, out_path):
    """Write one PNG table per (dataset, num_samples) pair found in *results*.

    *results* is a nested mapping ``dataset -> num_samples -> table data``;
    each innermost mapping is turned into a DataFrame (one row per key) and
    saved as ``table_<dataset>_<num_samples>.png`` under *out_path*.
    """
    for dataset, per_sample_count in results.items():
        for num_samples, table_data in per_sample_count.items():
            frame = pd.DataFrame.from_dict(table_data, orient="index")
            # The index name doubles as the title of the rendered table.
            frame.index.name = f"Dataset: {dataset}, Num Samples: {num_samples}"
            target = Path(out_path) / Path(f"table_{dataset}_{num_samples}.png")
            save_table_as_image(frame, target)
|
||||||
|
|
||||||
|
|
||||||
|
def get_combinations(parameters):
    """Expand a parameter set into every combination of its per-run values.

    Args:
        parameters: Mapping that must satisfy ``paramset_json_schema``; each
            key maps to a list of candidate values.

    Returns:
        A list of dicts, one per element of the Cartesian product of all
        value lists except ``metric_names``.

    Raises:
        ValidationError: if *parameters* does not match the schema.
    """
    try:
        validate(instance=parameters, schema=paramset_json_schema)
    except ValidationError as e:
        # Keep the original error as the cause so the failing path is not lost.
        raise ValidationError(f"Invalid parameter set: {e.message}") from e

    # The schema key is "metric_names" (plural). The metric list must stay out
    # of the product, otherwise every run is duplicated once per metric.
    params_for_combos = {k: v for k, v in parameters.items() if k != "metric_names"}
    keys, values = zip(*params_for_combos.items())
    return [dict(zip(keys, combo)) for combo in itertools.product(*values)]
|
||||||
|
|
@ -7,10 +7,9 @@ from evals.deepeval_metrics import (
|
||||||
f1_score_metric,
|
f1_score_metric,
|
||||||
em_score_metric,
|
em_score_metric,
|
||||||
)
|
)
|
||||||
from evals.promptfoo_metrics import PromptfooMetric
|
|
||||||
from deepeval.metrics import AnswerRelevancyMetric
|
from deepeval.metrics import AnswerRelevancyMetric
|
||||||
import deepeval.metrics
|
import deepeval.metrics
|
||||||
from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts
|
from evals.promptfoo_metrics import is_valid_promptfoo_metric, PromptfooMetric
|
||||||
|
|
||||||
native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric}
|
native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric}
|
||||||
|
|
||||||
|
|
@ -24,18 +23,10 @@ custom_deepeval_metrics = {
|
||||||
"EM": em_score_metric,
|
"EM": em_score_metric,
|
||||||
}
|
}
|
||||||
|
|
||||||
promptfoo_metrics = {
|
qa_metrics = native_deepeval_metrics | custom_deepeval_metrics
|
||||||
"promptfoo.correctness": PromptfooMetric(llm_judge_prompts["correctness"]),
|
|
||||||
"promptfoo.comprehensiveness": PromptfooMetric(llm_judge_prompts["comprehensiveness"]),
|
|
||||||
"promptfoo.diversity": PromptfooMetric(llm_judge_prompts["diversity"]),
|
|
||||||
"promptfoo.empowerment": PromptfooMetric(llm_judge_prompts["empowerment"]),
|
|
||||||
"promptfoo.directness": PromptfooMetric(llm_judge_prompts["directness"]),
|
|
||||||
}
|
|
||||||
|
|
||||||
qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics
|
|
||||||
|
|
||||||
|
|
||||||
def get_metric(metric_name: str):
|
def get_deepeval_metric(metric_name: str):
|
||||||
if metric_name in qa_metrics:
|
if metric_name in qa_metrics:
|
||||||
metric = qa_metrics[metric_name]
|
metric = qa_metrics[metric_name]
|
||||||
else:
|
else:
|
||||||
|
|
@ -49,3 +40,27 @@ def get_metric(metric_name: str):
|
||||||
metric = metric()
|
metric = metric()
|
||||||
|
|
||||||
return metric
|
return metric
|
||||||
|
|
||||||
|
|
||||||
|
def get_metrics(metric_name_list: list[str]):
    """Resolve metric names into deepeval metrics and one promptfoo metric.

    Args:
        metric_name_list: Names of the metrics to evaluate.

    Returns:
        A dict with key ``"deepeval_metrics"`` (list of the objects returned by
        ``get_deepeval_metric``, possibly empty) and, only when at least one
        promptfoo name was given, key ``"promptfoo_metrics"`` holding a single
        ``PromptfooMetric`` built from all promptfoo names.

    Raises:
        ValueError: if a name is neither a deepeval metric nor a valid
            promptfoo metric.
    """
    metrics = {"deepeval_metrics": []}
    promptfoo_metric_names = []

    for metric_name in metric_name_list:
        if (
            metric_name in native_deepeval_metrics
            or metric_name in custom_deepeval_metrics
            or hasattr(deepeval.metrics, metric_name)
        ):
            metrics["deepeval_metrics"].append(get_deepeval_metric(metric_name))
        elif is_valid_promptfoo_metric(metric_name):
            promptfoo_metric_names.append(metric_name)
        else:
            # Fail loudly instead of silently dropping a requested metric.
            raise ValueError(f"{metric_name} is not a known deepeval or promptfoo metric")

    if promptfoo_metric_names:
        metrics["promptfoo_metrics"] = PromptfooMetric(promptfoo_metric_names)

    return metrics
|
||||||
|
|
|
||||||
57
evals/run_qa_eval.py
Normal file
57
evals/run_qa_eval.py
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
import asyncio
|
||||||
|
from evals.eval_on_hotpot import eval_on_QA_dataset
|
||||||
|
from evals.qa_eval_utils import get_combinations, save_results_as_image
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
async def run_evals_on_paramset(paramset: dict, out_path: str):
    """Run the QA eval for every parameter combination and persist the results.

    Args:
        paramset: Parameter set accepted by ``get_combinations``; its
            ``"metric_names"`` list is passed unchanged to every run.
        out_path: Directory that receives ``results.json`` and the PNG tables.
            It is created if it does not exist.

    Returns:
        Nested dict ``dataset -> num_samples -> rag_option -> result``.
        In ``results.json`` the integer ``num_samples`` keys are written as
        strings, because JSON object keys are always strings.
    """
    combinations = get_combinations(paramset)

    # Create the output directory up front so a missing directory is detected
    # before the evaluations run, not when their results are about to be saved.
    out_dir = Path(out_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    json_path = out_dir / "results.json"

    results = {}
    for params in combinations:
        dataset = params["dataset"]
        num_samples = params["num_samples"]
        rag_option = params["rag_option"]

        result = await eval_on_QA_dataset(
            dataset,
            rag_option,
            num_samples,
            paramset["metric_names"],
        )

        results.setdefault(dataset, {}).setdefault(num_samples, {})[rag_option] = result

    with open(json_path, "w") as file:
        json.dump(results, file, indent=1)

    save_results_as_image(results, out_path)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Parse the command line, load the parameter file and run all evals.

    ``--params_file`` is a JSON file holding the parameter set; ``--out_dir``
    is the directory for the results and defaults to the current directory.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--params_file",
        type=str,
        required=True,
        help="JSON file with the parameter set to evaluate",
    )
    # Default to the current directory: without a default an omitted --out_dir
    # would reach Path(None) and raise TypeError.
    parser.add_argument("--out_dir", type=str, default=".", help="Dir to save eval results")

    args = parser.parse_args()

    with open(args.params_file, "r") as file:
        parameters = json.load(file)

    await run_evals_on_paramset(parameters, args.out_dir)


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Loading…
Add table
Reference in a new issue