* QA eval dataset as argument, with hotpot and 2wikimultihop as options. JSON schema validation for datasets.
* Load dataset file by filename, outsource utilities
* Restructure metric selection
* Add comprehensiveness, diversity and empowerment metrics
* Add promptfoo as an option
* Refactor RAG solution in eval
* LLM-as-a-judge metrics implemented in a uniform way
* Use requests.get instead of wget
* Clean up promptfoo config template
* Minor fixes
* Get promptfoo path instead of hardcoding
* Minor fixes
* Add LLM-as-a-judge prompts
* Support 4 different RAG options in eval
* Minor refactor and logger usage
* Run eval on a set of parameters and save results as json and png
* Script for running all param combinations
* Bugfix in simple rag
* Potential fix: single asyncio run
* Temp fix: exclude insights
* Remove insights, have single asyncio run, refactor

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
57 lines
1.5 KiB
Python
import argparse
import asyncio
import json
from pathlib import Path

from evals.eval_on_hotpot import eval_on_QA_dataset
from evals.qa_eval_utils import get_combinations, save_results_as_image


async def run_evals_on_paramset(paramset: dict, out_path: str):
    """Run the QA eval for every parameter combination and collect the results."""
    combinations = get_combinations(paramset)
    json_path = Path(out_path) / Path("results.json")
    results = {}

    for params in combinations:
        dataset = params["dataset"]
        num_samples = params["num_samples"]
        rag_option = params["rag_option"]

        result = await eval_on_QA_dataset(
            dataset,
            rag_option,
            num_samples,
            paramset["metric_names"],
        )

        # Nest results as results[dataset][num_samples][rag_option]
        if dataset not in results:
            results[dataset] = {}
        if num_samples not in results[dataset]:
            results[dataset][num_samples] = {}

        results[dataset][num_samples][rag_option] = result

        # Persist intermediate results after every combination so a long sweep
        # leaves partial output behind if it is interrupted
        with open(json_path, "w") as file:
            json.dump(results, file, indent=1)

    # Render the aggregated results as an image next to the JSON file
    save_results_as_image(results, out_path)

    return results


async def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--params_file", type=str, required=True, help="JSON file with the parameter sets to evaluate"
    )
    parser.add_argument("--out_dir", type=str, required=True, help="Dir to save eval results")

    args = parser.parse_args()

    with open(args.params_file, "r") as file:
        parameters = json.load(file)

    await run_evals_on_paramset(parameters, args.out_dir)


if __name__ == "__main__":
    asyncio.run(main())
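For reference, a minimal sketch of a parameter file this script could consume, plus an example invocation. The field names (dataset, num_samples, rag_option, metric_names) come from the code above, and the dataset and metric names echo the change list, but the exact identifiers, the RAG option names, and the assumption that get_combinations expands the list-valued fields into a cross product are illustrative only. Note that --out_dir must point to an existing directory, since the script writes results.json into it without creating it.

params.json (illustrative values):

    {
        "dataset": ["hotpot", "2wikimultihop"],
        "num_samples": [10, 50],
        "rag_option": ["cognee", "simple_rag"],
        "metric_names": ["comprehensiveness", "diversity", "empowerment"]
    }

Example run (the script path is hypothetical):

    python evals/run_eval_on_paramset.py --params_file params.json --out_dir eval_results/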