From 1c16a1744c7999f9a0849454562111f4f3929181 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 20 Jan 2025 18:42:09 +0100 Subject: [PATCH] Save and load contexts and answers --- evals/eval_on_hotpot.py | 60 +++++++++++++++++++++++++++++------ evals/promptfoo_metrics.py | 19 +++++++++-- evals/qa_eval_parameters.json | 6 +++- evals/run_qa_eval.py | 9 ++---- 4 files changed, 74 insertions(+), 20 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 6fa5748b9..fd36a7514 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -11,13 +11,29 @@ from evals.qa_dataset_utils import load_qa_dataset from evals.qa_metrics_utils import get_metrics from evals.qa_context_provider_utils import qa_context_providers, valid_pipeline_slices import random +import os +import json +from pathlib import Path logger = logging.getLogger(__name__) random.seed(42) -async def answer_qa_instance(instance, context_provider): - context = await context_provider(instance) +async def answer_qa_instance(instance, context_provider, contexts_filename): + if os.path.exists(contexts_filename): + with open(contexts_filename, "r") as file: + preloaded_contexts = json.load(file) + else: + preloaded_contexts = {} + + if instance["_id"] in preloaded_contexts: + context = preloaded_contexts[instance["_id"]] + else: + context = await context_provider(instance) + preloaded_contexts[instance["_id"]] = context + + with open(contexts_filename, "w") as file: + json.dump(preloaded_contexts, file) args = { "question": instance["question"], @@ -51,12 +67,27 @@ async def deepeval_answers(instances, answers, eval_metrics): return eval_results -async def deepeval_on_instances(instances, context_provider, eval_metrics): +async def deepeval_on_instances( + instances, context_provider, eval_metrics, answers_filename, contexts_filename +): + if os.path.exists(answers_filename): + with open(answers_filename, "r") as file: + preloaded_answers = json.load(file) + else: + preloaded_answers = {} + answers = [] for instance in tqdm(instances, desc="Getting answers"): - answer = await answer_qa_instance(instance, context_provider) + if instance["_id"] in preloaded_answers: + answer = preloaded_answers[instance["_id"]] + else: + answer = await answer_qa_instance(instance, context_provider, contexts_filename) + preloaded_answers[instance["_id"]] = answer answers.append(answer) + with open(answers_filename, "w") as file: + json.dump(preloaded_answers, file) + eval_results = await deepeval_answers(instances, answers, eval_metrics) score_lists_dict = {} for instance_result in eval_results.test_results: @@ -74,21 +105,32 @@ async def deepeval_on_instances(instances, context_provider, eval_metrics): async def eval_on_QA_dataset( - dataset_name_or_filename: str, context_provider_name, num_samples, metric_name_list + dataset_name_or_filename: str, context_provider_name, num_samples, metric_name_list, out_path ): dataset = load_qa_dataset(dataset_name_or_filename) context_provider = qa_context_providers[context_provider_name] eval_metrics = get_metrics(metric_name_list) instances = dataset if not num_samples else random.sample(dataset, num_samples) + contexts_filename = Path(out_path) / Path( + f"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json" + ) if "promptfoo_metrics" in eval_metrics: promptfoo_results = await eval_metrics["promptfoo_metrics"].measure( - instances, context_provider + instances, context_provider, contexts_filename ) else: promptfoo_results = {} + + answers_filename = Path(out_path) / Path( + f"answers_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json" + ) deepeval_results = await deepeval_on_instances( - instances, context_provider, eval_metrics["deepeval_metrics"] + instances, + context_provider, + eval_metrics["deepeval_metrics"], + answers_filename, + contexts_filename, ) results = promptfoo_results | deepeval_results @@ -97,14 +139,14 @@ async def eval_on_QA_dataset( async def incremental_eval_on_QA_dataset( - dataset_name_or_filename: str, num_samples, metric_name_list + dataset_name_or_filename: str, num_samples, metric_name_list, out_path ): pipeline_slice_names = valid_pipeline_slices.keys() incremental_results = {} for pipeline_slice_name in pipeline_slice_names: results = await eval_on_QA_dataset( - dataset_name_or_filename, pipeline_slice_name, num_samples, metric_name_list + dataset_name_or_filename, pipeline_slice_name, num_samples, metric_name_list, out_path ) incremental_results[pipeline_slice_name] = results diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py index ee0eaf80a..f21fab2f9 100644 --- a/evals/promptfoo_metrics.py +++ b/evals/promptfoo_metrics.py @@ -29,7 +29,7 @@ class PromptfooMetric: else: raise Exception(f"{metric_name} is not a valid promptfoo metric") - async def measure(self, instances, context_provider): + async def measure(self, instances, context_provider, contexts_filename): with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file: config = yaml.safe_load(file) @@ -40,10 +40,20 @@ class PromptfooMetric: ] } - # Fill config file with test cases tests = [] + if os.path.exists(contexts_filename): + with open(contexts_filename, "r") as file: + preloaded_contexts = json.load(file) + else: + preloaded_contexts = {} + for instance in instances: - context = await context_provider(instance) + if instance["_id"] in preloaded_contexts: + context = preloaded_contexts[instance["_id"]] + else: + context = await context_provider(instance) + preloaded_contexts[instance["_id"]] = context + test = { "vars": { "name": instance["question"][:15], @@ -52,7 +62,10 @@ class PromptfooMetric: } } tests.append(test) + config["tests"] = tests + with open(contexts_filename, "w") as file: + json.dump(preloaded_contexts, file) # Write the updated YAML back, preserving formatting and structure updated_yaml_file_path = os.path.join(os.getcwd(), "config_with_context.yaml") diff --git a/evals/qa_eval_parameters.json b/evals/qa_eval_parameters.json index 6ae07089a..6d60ab56f 100644 --- a/evals/qa_eval_parameters.json +++ b/evals/qa_eval_parameters.json @@ -14,6 +14,10 @@ ], "metric_names": [ "Correctness", - "Comprehensiveness" + "Comprehensiveness", + "Directness", + "Diversity", + "Empowerment", + "promptfoo.directness" ] } diff --git a/evals/run_qa_eval.py b/evals/run_qa_eval.py index f9f35d61d..26f53adaa 100644 --- a/evals/run_qa_eval.py +++ b/evals/run_qa_eval.py @@ -22,17 +22,12 @@ async def run_evals_on_paramset(paramset: dict, out_path: str): if rag_option == "cognee_incremental": result = await incremental_eval_on_QA_dataset( - dataset, - num_samples, - paramset["metric_names"], + dataset, num_samples, paramset["metric_names"], out_path ) results[dataset][num_samples] |= result else: result = await eval_on_QA_dataset( - dataset, - rag_option, - num_samples, - paramset["metric_names"], + dataset, rag_option, num_samples, paramset["metric_names"], out_path ) results[dataset][num_samples][rag_option] = result