From 4e3a666b33b80a3b2d191ea3021e791a9d3c46ff Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Wed, 22 Jan 2025 16:17:01 +0100 Subject: [PATCH] Feat: Save and load contexts and answers for eval (#462) * feat: make tasks a configurable argument in the cognify function * fix: add data points task * eval on random samples instead of first couple * Save and load contexts and answers * Fix random seed usage and handle empty descriptions * include insights search in cognee option * create output dir if doesnt exist --------- Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com> --- evals/eval_on_hotpot.py | 69 +++++++++++++++++++++++++----- evals/promptfoo_metrics.py | 19 ++++++-- evals/qa_context_provider_utils.py | 18 ++++++-- evals/qa_eval_parameters.json | 6 ++- evals/run_qa_eval.py | 9 +--- 5 files changed, 97 insertions(+), 24 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index da102c8ee..b0591a1eb 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -10,12 +10,29 @@ from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt from evals.qa_dataset_utils import load_qa_dataset from evals.qa_metrics_utils import get_metrics from evals.qa_context_provider_utils import qa_context_providers, valid_pipeline_slices +import random +import os +import json +from pathlib import Path logger = logging.getLogger(__name__) -async def answer_qa_instance(instance, context_provider): - context = await context_provider(instance) +async def answer_qa_instance(instance, context_provider, contexts_filename): + if os.path.exists(contexts_filename): + with open(contexts_filename, "r") as file: + preloaded_contexts = json.load(file) + else: + preloaded_contexts = {} + + if instance["_id"] in preloaded_contexts: + context = preloaded_contexts[instance["_id"]] + else: + context = await context_provider(instance) + preloaded_contexts[instance["_id"]] = context + + with open(contexts_filename, "w") as file: + json.dump(preloaded_contexts, file) args = { "question": instance["question"], @@ -49,12 +66,27 @@ async def deepeval_answers(instances, answers, eval_metrics): return eval_results -async def deepeval_on_instances(instances, context_provider, eval_metrics): +async def deepeval_on_instances( + instances, context_provider, eval_metrics, answers_filename, contexts_filename +): + if os.path.exists(answers_filename): + with open(answers_filename, "r") as file: + preloaded_answers = json.load(file) + else: + preloaded_answers = {} + answers = [] for instance in tqdm(instances, desc="Getting answers"): - answer = await answer_qa_instance(instance, context_provider) + if instance["_id"] in preloaded_answers: + answer = preloaded_answers[instance["_id"]] + else: + answer = await answer_qa_instance(instance, context_provider, contexts_filename) + preloaded_answers[instance["_id"]] = answer answers.append(answer) + with open(answers_filename, "w") as file: + json.dump(preloaded_answers, file) + eval_results = await deepeval_answers(instances, answers, eval_metrics) score_lists_dict = {} for instance_result in eval_results.test_results: @@ -72,21 +104,38 @@ async def deepeval_on_instances(instances, context_provider, eval_metrics): async def eval_on_QA_dataset( - dataset_name_or_filename: str, context_provider_name, num_samples, metric_name_list + dataset_name_or_filename: str, context_provider_name, num_samples, metric_name_list, out_path ): dataset = load_qa_dataset(dataset_name_or_filename) 
context_provider = qa_context_providers[context_provider_name] eval_metrics = get_metrics(metric_name_list) - instances = dataset if not num_samples else dataset[:num_samples] + out_path = Path(out_path) + if not out_path.exists(): + out_path.mkdir(parents=True, exist_ok=True) + + random.seed(42) + instances = dataset if not num_samples else random.sample(dataset, num_samples) + + contexts_filename = out_path / Path( + f"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json" + ) if "promptfoo_metrics" in eval_metrics: promptfoo_results = await eval_metrics["promptfoo_metrics"].measure( - instances, context_provider + instances, context_provider, contexts_filename ) else: promptfoo_results = {} + + answers_filename = out_path / Path( + f"answers_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json" + ) deepeval_results = await deepeval_on_instances( - instances, context_provider, eval_metrics["deepeval_metrics"] + instances, + context_provider, + eval_metrics["deepeval_metrics"], + answers_filename, + contexts_filename, ) results = promptfoo_results | deepeval_results @@ -95,14 +144,14 @@ async def eval_on_QA_dataset( async def incremental_eval_on_QA_dataset( - dataset_name_or_filename: str, num_samples, metric_name_list + dataset_name_or_filename: str, num_samples, metric_name_list, out_path ): pipeline_slice_names = valid_pipeline_slices.keys() incremental_results = {} for pipeline_slice_name in pipeline_slice_names: results = await eval_on_QA_dataset( - dataset_name_or_filename, pipeline_slice_name, num_samples, metric_name_list + dataset_name_or_filename, pipeline_slice_name, num_samples, metric_name_list, out_path ) incremental_results[pipeline_slice_name] = results diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py index ee0eaf80a..f21fab2f9 100644 --- a/evals/promptfoo_metrics.py +++ b/evals/promptfoo_metrics.py @@ -29,7 +29,7 @@ class PromptfooMetric: else: raise Exception(f"{metric_name} is not a valid promptfoo metric") - async def measure(self, instances, context_provider): + async def measure(self, instances, context_provider, contexts_filename): with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file: config = yaml.safe_load(file) @@ -40,10 +40,20 @@ class PromptfooMetric: ] } - # Fill config file with test cases tests = [] + if os.path.exists(contexts_filename): + with open(contexts_filename, "r") as file: + preloaded_contexts = json.load(file) + else: + preloaded_contexts = {} + for instance in instances: - context = await context_provider(instance) + if instance["_id"] in preloaded_contexts: + context = preloaded_contexts[instance["_id"]] + else: + context = await context_provider(instance) + preloaded_contexts[instance["_id"]] = context + test = { "vars": { "name": instance["question"][:15], @@ -52,7 +62,10 @@ class PromptfooMetric: } } tests.append(test) + config["tests"] = tests + with open(contexts_filename, "w") as file: + json.dump(preloaded_contexts, file) # Write the updated YAML back, preserving formatting and structure updated_yaml_file_path = os.path.join(os.getcwd(), "config_with_context.yaml") diff --git a/evals/qa_context_provider_utils.py b/evals/qa_context_provider_utils.py index 6397d1054..2cef1e628 100644 --- a/evals/qa_context_provider_utils.py +++ b/evals/qa_context_provider_utils.py @@ -39,10 +39,22 @@ def _insight_to_string(triplet: tuple) -> str: return "" node1_name = node1["name"] if "name" in node1 else "N/A" - node1_description = node1["description"] if 
"description" in node1 else node1["text"] + node1_description = ( + node1["description"] + if "description" in node1 + else node1["text"] + if "text" in node1 + else "N/A" + ) node1_string = f"name: {node1_name}, description: {node1_description}" node2_name = node2["name"] if "name" in node2 else "N/A" - node2_description = node2["description"] if "description" in node2 else node2["text"] + node2_description = ( + node2["description"] + if "description" in node2 + else node2["text"] + if "text" in node2 + else "N/A" + ) node2_string = f"name: {node2_name}, description: {node2_description}" edge_string = edge.get("relationship_name", "") @@ -58,7 +70,7 @@ def _insight_to_string(triplet: tuple) -> str: async def get_context_with_cognee( instance: dict, task_indices: list[int] = None, - search_types: list[SearchType] = [SearchType.SUMMARIES, SearchType.CHUNKS], + search_types: list[SearchType] = [SearchType.INSIGHTS, SearchType.SUMMARIES, SearchType.CHUNKS], ) -> str: await cognify_instance(instance, task_indices) diff --git a/evals/qa_eval_parameters.json b/evals/qa_eval_parameters.json index 6ae07089a..6d60ab56f 100644 --- a/evals/qa_eval_parameters.json +++ b/evals/qa_eval_parameters.json @@ -14,6 +14,10 @@ ], "metric_names": [ "Correctness", - "Comprehensiveness" + "Comprehensiveness", + "Directness", + "Diversity", + "Empowerment", + "promptfoo.directness" ] } diff --git a/evals/run_qa_eval.py b/evals/run_qa_eval.py index f9f35d61d..26f53adaa 100644 --- a/evals/run_qa_eval.py +++ b/evals/run_qa_eval.py @@ -22,17 +22,12 @@ async def run_evals_on_paramset(paramset: dict, out_path: str): if rag_option == "cognee_incremental": result = await incremental_eval_on_QA_dataset( - dataset, - num_samples, - paramset["metric_names"], + dataset, num_samples, paramset["metric_names"], out_path ) results[dataset][num_samples] |= result else: result = await eval_on_QA_dataset( - dataset, - rag_option, - num_samples, - paramset["metric_names"], + dataset, rag_option, num_samples, paramset["metric_names"], out_path ) results[dataset][num_samples][rag_option] = result