diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py
index 59129f599..adef0d160 100644
--- a/evals/eval_on_hotpot.py
+++ b/evals/eval_on_hotpot.py
@@ -8,7 +8,7 @@ import logging
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
 from evals.qa_dataset_utils import load_qa_dataset
-from evals.qa_metrics_utils import get_metric
+from evals.qa_metrics_utils import get_metrics
 from evals.qa_context_provider_utils import qa_context_providers
 
 logger = logging.getLogger(__name__)
@@ -34,7 +34,7 @@ async def answer_qa_instance(instance, context_provider):
     return answer_prediction
 
 
-async def deepeval_answers(instances, answers, eval_metric):
+async def deepeval_answers(instances, answers, eval_metrics):
     test_cases = []
 
     for instance, answer in zip(instances, answers):
@@ -44,37 +44,54 @@ async def deepeval_answers(instances, answers, eval_metric):
         test_cases.append(test_case)
 
     eval_set = EvaluationDataset(test_cases)
-    eval_results = eval_set.evaluate([eval_metric])
+    eval_results = eval_set.evaluate(eval_metrics)
 
     return eval_results
 
 
-async def deepeval_on_instances(instances, context_provider, eval_metric):
+async def deepeval_on_instances(instances, context_provider, eval_metrics):
     answers = []
     for instance in tqdm(instances, desc="Getting answers"):
         answer = await answer_qa_instance(instance, context_provider)
         answers.append(answer)
 
-    eval_results = await deepeval_answers(instances, answers, eval_metric)
-    avg_score = statistics.mean(
-        [result.metrics_data[0].score for result in eval_results.test_results]
-    )
+    eval_results = await deepeval_answers(instances, answers, eval_metrics)
+    score_lists_dict = {}
+    for instance_result in eval_results.test_results:
+        for metric_result in instance_result.metrics_data:
+            if metric_result.name not in score_lists_dict:
+                score_lists_dict[metric_result.name] = []
+            score_lists_dict[metric_result.name].append(metric_result.score)
 
-    return avg_score
+    avg_scores = {
+        metric_name: statistics.mean(scorelist)
+        for metric_name, scorelist in score_lists_dict.items()
+    }
+
+    return avg_scores
 
 
 async def eval_on_QA_dataset(
-    dataset_name_or_filename: str, context_provider_name, num_samples, eval_metric_name
+    dataset_name_or_filename: str, context_provider_name, num_samples, metric_name_list
 ):
     dataset = load_qa_dataset(dataset_name_or_filename)
     context_provider = qa_context_providers[context_provider_name]
-    eval_metric = get_metric(eval_metric_name)
+    eval_metrics = get_metrics(metric_name_list)
     instances = dataset if not num_samples else dataset[:num_samples]
 
-    if eval_metric_name.startswith("promptfoo"):
-        return await eval_metric.measure(instances, context_provider)
+    if "promptfoo_metrics" in eval_metrics:
+        promptfoo_results = await eval_metrics["promptfoo_metrics"].measure(
+            instances, context_provider
+        )
     else:
-        return await deepeval_on_instances(instances, context_provider, eval_metric)
+        promptfoo_results = {}
+    deepeval_results = await deepeval_on_instances(
+        instances, context_provider, eval_metrics["deepeval_metrics"]
+    )
+
+    results = promptfoo_results | deepeval_results
+
+    return results
 
 
 if __name__ == "__main__":
@@ -89,11 +106,11 @@ if __name__ == "__main__":
         help="RAG option to use for providing context",
     )
     parser.add_argument("--num_samples", type=int, default=500)
-    parser.add_argument("--metric_name", type=str, default="Correctness")
+    parser.add_argument("--metrics", type=str, nargs="+", default=["Correctness"])
 
     args = parser.parse_args()
 
-    avg_score = asyncio.run(
-        eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metric_name)
+    avg_scores = asyncio.run(
+        eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metrics)
     )
-    logger.info(f"Average {args.metric_name}: {avg_score}")
+    logger.info(f"{avg_scores}")
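As a rough usage sketch (not part of the patch itself): `eval_on_QA_dataset` now accepts a list of metric names and returns one averaged score per metric instead of a single float. The dataset, RAG option, and metric names below are the sample values used elsewhere in this changeset.

```python
import asyncio

from evals.eval_on_hotpot import eval_on_QA_dataset

# "hotpotqa", "cognee", and the metric names are sample values from qa_eval_parameters.json;
# the promptfoo-prefixed name assumes a "correctness" judge prompt exists in llm_judge_prompts.
avg_scores = asyncio.run(
    eval_on_QA_dataset("hotpotqa", "cognee", 2, ["Correctness", "promptfoo.correctness"])
)

# avg_scores maps each metric's reported name to its mean score over the sampled instances.
print(avg_scores)
```
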
diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py
index addd0030a..ee0eaf80a 100644
--- a/evals/promptfoo_metrics.py
+++ b/evals/promptfoo_metrics.py
@@ -3,19 +3,42 @@ import os
 import yaml
 import json
 import shutil
+from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts
+
+
+def is_valid_promptfoo_metric(metric_name: str):
+    try:
+        prefix, suffix = metric_name.split(".")
+    except ValueError:
+        return False
+    if prefix != "promptfoo":
+        return False
+    if suffix not in llm_judge_prompts:
+        return False
+    return True
 
 
 class PromptfooMetric:
-    def __init__(self, judge_prompt):
+    def __init__(self, metric_name_list):
         promptfoo_path = shutil.which("promptfoo")
         self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path)
-        self.judge_prompt = judge_prompt
+        self.prompts = {}
+        for metric_name in metric_name_list:
+            if is_valid_promptfoo_metric(metric_name):
+                self.prompts[metric_name] = llm_judge_prompts[metric_name.split(".")[1]]
+            else:
+                raise Exception(f"{metric_name} is not a valid promptfoo metric")
 
     async def measure(self, instances, context_provider):
         with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file:
             config = yaml.safe_load(file)
 
-        config["defaultTest"] = [{"assert": {"type": "llm_rubric", "value": self.judge_prompt}}]
+        config["defaultTest"] = {
+            "assert": [
+                {"type": "llm-rubric", "value": prompt, "name": metric_name}
+                for metric_name, prompt in self.prompts.items()
+            ]
+        }
 
         # Fill config file with test cases
         tests = []
@@ -48,6 +71,9 @@ class PromptfooMetric:
         with open(file_path, "r") as file:
             results = json.load(file)
 
-        self.score = results["results"]["prompts"][0]["metrics"]["score"]
+        scores = {}
 
-        return self.score
+        for result in results["results"]["results"][0]["gradingResult"]["componentResults"]:
+            scores[result["assertion"]["name"]] = result["score"]
+
+        return scores
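For reference, the intended behaviour of the new validator, assuming `correctness` is one of the keys in `llm_judge_prompts` (as the removed code implies):

```python
from evals.promptfoo_metrics import is_valid_promptfoo_metric

is_valid_promptfoo_metric("promptfoo.correctness")    # True
is_valid_promptfoo_metric("promptfoo.not_a_prompt")   # False: unknown judge prompt key
is_valid_promptfoo_metric("Correctness")              # False: missing "promptfoo." prefix
```
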
instance["question"], limit=5) diff --git a/evals/qa_eval_parameters.json b/evals/qa_eval_parameters.json new file mode 100644 index 000000000..539e5f32c --- /dev/null +++ b/evals/qa_eval_parameters.json @@ -0,0 +1,18 @@ +{ + "dataset": [ + "hotpotqa" + ], + "rag_option": [ + "no_rag", + "cognee", + "simple_rag", + "brute_force" + ], + "num_samples": [ + 2 + ], + "metric_names": [ + "Correctness", + "Comprehensiveness" + ] +} diff --git a/evals/qa_eval_utils.py b/evals/qa_eval_utils.py new file mode 100644 index 000000000..395711c6a --- /dev/null +++ b/evals/qa_eval_utils.py @@ -0,0 +1,60 @@ +import itertools +import matplotlib.pyplot as plt +from jsonschema import ValidationError, validate +import pandas as pd +from pathlib import Path + +paramset_json_schema = { + "type": "object", + "properties": { + "dataset": { + "type": "array", + "items": {"type": "string"}, + }, + "rag_option": { + "type": "array", + "items": {"type": "string"}, + }, + "num_samples": { + "type": "array", + "items": {"type": "integer", "minimum": 1}, + }, + "metric_names": { + "type": "array", + "items": {"type": "string"}, + }, + }, + "required": ["dataset", "rag_option", "num_samples", "metric_names"], + "additionalProperties": False, +} + + +def save_table_as_image(df, image_path): + plt.figure(figsize=(10, 6)) + plt.axis("tight") + plt.axis("off") + plt.table(cellText=df.values, colLabels=df.columns, rowLabels=df.index, loc="center") + plt.title(f"{df.index.name}") + plt.savefig(image_path, bbox_inches="tight") + plt.close() + + +def save_results_as_image(results, out_path): + for dataset, num_samples_data in results.items(): + for num_samples, table_data in num_samples_data.items(): + df = pd.DataFrame.from_dict(table_data, orient="index") + df.index.name = f"Dataset: {dataset}, Num Samples: {num_samples}" + image_path = Path(out_path) / Path(f"table_{dataset}_{num_samples}.png") + save_table_as_image(df, image_path) + + +def get_combinations(parameters): + try: + validate(instance=parameters, schema=paramset_json_schema) + except ValidationError as e: + raise ValidationError(f"Invalid parameter set: {e.message}") + + params_for_combos = {k: v for k, v in parameters.items() if k != "metric_name"} + keys, values = zip(*params_for_combos.items()) + combinations = [dict(zip(keys, combo)) for combo in itertools.product(*values)] + return combinations diff --git a/evals/qa_metrics_utils.py b/evals/qa_metrics_utils.py index 107fe429d..80d3bc16f 100644 --- a/evals/qa_metrics_utils.py +++ b/evals/qa_metrics_utils.py @@ -7,10 +7,9 @@ from evals.deepeval_metrics import ( f1_score_metric, em_score_metric, ) -from evals.promptfoo_metrics import PromptfooMetric from deepeval.metrics import AnswerRelevancyMetric import deepeval.metrics -from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts +from evals.promptfoo_metrics import is_valid_promptfoo_metric, PromptfooMetric native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric} @@ -24,18 +23,10 @@ custom_deepeval_metrics = { "EM": em_score_metric, } -promptfoo_metrics = { - "promptfoo.correctness": PromptfooMetric(llm_judge_prompts["correctness"]), - "promptfoo.comprehensiveness": PromptfooMetric(llm_judge_prompts["comprehensiveness"]), - "promptfoo.diversity": PromptfooMetric(llm_judge_prompts["diversity"]), - "promptfoo.empowerment": PromptfooMetric(llm_judge_prompts["empowerment"]), - "promptfoo.directness": PromptfooMetric(llm_judge_prompts["directness"]), -} - -qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | 
diff --git a/evals/qa_metrics_utils.py b/evals/qa_metrics_utils.py
index 107fe429d..80d3bc16f 100644
--- a/evals/qa_metrics_utils.py
+++ b/evals/qa_metrics_utils.py
@@ -7,10 +7,9 @@ from evals.deepeval_metrics import (
     f1_score_metric,
     em_score_metric,
 )
-from evals.promptfoo_metrics import PromptfooMetric
 from deepeval.metrics import AnswerRelevancyMetric
 import deepeval.metrics
-from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts
+from evals.promptfoo_metrics import is_valid_promptfoo_metric, PromptfooMetric
 
 
 native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric}
@@ -24,18 +23,10 @@ custom_deepeval_metrics = {
     "EM": em_score_metric,
 }
 
-promptfoo_metrics = {
-    "promptfoo.correctness": PromptfooMetric(llm_judge_prompts["correctness"]),
-    "promptfoo.comprehensiveness": PromptfooMetric(llm_judge_prompts["comprehensiveness"]),
-    "promptfoo.diversity": PromptfooMetric(llm_judge_prompts["diversity"]),
-    "promptfoo.empowerment": PromptfooMetric(llm_judge_prompts["empowerment"]),
-    "promptfoo.directness": PromptfooMetric(llm_judge_prompts["directness"]),
-}
-
-qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics
+qa_metrics = native_deepeval_metrics | custom_deepeval_metrics
 
 
-def get_metric(metric_name: str):
+def get_deepeval_metric(metric_name: str):
     if metric_name in qa_metrics:
         metric = qa_metrics[metric_name]
     else:
@@ -49,3 +40,27 @@ def get_metric(metric_name: str):
         metric = metric()
 
     return metric
+
+
+def get_metrics(metric_name_list: list[str]):
+    metrics = {
+        "deepeval_metrics": [],
+    }
+
+    promptfoo_metric_names = []
+
+    for metric_name in metric_name_list:
+        if (
+            (metric_name in native_deepeval_metrics)
+            or (metric_name in custom_deepeval_metrics)
+            or hasattr(deepeval.metrics, metric_name)
+        ):
+            metric = get_deepeval_metric(metric_name)
+            metrics["deepeval_metrics"].append(metric)
+        elif is_valid_promptfoo_metric(metric_name):
+            promptfoo_metric_names.append(metric_name)
+
+    if len(promptfoo_metric_names) > 0:
+        metrics["promptfoo_metrics"] = PromptfooMetric(promptfoo_metric_names)
+
+    return metrics
diff --git a/evals/run_qa_eval.py b/evals/run_qa_eval.py
new file mode 100644
index 000000000..97bea1847
--- /dev/null
+++ b/evals/run_qa_eval.py
@@ -0,0 +1,57 @@
+import asyncio
+from evals.eval_on_hotpot import eval_on_QA_dataset
+from evals.qa_eval_utils import get_combinations, save_results_as_image
+import argparse
+from pathlib import Path
+import json
+
+
+async def run_evals_on_paramset(paramset: dict, out_path: str):
+    combinations = get_combinations(paramset)
+    json_path = Path(out_path) / Path("results.json")
+    results = {}
+    for params in combinations:
+        dataset = params["dataset"]
+        num_samples = params["num_samples"]
+        rag_option = params["rag_option"]
+
+        result = await eval_on_QA_dataset(
+            dataset,
+            rag_option,
+            num_samples,
+            paramset["metric_names"],
+        )
+
+        if dataset not in results:
+            results[dataset] = {}
+        if num_samples not in results[dataset]:
+            results[dataset][num_samples] = {}
+
+        results[dataset][num_samples][rag_option] = result
+
+        with open(json_path, "w") as file:
+            json.dump(results, file, indent=1)
+
+    save_results_as_image(results, out_path)
+
+    return results
+
+
+async def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--params_file", type=str, required=True, help="Path to the JSON file with eval parameters"
+    )
+    parser.add_argument("--out_dir", type=str, required=True, help="Directory to save eval results")
+
+    args = parser.parse_args()
+
+    with open(args.params_file, "r") as file:
+        parameters = json.load(file)
+
+    await run_evals_on_paramset(parameters, args.out_dir)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
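A hypothetical end-to-end invocation of the new runner; the parameter file is the one added in this changeset, and the output directory is an arbitrary example that is assumed to already exist:

```python
import asyncio
import json

from evals.run_qa_eval import run_evals_on_paramset

with open("evals/qa_eval_parameters.json", "r") as file:
    paramset = json.load(file)

# Writes results.json plus one table image per (dataset, num_samples) pair into the directory.
results = asyncio.run(run_evals_on_paramset(paramset, "evals/qa_eval_results"))
```

The same run can be driven from the command line through the `--params_file` and `--out_dir` arguments of `evals/run_qa_eval.py`.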