Run eval on a set of parameters and save them as png and json (#443)
* QA eval dataset as argument, with hotpot and 2wikimultihop as options; JSON schema validation for datasets
* Load dataset file by filename, outsource utilities
* Restructure metric selection
* Add comprehensiveness, diversity and empowerment metrics
* Add promptfoo as an option
* Refactor RAG solution in eval
* LLM-as-a-judge metrics implemented in a uniform way
* Use requests.get instead of wget
* Clean up promptfoo config template
* Minor fixes
* Get promptfoo path instead of hardcoding
* Minor fixes
* Add LLM-as-a-judge prompts
* Support 4 different RAG options in eval
* Minor refactor and logger usage
* Run eval on a set of parameters and save results as json and png
* Script for running all param combinations
* Bugfix in simple RAG
* Potential fix: single asyncio run
* Temp fix: exclude insights
* Remove insights, have single asyncio run, refactor

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
This commit is contained in:
parent 6c6ba3270c · commit 8ec1e48ff6
7 changed files with 237 additions and 38 deletions
evals/eval_on_hotpot.py

@@ -8,7 +8,7 @@ import logging
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
 from evals.qa_dataset_utils import load_qa_dataset
-from evals.qa_metrics_utils import get_metric
+from evals.qa_metrics_utils import get_metrics
 from evals.qa_context_provider_utils import qa_context_providers
 
 logger = logging.getLogger(__name__)
@@ -34,7 +34,7 @@ async def answer_qa_instance(instance, context_provider):
     return answer_prediction
 
 
-async def deepeval_answers(instances, answers, eval_metric):
+async def deepeval_answers(instances, answers, eval_metrics):
     test_cases = []
 
     for instance, answer in zip(instances, answers):
@@ -44,37 +44,54 @@ async def deepeval_answers(instances, answers, eval_metric):
         test_cases.append(test_case)
 
     eval_set = EvaluationDataset(test_cases)
-    eval_results = eval_set.evaluate([eval_metric])
+    eval_results = eval_set.evaluate(eval_metrics)
 
     return eval_results
 
 
-async def deepeval_on_instances(instances, context_provider, eval_metric):
+async def deepeval_on_instances(instances, context_provider, eval_metrics):
     answers = []
     for instance in tqdm(instances, desc="Getting answers"):
         answer = await answer_qa_instance(instance, context_provider)
         answers.append(answer)
 
-    eval_results = await deepeval_answers(instances, answers, eval_metric)
-    avg_score = statistics.mean(
-        [result.metrics_data[0].score for result in eval_results.test_results]
-    )
+    eval_results = await deepeval_answers(instances, answers, eval_metrics)
+    score_lists_dict = {}
+    for instance_result in eval_results.test_results:
+        for metric_result in instance_result.metrics_data:
+            if metric_result.name not in score_lists_dict:
+                score_lists_dict[metric_result.name] = []
+            score_lists_dict[metric_result.name].append(metric_result.score)
 
-    return avg_score
+    avg_scores = {
+        metric_name: statistics.mean(scorelist)
+        for metric_name, scorelist in score_lists_dict.items()
+    }
+
+    return avg_scores
 
 
 async def eval_on_QA_dataset(
-    dataset_name_or_filename: str, context_provider_name, num_samples, eval_metric_name
+    dataset_name_or_filename: str, context_provider_name, num_samples, metric_name_list
 ):
     dataset = load_qa_dataset(dataset_name_or_filename)
     context_provider = qa_context_providers[context_provider_name]
-    eval_metric = get_metric(eval_metric_name)
+    eval_metrics = get_metrics(metric_name_list)
     instances = dataset if not num_samples else dataset[:num_samples]
 
-    if eval_metric_name.startswith("promptfoo"):
-        return await eval_metric.measure(instances, context_provider)
+    if "promptfoo_metrics" in eval_metrics:
+        promptfoo_results = await eval_metrics["promptfoo_metrics"].measure(
+            instances, context_provider
+        )
     else:
-        return await deepeval_on_instances(instances, context_provider, eval_metric)
+        promptfoo_results = {}
+
+    deepeval_results = await deepeval_on_instances(
+        instances, context_provider, eval_metrics["deepeval_metrics"]
+    )
+
+    results = promptfoo_results | deepeval_results
+
+    return results
 
 
 if __name__ == "__main__":
@@ -89,11 +106,11 @@ if __name__ == "__main__":
         help="RAG option to use for providing context",
     )
     parser.add_argument("--num_samples", type=int, default=500)
-    parser.add_argument("--metric_name", type=str, default="Correctness")
+    parser.add_argument("--metrics", type=str, nargs="+", default=["Correctness"])
 
     args = parser.parse_args()
 
-    avg_score = asyncio.run(
-        eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metric_name)
+    avg_scores = asyncio.run(
+        eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metrics)
     )
-    logger.info(f"Average {args.metric_name}: {avg_score}")
+    logger.info(f"{avg_scores}")
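Note: the hunks above change eval_on_QA_dataset to accept a list of metric names and to return a single merged dict of average scores. A minimal usage sketch, not part of the diff, assuming a configured cognee/deepeval environment and promptfoo on PATH; the argument values are taken from the sample parameter file further down:

    import asyncio
    from evals.eval_on_hotpot import eval_on_QA_dataset

    # "hotpotqa", "cognee" and "Correctness" appear in evals/qa_eval_parameters.json;
    # "promptfoo.comprehensiveness" is one of the promptfoo judge metrics.
    avg_scores = asyncio.run(
        eval_on_QA_dataset("hotpotqa", "cognee", 2, ["Correctness", "promptfoo.comprehensiveness"])
    )
    print(avg_scores)  # flat dict: reported metric name -> mean score over the sampled instances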
evals/promptfoo_metrics.py

@@ -3,19 +3,42 @@ import os
 import yaml
 import json
 import shutil
+from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts
+
+
+def is_valid_promptfoo_metric(metric_name: str):
+    try:
+        prefix, suffix = metric_name.split(".")
+    except ValueError:
+        return False
+    if prefix != "promptfoo":
+        return False
+    if suffix not in llm_judge_prompts:
+        return False
+    return True
 
 
 class PromptfooMetric:
-    def __init__(self, judge_prompt):
+    def __init__(self, metric_name_list):
         promptfoo_path = shutil.which("promptfoo")
         self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path)
-        self.judge_prompt = judge_prompt
+        self.prompts = {}
+        for metric_name in metric_name_list:
+            if is_valid_promptfoo_metric(metric_name):
+                self.prompts[metric_name] = llm_judge_prompts[metric_name.split(".")[1]]
+            else:
+                raise Exception(f"{metric_name} is not a valid promptfoo metric")
 
     async def measure(self, instances, context_provider):
         with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file:
            config = yaml.safe_load(file)
 
-        config["defaultTest"] = [{"assert": {"type": "llm_rubric", "value": self.judge_prompt}}]
+        config["defaultTest"] = {
+            "assert": [
+                {"type": "llm-rubric", "value": prompt, "name": metric_name}
+                for metric_name, prompt in self.prompts.items()
+            ]
+        }
 
         # Fill config file with test cases
         tests = []
@@ -48,6 +71,9 @@ class PromptfooMetric:
         with open(file_path, "r") as file:
             results = json.load(file)
 
-        self.score = results["results"]["prompts"][0]["metrics"]["score"]
+        scores = {}
 
-        return self.score
+        for result in results["results"]["results"][0]["gradingResult"]["componentResults"]:
+            scores[result["assertion"]["name"]] = result["score"]
+
+        return scores
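Note: PromptfooMetric now bundles several LLM-as-a-judge rubrics into one promptfoo run and returns one score per named assertion. An illustrative construction, not from the diff, assuming the promptfoo CLI is installed:

    from evals.promptfoo_metrics import PromptfooMetric, is_valid_promptfoo_metric

    # valid names have the form "promptfoo.<key in llm_judge_prompts>", e.g. correctness,
    # comprehensiveness, diversity, empowerment, directness
    assert is_valid_promptfoo_metric("promptfoo.comprehensiveness")

    metric = PromptfooMetric(["promptfoo.correctness", "promptfoo.comprehensiveness"])
    # metric.prompts maps each name to its judge prompt; measure() injects one
    # llm-rubric assertion per entry into the promptfoo config template.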
evals/qa_context_provider_utils.py

@@ -21,9 +21,11 @@ async def cognify_instance(instance: dict):
 async def get_context_with_cognee(instance: dict) -> str:
     await cognify_instance(instance)
 
-    insights = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"])
+    # TODO: Fix insights
+    # insights = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"])
     summaries = await cognee.search(SearchType.SUMMARIES, query_text=instance["question"])
-    search_results = insights + summaries
+    # search_results = insights + summaries
+    search_results = summaries
 
     search_results_str = "\n".join([context_item["text"] for context_item in search_results])
 
@@ -31,7 +33,11 @@ async def get_context_with_cognee(instance: dict) -> str:
 
 
 async def get_context_with_simple_rag(instance: dict) -> str:
-    await cognify_instance(instance)
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    for title, sentences in instance["context"]:
+        await cognee.add("\n".join(sentences), dataset_name="QA")
 
     vector_engine = get_vector_engine()
     found_chunks = await vector_engine.search("document_chunk_text", instance["question"], limit=5)
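Note: both context providers consume HotpotQA-style instances, where "context" is a list of (title, sentences) pairs. An illustrative instance with made-up content, shown only to clarify the shape the loop above iterates over:

    instance = {
        "question": "Who wrote the novel the 1999 film was based on?",
        "context": [
            ["Some Novel", ["Some Novel is a book by Jane Doe.", "It was published in 1990."]],
            ["Some Film", ["The 1999 film is based on Some Novel."]],
        ],
    }
    # context_str = await get_context_with_simple_rag(instance)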
evals/qa_eval_parameters.json  (new file, +18)

@@ -0,0 +1,18 @@
+{
+    "dataset": [
+        "hotpotqa"
+    ],
+    "rag_option": [
+        "no_rag",
+        "cognee",
+        "simple_rag",
+        "brute_force"
+    ],
+    "num_samples": [
+        2
+    ],
+    "metric_names": [
+        "Correctness",
+        "Comprehensiveness"
+    ]
+}
evals/qa_eval_utils.py  (new file, +60)

@@ -0,0 +1,60 @@
+import itertools
+import matplotlib.pyplot as plt
+from jsonschema import ValidationError, validate
+import pandas as pd
+from pathlib import Path
+
+paramset_json_schema = {
+    "type": "object",
+    "properties": {
+        "dataset": {
+            "type": "array",
+            "items": {"type": "string"},
+        },
+        "rag_option": {
+            "type": "array",
+            "items": {"type": "string"},
+        },
+        "num_samples": {
+            "type": "array",
+            "items": {"type": "integer", "minimum": 1},
+        },
+        "metric_names": {
+            "type": "array",
+            "items": {"type": "string"},
+        },
+    },
+    "required": ["dataset", "rag_option", "num_samples", "metric_names"],
+    "additionalProperties": False,
+}
+
+
+def save_table_as_image(df, image_path):
+    plt.figure(figsize=(10, 6))
+    plt.axis("tight")
+    plt.axis("off")
+    plt.table(cellText=df.values, colLabels=df.columns, rowLabels=df.index, loc="center")
+    plt.title(f"{df.index.name}")
+    plt.savefig(image_path, bbox_inches="tight")
+    plt.close()
+
+
+def save_results_as_image(results, out_path):
+    for dataset, num_samples_data in results.items():
+        for num_samples, table_data in num_samples_data.items():
+            df = pd.DataFrame.from_dict(table_data, orient="index")
+            df.index.name = f"Dataset: {dataset}, Num Samples: {num_samples}"
+            image_path = Path(out_path) / Path(f"table_{dataset}_{num_samples}.png")
+            save_table_as_image(df, image_path)
+
+
+def get_combinations(parameters):
+    try:
+        validate(instance=parameters, schema=paramset_json_schema)
+    except ValidationError as e:
+        raise ValidationError(f"Invalid parameter set: {e.message}")
+
+    params_for_combos = {k: v for k, v in parameters.items() if k != "metric_names"}
+    keys, values = zip(*params_for_combos.items())
+    combinations = [dict(zip(keys, combo)) for combo in itertools.product(*values)]
+    return combinations
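Note: get_combinations validates a parameter set against the schema above and expands everything except the metric list into a Cartesian product; the metrics apply to every run. Applied to the sample parameter file it yields four runs. A sketch, not from the diff:

    import json
    from evals.qa_eval_utils import get_combinations

    with open("evals/qa_eval_parameters.json") as f:
        paramset = json.load(f)

    for combo in get_combinations(paramset):
        print(combo)
    # {'dataset': 'hotpotqa', 'rag_option': 'no_rag', 'num_samples': 2}
    # {'dataset': 'hotpotqa', 'rag_option': 'cognee', 'num_samples': 2}
    # {'dataset': 'hotpotqa', 'rag_option': 'simple_rag', 'num_samples': 2}
    # {'dataset': 'hotpotqa', 'rag_option': 'brute_force', 'num_samples': 2}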
evals/qa_metrics_utils.py

@@ -7,10 +7,9 @@ from evals.deepeval_metrics import (
     f1_score_metric,
     em_score_metric,
 )
-from evals.promptfoo_metrics import PromptfooMetric
 from deepeval.metrics import AnswerRelevancyMetric
-from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts
-
+import deepeval.metrics
+from evals.promptfoo_metrics import is_valid_promptfoo_metric, PromptfooMetric
 
 native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric}
 
@@ -24,18 +23,10 @@ custom_deepeval_metrics = {
     "EM": em_score_metric,
 }
 
-promptfoo_metrics = {
-    "promptfoo.correctness": PromptfooMetric(llm_judge_prompts["correctness"]),
-    "promptfoo.comprehensiveness": PromptfooMetric(llm_judge_prompts["comprehensiveness"]),
-    "promptfoo.diversity": PromptfooMetric(llm_judge_prompts["diversity"]),
-    "promptfoo.empowerment": PromptfooMetric(llm_judge_prompts["empowerment"]),
-    "promptfoo.directness": PromptfooMetric(llm_judge_prompts["directness"]),
-}
-
-qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics
+qa_metrics = native_deepeval_metrics | custom_deepeval_metrics
 
 
-def get_metric(metric_name: str):
+def get_deepeval_metric(metric_name: str):
     if metric_name in qa_metrics:
         metric = qa_metrics[metric_name]
     else:
@@ -49,3 +40,27 @@ def get_metric(metric_name: str):
         metric = metric()
 
     return metric
+
+
+def get_metrics(metric_name_list: list[str]):
+    metrics = {
+        "deepeval_metrics": [],
+    }
+
+    promptfoo_metric_names = []
+
+    for metric_name in metric_name_list:
+        if (
+            (metric_name in native_deepeval_metrics)
+            or (metric_name in custom_deepeval_metrics)
+            or hasattr(deepeval.metrics, metric_name)
+        ):
+            metric = get_deepeval_metric(metric_name)
+            metrics["deepeval_metrics"].append(metric)
+        elif is_valid_promptfoo_metric(metric_name):
+            promptfoo_metric_names.append(metric_name)
+
+    if len(promptfoo_metric_names) > 0:
+        metrics["promptfoo_metrics"] = PromptfooMetric(promptfoo_metric_names)
+
+    return metrics
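Note: get_metrics splits a mixed list of metric names into deepeval metrics (native, custom, or anything exposed on deepeval.metrics) and a single PromptfooMetric covering all promptfoo.* names. Illustrative sketch, not from the diff:

    from evals.qa_metrics_utils import get_metrics

    metrics = get_metrics(["Correctness", "AnswerRelevancy", "promptfoo.diversity"])
    # metrics["deepeval_metrics"]  -> list of instantiated deepeval metric objects
    # metrics["promptfoo_metrics"] -> PromptfooMetric(["promptfoo.diversity"])
    # the "promptfoo_metrics" key is present only when at least one promptfoo.* name was requested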
evals/run_qa_eval.py  (new file, +57)

@@ -0,0 +1,57 @@
+import asyncio
+from evals.eval_on_hotpot import eval_on_QA_dataset
+from evals.qa_eval_utils import get_combinations, save_results_as_image
+import argparse
+from pathlib import Path
+import json
+
+
+async def run_evals_on_paramset(paramset: dict, out_path: str):
+    combinations = get_combinations(paramset)
+    json_path = Path(out_path) / Path("results.json")
+    results = {}
+    for params in combinations:
+        dataset = params["dataset"]
+        num_samples = params["num_samples"]
+        rag_option = params["rag_option"]
+
+        result = await eval_on_QA_dataset(
+            dataset,
+            rag_option,
+            num_samples,
+            paramset["metric_names"],
+        )
+
+        if dataset not in results:
+            results[dataset] = {}
+        if num_samples not in results[dataset]:
+            results[dataset][num_samples] = {}
+
+        results[dataset][num_samples][rag_option] = result
+
+        with open(json_path, "w") as file:
+            json.dump(results, file, indent=1)
+
+    save_results_as_image(results, out_path)
+
+    return results
+
+
+async def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--params_file", type=str, required=True, help="JSON file with the eval parameter set"
+    )
+    parser.add_argument("--out_dir", type=str, help="Dir to save eval results")
+
+    args = parser.parse_args()
+
+    with open(args.params_file, "r") as file:
+        parameters = json.load(file)
+
+    await run_evals_on_paramset(parameters, args.out_dir)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
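Note: taken together, a full sweep could be kicked off as sketched below (not part of the diff; "eval_results" is an arbitrary output directory and must already exist, since results.json and the PNG tables are written into it). The script can equivalently be run from the command line with --params_file and --out_dir.

    import asyncio, json
    from evals.run_qa_eval import run_evals_on_paramset

    with open("evals/qa_eval_parameters.json") as f:
        paramset = json.load(f)

    results = asyncio.run(run_evals_on_paramset(paramset, "eval_results"))
    # results is nested as dataset -> num_samples -> rag_option -> {metric name: average score},
    # mirroring the layout of the saved results.json and of each table_<dataset>_<num_samples>.png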