From 5e076689adadba8d02f1a7de0a4562a868efad09 Mon Sep 17 00:00:00 2001
From: alekszievr <44192193+alekszievr@users.noreply.github.com>
Date: Tue, 28 Jan 2025 13:05:22 +0100
Subject: [PATCH] Feat: [COG-1074] fix multimetric eval bug (#463)

* feat: make tasks a configurable argument in the cognify function

* fix: add data points task

* Ugly hack for multi-metric eval bug

* some cleanup

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
---
 evals/eval_on_hotpot.py          |  5 ++-
 evals/multimetric_qa_eval_run.py | 75 ++++++++++++++++++++++++++++++++
 evals/qa_eval_utils.py           |  9 +++-
 3 files changed, 85 insertions(+), 4 deletions(-)
 create mode 100644 evals/multimetric_qa_eval_run.py

diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py
index b0591a1eb..4f3198d87 100644
--- a/evals/eval_on_hotpot.py
+++ b/evals/eval_on_hotpot.py
@@ -171,17 +171,18 @@ async def main():
     )
     parser.add_argument("--num_samples", type=int, default=500)
     parser.add_argument("--metrics", type=str, nargs="+", default=["Correctness"])
+    parser.add_argument("--out_dir", type=str, help="Dir to save eval results")
 
     args = parser.parse_args()
 
     if args.rag_option == "cognee_incremental":
         avg_scores = await incremental_eval_on_QA_dataset(
-            args.dataset, args.num_samples, args.metrics
+            args.dataset, args.num_samples, args.metrics, args.out_dir
         )
     else:
         avg_scores = await eval_on_QA_dataset(
-            args.dataset, args.rag_option, args.num_samples, args.metrics
+            args.dataset, args.rag_option, args.num_samples, args.metrics, args.out_dir
         )
 
     logger.info(f"{avg_scores}")
 
diff --git a/evals/multimetric_qa_eval_run.py b/evals/multimetric_qa_eval_run.py
new file mode 100644
index 000000000..7f219e8b9
--- /dev/null
+++ b/evals/multimetric_qa_eval_run.py
@@ -0,0 +1,75 @@
+import subprocess
+import json
+import argparse
+import os
+from typing import List
+import sys
+
+
+def run_command(command: List[str]):
+    try:
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1
+        )
+
+        while True:
+            stdout_line = process.stdout.readline()
+            stderr_line = process.stderr.readline()
+
+            if stdout_line == "" and stderr_line == "" and process.poll() is not None:
+                break
+
+            if stdout_line:
+                print(stdout_line.rstrip())
+            if stderr_line:
+                print(f"Error: {stderr_line.rstrip()}", file=sys.stderr)
+
+        if process.returncode != 0:
+            raise subprocess.CalledProcessError(process.returncode, command)
+    finally:
+        process.stdout.close()
+        process.stderr.close()
+
+
+def run_evals_for_paramsfile(params_file, out_dir):
+    with open(params_file, "r") as file:
+        parameters = json.load(file)
+
+    for metric in parameters["metric_names"]:
+        params = parameters
+        params["metric_names"] = [metric]
+
+        temp_paramfile = params_file.replace(".json", f"_{metric}.json")
+        with open(temp_paramfile, "w") as file:
+            json.dump(params, file)
+
+        command = [
+            "python",
+            "evals/run_qa_eval.py",
+            "--params_file",
+            temp_paramfile,
+            "--out_dir",
+            out_dir,
+        ]
+
+        run_command(command)
+
+        if os.path.exists(temp_paramfile):
+            os.remove(temp_paramfile)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--params_file", type=str, required=True, help="Which dataset to evaluate on"
+    )
+    parser.add_argument("--out_dir", type=str, help="Dir to save eval results")
+
+    args = parser.parse_args()
+
+    run_evals_for_paramsfile(args.params_file, args.out_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evals/qa_eval_utils.py b/evals/qa_eval_utils.py
index 16817ef28..f49f51f24 100644
--- a/evals/qa_eval_utils.py
+++ b/evals/qa_eval_utils.py
@@ -42,9 +42,13 @@ def save_table_as_image(df, image_path):
 def save_results_as_image(results, out_path):
     for dataset, num_samples_data in results.items():
         for num_samples, table_data in num_samples_data.items():
+            for rag_option, metric_data in table_data.items():
+                for name, value in metric_data.items():
+                    metric_name = name
+                    break
             df = pd.DataFrame.from_dict(table_data, orient="index")
             df.index.name = f"Dataset: {dataset}, Num Samples: {num_samples}"
-            image_path = out_path / Path(f"table_{dataset}_{num_samples}.png")
+            image_path = out_path / Path(f"table_{dataset}_{num_samples}_{metric_name}.png")
             save_table_as_image(df, image_path)
 
 
@@ -54,7 +58,8 @@ def get_combinations(parameters):
     except ValidationError as e:
         raise ValidationError(f"Invalid parameter set: {e.message}")
 
-    params_for_combos = {k: v for k, v in parameters.items() if k != "metric_name"}
+    # params_for_combos = {k: v for k, v in parameters.items() if k != "metric_name"}
+    params_for_combos = {k: v for k, v in parameters.items()}
     keys, values = zip(*params_for_combos.items())
     combinations = [dict(zip(keys, combo)) for combo in itertools.product(*values)]
     return combinations
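
Usage note: the new evals/multimetric_qa_eval_run.py works around the multi-metric bug by splitting a params file into one temporary single-metric copy per entry in "metric_names" and launching evals/run_qa_eval.py once per metric; any other keys in the params file are written to the temporary file unchanged. Below is a minimal sketch of driving it programmatically; the params file path, output directory, and the second metric name are hypothetical, only "Correctness" appears in the patch itself, and it assumes evals/ is importable from the repository root.

    import json

    from evals.multimetric_qa_eval_run import run_evals_for_paramsfile

    # Hypothetical params file: only "metric_names" is consumed by the wrapper;
    # remaining keys would be forwarded unchanged to evals/run_qa_eval.py.
    params = {
        "metric_names": ["Correctness", "SomeOtherMetric"],  # second name is a placeholder
    }
    with open("evals/qa_eval_params.json", "w") as f:
        json.dump(params, f)

    # Runs evals/run_qa_eval.py once per metric, saving results under the given out_dir.
    run_evals_for_paramsfile("evals/qa_eval_params.json", "evals/eval_results")

The same run can be started from the command line with python evals/multimetric_qa_eval_run.py --params_file <params.json> --out_dir <dir>, using the argparse flags defined in the new script.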