cognee/evals/promptfoo_metrics.py
alekszievr 4e3a666b33
Feat: Save and load contexts and answers for eval (#462)
* feat: make tasks a configurable argument in the cognify function
* fix: add data points task
* eval on random samples instead of first couple
* Save and load contexts and answers
* Fix random seed usage and handle empty descriptions
* include insights search in cognee option
* create output dir if doesnt exist

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
2025-01-22 16:17:01 +01:00

from evals.promptfoo_wrapper import PromptfooWrapper
import os
import yaml
import json
import shutil

from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts


def is_valid_promptfoo_metric(metric_name: str):
    """Accept only metric names of the form "promptfoo.<judge_prompt_key>"."""
    try:
        prefix, suffix = metric_name.split(".")
    except ValueError:
        return False
    if prefix != "promptfoo":
        return False
    if suffix not in llm_judge_prompts:
        return False
    return True
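
# Illustration only (the valid suffixes are whatever llm_judge_prompts defines;
# "correctness" below is an assumed example key, not taken from this module):
#   is_valid_promptfoo_metric("promptfoo.correctness")  # True, if "correctness" is a judge prompt
#   is_valid_promptfoo_metric("ragas.correctness")      # False: prefix must be "promptfoo"
#   is_valid_promptfoo_metric("promptfoo")              # False: no "." to split into prefix/suffix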


class PromptfooMetric:
    def __init__(self, metric_name_list):
        # Relies on the promptfoo CLI being available on PATH.
        promptfoo_path = shutil.which("promptfoo")
        self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path)

        # Map each requested metric name to its LLM-judge rubric prompt.
        self.prompts = {}
        for metric_name in metric_name_list:
            if is_valid_promptfoo_metric(metric_name):
                self.prompts[metric_name] = llm_judge_prompts[metric_name.split(".")[1]]
            else:
                raise Exception(f"{metric_name} is not a valid promptfoo metric")

    async def measure(self, instances, context_provider, contexts_filename):
        with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file:
            config = yaml.safe_load(file)

        # One llm-rubric assertion per configured metric, applied to every test.
        config["defaultTest"] = {
            "assert": [
                {"type": "llm-rubric", "value": prompt, "name": metric_name}
                for metric_name, prompt in self.prompts.items()
            ]
        }

        tests = []

        # Reuse previously saved contexts where available; compute and cache the rest.
        if os.path.exists(contexts_filename):
            with open(contexts_filename, "r") as file:
                preloaded_contexts = json.load(file)
        else:
            preloaded_contexts = {}

        for instance in instances:
            if instance["_id"] in preloaded_contexts:
                context = preloaded_contexts[instance["_id"]]
            else:
                context = await context_provider(instance)
                preloaded_contexts[instance["_id"]] = context

            test = {
                "vars": {
                    "name": instance["question"][:15],
                    "question": instance["question"],
                    "context": context,
                }
            }
            tests.append(test)

        config["tests"] = tests

        # Persist the contexts so later runs can skip recomputation.
        with open(contexts_filename, "w") as file:
            json.dump(preloaded_contexts, file)

        # Write the updated YAML back, preserving formatting and structure.
        updated_yaml_file_path = os.path.join(os.getcwd(), "config_with_context.yaml")
        with open(updated_yaml_file_path, "w") as file:
            yaml.dump(config, file)

        self.wrapper.run_eval(
            prompt_file=os.path.join(os.getcwd(), "evals/promptfooprompt.json"),
            config_file=updated_yaml_file_path,
            out_format="json",
        )

        # Read and parse the results promptfoo wrote to disk.
        file_path = os.path.join(os.getcwd(), "benchmark_results.json")
        with open(file_path, "r") as file:
            results = json.load(file)

        scores = {}
        for result in results["results"]["results"][0]["gradingResult"]["componentResults"]:
            scores[result["assertion"]["name"]] = result["score"]

        return scores
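

# --- Usage sketch (illustrative; not part of the original module) -------------
# A minimal example of how PromptfooMetric.measure() might be driven. The metric
# name, the context provider, the instance shape, and the contexts filename are
# assumptions inferred from how measure() reads its arguments above; running it
# also assumes the promptfoo CLI is installed and that the working directory
# contains evals/promptfoo_config_template.yaml and evals/promptfooprompt.json.
if __name__ == "__main__":
    import asyncio

    async def example_context_provider(instance):
        # Stand-in for a real retrieval step (e.g. a cognee search call).
        return f"Retrieved context for: {instance['question']}"

    async def main():
        metric = PromptfooMetric(["promptfoo.correctness"])  # assumed judge-prompt key
        instances = [{"_id": "q1", "question": "What does cognee store in its graph?"}]
        scores = await metric.measure(instances, example_context_provider, "contexts.json")
        print(scores)

    asyncio.run(main())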