cognee/evals/promptfoo_metrics.py
alekszievr 8ec1e48ff6
Run eval on a set of parameters and save them as png and json (#443)
* QA eval dataset as argument, with hotpot and 2wikimultihop as options. Json schema validation for datasets.

* Load dataset file by filename, outsource utilities

* restructure metric selection

* Add comprehensiveness, diversity and empowerment metrics

* add promptfoo as an option

* refactor RAG solution in eval

* LLM as a judge metrics implemented in a uniform way

* Use requests.get instead of wget

* clean up promptfoo config template

* minor fixes

* get promptfoo path instead of hardcoding

* minor fixes

* Add LLM as a judge prompts

* Support 4 different rag options in eval

* Minor refactor and logger usage

* Run eval on a set of parameters and save results as json and png

* script for running all param combinations

* bugfix in simple rag

* potential fix: single asyncio run

* temp fix: exclude insights

* Remove insights, have single asyncio run, refactor

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
2025-01-17 00:18:51 +01:00

79 lines
2.7 KiB
Python

import json
import os
import shutil

import yaml

from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts
from evals.promptfoo_wrapper import PromptfooWrapper

def is_valid_promptfoo_metric(metric_name: str) -> bool:
    """Check that a metric name has the form 'promptfoo.<judge_prompt_key>'."""
    try:
        prefix, suffix = metric_name.split(".")
    except ValueError:
        return False
    if prefix != "promptfoo":
        return False
    if suffix not in llm_judge_prompts:
        return False
    return True
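
# Illustrative examples only; they assume llm_judge_prompts contains a "comprehensiveness"
# key (per the commit message above) and are not part of the original module:
#   is_valid_promptfoo_metric("promptfoo.comprehensiveness")  # True
#   is_valid_promptfoo_metric("comprehensiveness")            # False: missing "promptfoo." prefix
#   is_valid_promptfoo_metric("promptfoo.unknown_metric")     # False: no matching judge prompt
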
class PromptfooMetric:
    def __init__(self, metric_name_list):
        # Resolve the promptfoo CLI from PATH instead of hardcoding its location.
        promptfoo_path = shutil.which("promptfoo")
        self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path)

        # Map each requested metric to its LLM-as-a-judge rubric prompt.
        self.prompts = {}
        for metric_name in metric_name_list:
            if is_valid_promptfoo_metric(metric_name):
                self.prompts[metric_name] = llm_judge_prompts[metric_name.split(".")[1]]
            else:
                raise Exception(f"{metric_name} is not a valid promptfoo metric")

    async def measure(self, instances, context_provider):
        # Load the promptfoo config template and attach one llm-rubric assertion per metric.
        with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file:
            config = yaml.safe_load(file)

        config["defaultTest"] = {
            "assert": [
                {"type": "llm-rubric", "value": prompt, "name": metric_name}
                for metric_name, prompt in self.prompts.items()
            ]
        }

        # Fill the config with test cases: one per instance, using the retrieved context.
        tests = []
        for instance in instances:
            context = await context_provider(instance)
            test = {
                "vars": {
                    "name": instance["question"][:15],
                    "question": instance["question"],
                    "context": context,
                }
            }
            tests.append(test)
        config["tests"] = tests

        # Write the filled-in config to a new YAML file for this run.
        updated_yaml_file_path = os.path.join(os.getcwd(), "config_with_context.yaml")
        with open(updated_yaml_file_path, "w") as file:
            yaml.dump(config, file)

        self.wrapper.run_eval(
            prompt_file=os.path.join(os.getcwd(), "evals/promptfooprompt.json"),
            config_file=updated_yaml_file_path,
            out_format="json",
        )

        # Read and parse the JSON results produced by promptfoo.
        file_path = os.path.join(os.getcwd(), "benchmark_results.json")
        with open(file_path, "r") as file:
            results = json.load(file)

        # Collect one score per configured assertion.
        scores = {}
        for result in results["results"]["results"][0]["gradingResult"]["componentResults"]:
            scores[result["assertion"]["name"]] = result["score"]
        return scores
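

# Usage sketch (illustrative, not part of the original module): shows one way PromptfooMetric
# might be driven end to end. The metric names, the sample instance, and dummy_context_provider
# below are assumptions for illustration; measure() also expects the evals/ template files
# referenced above to exist relative to the working directory.
if __name__ == "__main__":
    import asyncio

    async def dummy_context_provider(instance):
        # A real provider would retrieve context for the question (e.g. via a RAG pipeline).
        return instance.get("context", "")

    async def main():
        metric = PromptfooMetric(["promptfoo.comprehensiveness", "promptfoo.diversity"])
        instances = [
            {
                "question": "Who wrote Hamlet?",
                "context": "Hamlet is a tragedy written by William Shakespeare.",
            }
        ]
        scores = await metric.measure(instances, dummy_context_provider)
        print(scores)

    asyncio.run(main())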