* QA eval dataset as argument, with hotpot and 2wikimultihop as options; JSON schema validation for datasets
* Load dataset file by filename, outsource utilities
* Restructure metric selection
* Add comprehensiveness, diversity and empowerment metrics
* Add promptfoo as an option
* Refactor RAG solution in eval
* LLM-as-a-judge metrics implemented in a uniform way
* Use requests.get instead of wget
* Clean up promptfoo config template
* Minor fixes
* Get promptfoo path instead of hardcoding
* Minor fixes
* Add LLM-as-a-judge prompts
* Minor refactor and logger usage
53 lines · 1.8 KiB · Python
import json
import os
import shutil

import yaml

from evals.promptfoo_wrapper import PromptfooWrapper


class PromptfooMetric:
    def __init__(self, judge_prompt):
        # Resolve the promptfoo executable from PATH rather than hardcoding its location.
        promptfoo_path = shutil.which("promptfoo")
        self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path)
        self.judge_prompt = judge_prompt

    async def measure(self, instances, context_provider):
        # Load the promptfoo config template and attach the judge prompt as an
        # LLM-rubric assertion applied to every test case.
        with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file:
            config = yaml.safe_load(file)

        config["defaultTest"] = [{"assert": {"type": "llm_rubric", "value": self.judge_prompt}}]
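
        # Illustrative sketch only (not an authoritative promptfoo schema): once the
        # loop below has filled in the test cases, the generated config roughly looks like
        #
        #   defaultTest: [{assert: {type: llm_rubric, value: <judge prompt>}}]
        #   tests:
        #     - vars:
        #         name: <first 15 chars of the question>
        #         question: <question>
        #         context: <retrieved context>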
        # Fill the config with one test case per dataset instance.
        tests = []
        for instance in instances:
            context = await context_provider(instance)
            test = {
                "vars": {
                    "name": instance["question"][:15],
                    "question": instance["question"],
                    "context": context,
                }
            }
            tests.append(test)
        config["tests"] = tests

        # Write the updated config back to disk and run promptfoo against it.
        updated_yaml_file_path = os.path.join(os.getcwd(), "config_with_context.yaml")
        with open(updated_yaml_file_path, "w") as file:
            yaml.dump(config, file)

        self.wrapper.run_eval(
            prompt_file=os.path.join(os.getcwd(), "evals/promptfooprompt.json"),
            config_file=updated_yaml_file_path,
            out_format="json",
        )
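
        # The parsing below assumes benchmark_results.json roughly follows this shape
        # (only the fields read here are shown; an assumption, not a documented format):
        #
        #   {"results": {"prompts": [{"metrics": {"score": <aggregate score>}}]}}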

        file_path = os.path.join(os.getcwd(), "benchmark_results.json")

        # Read and parse the JSON file
        with open(file_path, "r") as file:
            results = json.load(file)

        self.score = results["results"]["prompts"][0]["metrics"]["score"]
        return self.score
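
For context, here is a minimal sketch of how this metric might be driven from a separate script. The module path evals.promptfoo_metric, the fetch_context helper, the judge prompt, and the instance contents are assumptions made for illustration; actually running it also requires promptfoo on PATH plus the evals/promptfoo_config_template.yaml and evals/promptfooprompt.json files referenced above.

import asyncio

from evals.promptfoo_metric import PromptfooMetric  # assumed module path for the class above


async def fetch_context(instance):
    # Hypothetical context provider: return retrieved passages for the question.
    return "Retrieved passages for: " + instance["question"]


async def main():
    # Illustrative judge prompt; the real prompts live elsewhere in the evals package.
    metric = PromptfooMetric("Rate the answer's comprehensiveness on a 0-1 scale.")
    instances = [{"question": "Who directed the 2017 adaptation of It?"}]
    score = await metric.measure(instances, fetch_context)
    print("promptfoo judge score:", score)


if __name__ == "__main__":
    asyncio.run(main())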