diff --git a/evals/eval_framework/benchmark_adapters/base_benchmark_adapter.py b/evals/eval_framework/benchmark_adapters/base_benchmark_adapter.py index 325126b7b..9efa09e94 100644 --- a/evals/eval_framework/benchmark_adapters/base_benchmark_adapter.py +++ b/evals/eval_framework/benchmark_adapters/base_benchmark_adapter.py @@ -4,5 +4,7 @@ from typing import List, Optional class BaseBenchmarkAdapter(ABC): @abstractmethod - def load_corpus(self, limit: Optional[int] = None, seed: int = 42) -> List[str]: + def load_corpus( + self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False + ) -> List[str]: pass diff --git a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py b/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py index 2732a9d50..3020a5bb1 100644 --- a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py +++ b/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py @@ -2,7 +2,7 @@ import requests import os import json import random -from typing import Optional, Any +from typing import Optional, Any, List, Tuple from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter @@ -14,9 +14,55 @@ class HotpotQAAdapter(BaseBenchmarkAdapter): # distractor test: "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json" delete file after changing the url } + def __init__(self): + super().__init__() + self.metadata_field_name = "level" + + def _is_valid_supporting_fact(self, sentences: List[str], sentence_idx: Any) -> bool: + """Validates if a supporting fact index is valid for the given sentences.""" + return sentences and isinstance(sentence_idx, int) and 0 <= sentence_idx < len(sentences) + + def _get_golden_context(self, item: dict[str, Any]) -> str: + """Extracts and formats the golden context from supporting facts.""" + # Create a mapping of title to sentences for easy lookup + context_dict = {title: sentences for (title, sentences) in item["context"]} + + # Get all supporting facts in order + golden_contexts = [] + for title, sentence_idx in item["supporting_facts"]: + sentences = context_dict.get(title, []) + if not self._is_valid_supporting_fact(sentences, sentence_idx): + continue + golden_contexts.append(f"{title}: {sentences[sentence_idx]}") + + return "\n".join(golden_contexts) + + def _process_item( + self, + item: dict[str, Any], + corpus_list: List[str], + question_answer_pairs: List[dict[str, Any]], + load_golden_context: bool = False, + ) -> None: + """Processes a single item and adds it to the corpus and QA pairs.""" + for title, sentences in item["context"]: + corpus_list.append(" ".join(sentences)) + + qa_pair = { + "question": item["question"], + "answer": item["answer"].lower(), + self.metadata_field_name: item[self.metadata_field_name], + } + + if load_golden_context: + qa_pair["golden_context"] = self._get_golden_context(item) + + question_answer_pairs.append(qa_pair) + def load_corpus( - self, limit: Optional[int] = None, seed: int = 42 - ) -> tuple[list[str], list[dict[str, Any]]]: + self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False + ) -> Tuple[List[str], List[dict[str, Any]]]: + """Loads and processes the HotpotQA corpus, optionally with golden context.""" filename = self.dataset_info["filename"] if os.path.exists(filename): @@ -36,16 +82,8 @@ class HotpotQAAdapter(BaseBenchmarkAdapter): corpus_list = [] question_answer_pairs = [] - for item in corpus_json: - for title, sentences in item["context"]: - corpus_list.append(" ".join(sentences)) - question_answer_pairs.append( - { - "question": item["question"], - "answer": item["answer"].lower(), - "level": item["level"], - } - ) + for item in corpus_json: + self._process_item(item, corpus_list, question_answer_pairs, load_golden_context) return corpus_list, question_answer_pairs diff --git a/evals/eval_framework/benchmark_adapters/musique_adapter.py b/evals/eval_framework/benchmark_adapters/musique_adapter.py index 73118d73f..27cfe554b 100644 --- a/evals/eval_framework/benchmark_adapters/musique_adapter.py +++ b/evals/eval_framework/benchmark_adapters/musique_adapter.py @@ -1,7 +1,7 @@ import os import json import random -from typing import Optional, Any +from typing import Optional, Any, List import zipfile import gdown @@ -10,38 +10,71 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseB class MusiqueQAAdapter(BaseBenchmarkAdapter): - """ - Adapter to load and process the Musique QA dataset from a local .jsonl file. - Optionally downloads and unzips the dataset if it does not exist locally. - """ + """Adapter for the Musique QA dataset with local file loading and optional download.""" dataset_info = { - # Name of the final file we want to load "filename": "data/musique_ans_v1.0_dev.jsonl", - # A Google Drive URL (or share link) to the ZIP containing this file "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing", - # The name of the ZIP archive we expect after downloading "zip_filename": "musique_v1.0.zip", } + def _get_golden_context(self, item: dict[str, Any]) -> str: + """Extracts golden context from question decomposition and supporting paragraphs.""" + golden_context = [] + paragraphs = item.get("paragraphs", []) + + # Process each decomposition step + for step in item.get("question_decomposition", []): + # Add the supporting paragraph if available + support_idx = step.get("paragraph_support_idx") + if isinstance(support_idx, int) and 0 <= support_idx < len(paragraphs): + para = paragraphs[support_idx] + golden_context.append(f"{para['title']}: {para['paragraph_text']}") + + # Add the step's question and answer + golden_context.append(f"Q: {step['question']}") + golden_context.append(f"A: {step['answer']}") + golden_context.append("") # Empty line between steps + + return "\n".join(golden_context) + + def _process_item( + self, + item: dict[str, Any], + corpus_list: List[str], + question_answer_pairs: List[dict[str, Any]], + load_golden_context: bool = False, + ) -> None: + """Processes a single item and adds it to the corpus and QA pairs.""" + # Add paragraphs to corpus + paragraphs = item.get("paragraphs", []) + for paragraph in paragraphs: + corpus_list.append(paragraph["paragraph_text"]) + + # Create QA pair + qa_pair = { + "id": item.get("id", ""), + "question": item.get("question", ""), + "answer": item.get("answer", "").lower() + if isinstance(item.get("answer"), str) + else item.get("answer"), + } + + if load_golden_context: + qa_pair["golden_context"] = self._get_golden_context(item) + + question_answer_pairs.append(qa_pair) + def load_corpus( self, limit: Optional[int] = None, seed: int = 42, + load_golden_context: bool = False, auto_download: bool = True, ) -> tuple[list[str], list[dict[str, Any]]]: - """ - Loads the Musique QA dataset. - - :param limit: If set, randomly sample 'limit' items. - :param seed: Random seed for sampling. - :param auto_download: If True, attempt to download + unzip the dataset - from Google Drive if the .jsonl file is not present locally. - :return: (corpus_list, question_answer_pairs) - """ + """Loads and processes the Musique QA dataset.""" target_filename = self.dataset_info["filename"] - # 1. Ensure the file is locally available; optionally download if missing if not os.path.exists(target_filename): if auto_download: self._musique_download_file() @@ -62,29 +95,12 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter): question_answer_pairs = [] for item in data: - # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text' - paragraphs = item.get("paragraphs", []) - for paragraph in paragraphs: - corpus_list.append(paragraph["paragraph_text"]) - - question = item.get("question", "") - answer = item.get("answer", "") - - question_answer_pairs.append( - { - "id": item.get("id", ""), - "question": question, - "answer": answer.lower() if isinstance(answer, str) else answer, - } - ) + self._process_item(item, corpus_list, question_answer_pairs, load_golden_context) return corpus_list, question_answer_pairs def _musique_download_file(self) -> None: - """ - Download and unzip the Musique dataset if not already present locally. - Uses gdown for Google Drive links. - """ + """Downloads and unzips the Musique dataset if not present locally.""" url = self.dataset_info["download_url"] zip_filename = self.dataset_info["zip_filename"] target_filename = self.dataset_info["filename"] diff --git a/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py b/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py index b65a64364..a6bb017fb 100644 --- a/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py +++ b/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py @@ -2,48 +2,27 @@ import requests import os import json import random -from typing import Optional, Any -from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter +from typing import Optional, Any, List, Tuple +from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter -class TwoWikiMultihopAdapter(BaseBenchmarkAdapter): +class TwoWikiMultihopAdapter(HotpotQAAdapter): dataset_info = { "filename": "2wikimultihop_dev.json", - "URL": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json", + "url": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json", } - def load_corpus( - self, limit: Optional[int] = None, seed: int = 42 - ) -> tuple[list[str], list[dict[str, Any]]]: - filename = self.dataset_info["filename"] + def __init__(self): + super().__init__() + self.metadata_field_name = "type" - if os.path.exists(filename): - with open(filename, "r", encoding="utf-8") as f: - corpus_json = json.load(f) - else: - response = requests.get(self.dataset_info["URL"]) - response.raise_for_status() - corpus_json = response.json() + def _get_golden_context(self, item: dict[str, Any]) -> str: + """Extracts and formats the golden context from supporting facts and adds evidence if available.""" + golden_context = super()._get_golden_context(item) - with open(filename, "w", encoding="utf-8") as f: - json.dump(corpus_json, f, ensure_ascii=False, indent=4) + if "evidences" in item: + golden_context += "\nEvidence fact triplets:" + for subject, relation, obj in item["evidences"]: + golden_context += f"\n • {subject} - {relation} - {obj}" - if limit is not None and 0 < limit < len(corpus_json): - random.seed(seed) - corpus_json = random.sample(corpus_json, limit) - - corpus_list = [] - question_answer_pairs = [] - for dict in corpus_json: - for title, sentences in dict["context"]: - corpus_list.append(" ".join(sentences)) - - question_answer_pairs.append( - { - "question": dict["question"], - "answer": dict["answer"].lower(), - "type": dict["type"], - } - ) - - return corpus_list, question_answer_pairs + return golden_context