Add musique adapter base

2025-02-11 17:16:48 -05:00 · 2025-02-11 17:16:48 -05:00 · e6db870264
commit e6db870264
parent 9ba2e0d6c1
2 changed files with 127 additions and 0 deletions
--- a/evals/eval_framework/benchmark_adapters/benchmark_adapters.py
+++ b/evals/eval_framework/benchmark_adapters/benchmark_adapters.py
@ -2,6 +2,7 @@ from enum import Enum
 from typing import Type

 from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
 from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
 from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter

@ -9,6 +10,7 @@ from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoW
 class BenchmarkAdapter(Enum):
    DUMMY = ("Dummy", DummyAdapter)
    HOTPOTQA = ("HotPotQA", HotpotQAAdapter)
+    MUSIQUE = ('Musique', MusiqueQAAdapter)
    TWOWIKIMULTIHOP = ("TwoWikiMultiHop", TwoWikiMultihopAdapter)

    def __new__(cls, adapter_name: str, adapter_class: Type):
--- a/evals/eval_framework/benchmark_adapters/musique_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/musique_adapter.py
@ -0,0 +1,125 @@
+import os
+import json
+import random
+from typing import Optional, Union, Any, LiteralString
+import zipfile
+
+import gdown  # pip install gdown
+
+from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+
+
+class MusiqueQAAdapter(BaseBenchmarkAdapter):
+    """
+    Adapter to load and process the Musique QA dataset from a local .jsonl file.
+
+    If the file is missing, the adapter can download a ZIP archive from Google Drive
+    (via gdown) and extract it. All paths in ``dataset_info`` are resolved relative to
+    the current working directory.
+    """
+
+    dataset_info = {
+        # Name of the final file we want to load
+        "filename": "musique_ans_v1.0_dev.jsonl",
+
+        # A Google Drive URL (or share link) to the ZIP containing this file
+        "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
+
+        # The name of the ZIP archive we expect after downloading
+        "zip_filename": "musique_v1.0.zip",
+    }
+
+    def load_corpus(
+        self,
+        limit: Optional[int] = None,
+        seed: int = 42,
+        auto_download: bool = True,
+    ) -> tuple[list[str], list[dict[str, Any]]]:
+        """
+        Load the Musique QA dataset.
+
+        :param limit: If strictly between 0 and the dataset size, randomly sample
+            'limit' items. Any other value (including None) keeps every item.
+        :param seed: Random seed for sampling; the same seed yields the same sample.
+        :param auto_download: If True, attempt to download + unzip the dataset
+            from Google Drive if the .jsonl file is not present locally.
+        :return: (corpus_list, question_answer_pairs). ``corpus_list`` holds one string
+            per item (its paragraph texts joined by spaces); ``question_answer_pairs``
+            holds one dict per item with the keys "id", "question" and "answer".
+        :raises FileNotFoundError: If the dataset file is missing and either
+            auto_download is False or the download/extraction did not produce it.
+        """
+        target_filename = self.dataset_info["filename"]
+
+        # 1. Ensure the file is locally available; optionally download if missing
+        if not os.path.exists(target_filename):
+            if not auto_download:
+                raise FileNotFoundError(
+                    f"Expected dataset file not found: {target_filename}\n"
+                    "Set auto_download=True or manually place the file."
+                )
+            self._musique_download_file()
+
+        # 2. Read the JSONL file. Blank lines (e.g. a trailing newline at the end of
+        #    the file) are skipped because json.loads would raise on them.
+        with open(target_filename, "r", encoding="utf-8") as f:
+            data = [json.loads(line) for line in f if line.strip()]
+
+        # 3. (Optional) sample a subset of items. A private Random instance draws the
+        #    same sample as seeding the module-level generator would, but without
+        #    resetting the process-wide random state for every other caller.
+        if limit is not None and 0 < limit < len(data):
+            data = random.Random(seed).sample(data, limit)
+
+        # 4. Build up corpus_list and question_answer_pairs
+        corpus_list = []
+        question_answer_pairs = []
+
+        for item in data:
+            # 'paragraphs' is a list of dicts; concatenate their 'paragraph_text'
+            paragraphs = item.get("paragraphs", [])
+            corpus_list.append(" ".join(paragraph["paragraph_text"] for paragraph in paragraphs))
+
+            answer = item.get("answer", "")
+            question_answer_pairs.append(
+                {
+                    "id": item.get("id", ""),
+                    "question": item.get("question", ""),
+                    # String answers are lower-cased; other types pass through unchanged
+                    "answer": answer.lower() if isinstance(answer, str) else answer,
+                }
+            )
+
+        return corpus_list, question_answer_pairs
+
+    def _musique_download_file(self) -> None:
+        """
+        Download and unzip the Musique dataset if not already present locally.
+        Uses gdown for Google Drive links.
+
+        The archive is extracted into the current working directory and the ZIP is
+        removed afterwards, whether or not extraction succeeded.
+
+        :raises FileNotFoundError: If the ZIP is not on disk after the download, or
+            if the expected .jsonl file cannot be found after extraction.
+        """
+        url = self.dataset_info["download_url"]
+        zip_filename = self.dataset_info["zip_filename"]
+        target_filename = self.dataset_info["filename"]
+
+        if os.path.exists(target_filename):
+            print(f"File '{target_filename}' is already present. Skipping download.")
+            return
+
+        print(f"Attempting to download from Google Drive: {url}")
+        # Using gdown to download the ZIP from a Google Drive link
+        gdown.download(url=url, output=zip_filename, quiet=False, fuzzy=True)
+
+        # The presence of the file on disk is what tells us the download worked
+        if not os.path.exists(zip_filename):
+            raise FileNotFoundError(f"Failed to download the zip file: {zip_filename}")
+
+        print(f"Unzipping {zip_filename} ...")
+        try:
+            with zipfile.ZipFile(zip_filename, "r") as zip_ref:
+                zip_ref.extractall()  # Extract to current directory
+                member_names = zip_ref.namelist()
+        finally:
+            # Remove the ZIP even when extraction fails, so a corrupt archive
+            # is not left behind on disk.
+            os.remove(zip_filename)
+
+        # The archive may keep the file inside a subdirectory instead of at its root.
+        # In that case move the extracted copy to the location load_corpus reads from.
+        if not os.path.exists(target_filename):
+            wanted_name = os.path.basename(target_filename)
+            for member_name in member_names:
+                extracted_path = os.path.normpath(member_name)
+                if os.path.basename(extracted_path) == wanted_name and os.path.isfile(
+                    extracted_path
+                ):
+                    os.replace(extracted_path, target_filename)
+                    break
+
+        # Final check: ensure the .jsonl is where we expect it
+        if not os.path.exists(target_filename):
+            raise FileNotFoundError(
+                f"After unzipping, '{target_filename}' not found. "
+                "Check the contents of the extracted files."
+            )