feat: retrieve golden contexts [COG-1364] (#579)
## Description

- Added a `load_golden_context` parameter to `BaseBenchmarkAdapter`'s abstract `load_corpus` method, establishing a common interface for retrieving supporting evidence.
- Refactored `HotpotQAAdapter` into a modular design: introduced a `metadata_field_name` attribute for dataset-specific metadata fields (making the adapter extensible for child classes) and implemented golden context extraction.
- Refactored `TwoWikiMultihopAdapter` to inherit from `HotpotQAAdapter`, overriding only the necessary methods while reusing the parent's functionality.
- Added golden context support to `MusiqueQAAdapter` using its decomposition-based format.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced an option to include additional context during corpus loading, enhancing the quality and flexibility of generated QA pairs.
- **Refactor**
  - Streamlined and modularized the processing workflow across different adapters for improved consistency and maintainability.
  - Updated metadata extraction to refine the display of contextual information.
  - Shifted focus in the `TwoWikiMultihopAdapter` from corpus loading to context extraction.
parent 4c3c811c1e
commit 4b7c21d7d8
4 changed files with 122 additions and 87 deletions
@@ -4,5 +4,7 @@ from typing import List, Optional
 
 class BaseBenchmarkAdapter(ABC):
     @abstractmethod
-    def load_corpus(self, limit: Optional[int] = None, seed: int = 42) -> List[str]:
+    def load_corpus(
+        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
+    ) -> List[str]:
         pass
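As a minimal usage sketch (not part of this diff), a caller could exercise the new flag as shown below; the class, method, and parameter names come from the hunks in this commit, while the `limit` value and variable names are illustrative.

```python
# Minimal usage sketch, not part of this diff. load_corpus will download the
# dataset file on first use if it is not cached locally.
from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter

adapter = HotpotQAAdapter()
corpus, qa_pairs = adapter.load_corpus(limit=5, seed=42, load_golden_context=True)

# When load_golden_context=True, each QA pair also carries a "golden_context"
# string assembled from the item's supporting facts.
print(qa_pairs[0].get("golden_context", ""))
```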
@@ -2,7 +2,7 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Any
+from typing import Optional, Any, List, Tuple
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
@@ -14,9 +14,55 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
         # distractor test: "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json" delete file after changing the url
     }
 
+    def __init__(self):
+        super().__init__()
+        self.metadata_field_name = "level"
+
+    def _is_valid_supporting_fact(self, sentences: List[str], sentence_idx: Any) -> bool:
+        """Validates if a supporting fact index is valid for the given sentences."""
+        return sentences and isinstance(sentence_idx, int) and 0 <= sentence_idx < len(sentences)
+
+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts and formats the golden context from supporting facts."""
+        # Create a mapping of title to sentences for easy lookup
+        context_dict = {title: sentences for (title, sentences) in item["context"]}
+
+        # Get all supporting facts in order
+        golden_contexts = []
+        for title, sentence_idx in item["supporting_facts"]:
+            sentences = context_dict.get(title, [])
+            if not self._is_valid_supporting_fact(sentences, sentence_idx):
+                continue
+            golden_contexts.append(f"{title}: {sentences[sentence_idx]}")
+
+        return "\n".join(golden_contexts)
+
+    def _process_item(
+        self,
+        item: dict[str, Any],
+        corpus_list: List[str],
+        question_answer_pairs: List[dict[str, Any]],
+        load_golden_context: bool = False,
+    ) -> None:
+        """Processes a single item and adds it to the corpus and QA pairs."""
+        for title, sentences in item["context"]:
+            corpus_list.append(" ".join(sentences))
+
+        qa_pair = {
+            "question": item["question"],
+            "answer": item["answer"].lower(),
+            self.metadata_field_name: item[self.metadata_field_name],
+        }
+
+        if load_golden_context:
+            qa_pair["golden_context"] = self._get_golden_context(item)
+
+        question_answer_pairs.append(qa_pair)
+
     def load_corpus(
-        self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[str], list[dict[str, Any]]]:
+        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
+    ) -> Tuple[List[str], List[dict[str, Any]]]:
+        """Loads and processes the HotpotQA corpus, optionally with golden context."""
         filename = self.dataset_info["filename"]
 
         if os.path.exists(filename):
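To make the HotpotQA format concrete, here is a hedged sketch of what `_get_golden_context` returns for an invented HotpotQA-style item; only the field names follow the diff above.

```python
# Hedged sketch: the item below is invented. HotpotQA items carry "context" as
# (title, sentences) pairs and "supporting_facts" as (title, sentence_idx)
# pairs, which is all _get_golden_context relies on.
from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter

item = {
    "context": [
        ("Scott Derrickson", ["Scott Derrickson is an American director.", "He was born in 1966."]),
        ("Ed Wood", ["Ed Wood was an American filmmaker."]),
    ],
    "supporting_facts": [["Scott Derrickson", 0], ["Ed Wood", 0]],
}

print(HotpotQAAdapter()._get_golden_context(item))
# Scott Derrickson: Scott Derrickson is an American director.
# Ed Wood: Ed Wood was an American filmmaker.
```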
@@ -36,16 +82,8 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
 
         corpus_list = []
         question_answer_pairs = []
-        for item in corpus_json:
-            for title, sentences in item["context"]:
-                corpus_list.append(" ".join(sentences))
-
-            question_answer_pairs.append(
-                {
-                    "question": item["question"],
-                    "answer": item["answer"].lower(),
-                    "level": item["level"],
-                }
-            )
+        for item in corpus_json:
+            self._process_item(item, corpus_list, question_answer_pairs, load_golden_context)
 
         return corpus_list, question_answer_pairs
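With `load_golden_context=True`, each entry appended by `_process_item` takes roughly the following shape; this is a sketch with invented values, only the keys follow the diff above.

```python
# Sketch of one qa_pair produced by HotpotQAAdapter._process_item; values invented.
qa_pair = {
    "question": "Were Scott Derrickson and Ed Wood of the same nationality?",
    "answer": "yes",
    "level": "hard",  # self.metadata_field_name is "level" for HotpotQA
    "golden_context": "Scott Derrickson: ...\nEd Wood: ...",  # only when requested
}
```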
@@ -1,7 +1,7 @@
 import os
 import json
 import random
-from typing import Optional, Any
+from typing import Optional, Any, List
 import zipfile
 
 import gdown
@@ -10,38 +10,71 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
 class MusiqueQAAdapter(BaseBenchmarkAdapter):
-    """
-    Adapter to load and process the Musique QA dataset from a local .jsonl file.
-    Optionally downloads and unzips the dataset if it does not exist locally.
-    """
+    """Adapter for the Musique QA dataset with local file loading and optional download."""
 
     dataset_info = {
         # Name of the final file we want to load
         "filename": "data/musique_ans_v1.0_dev.jsonl",
         # A Google Drive URL (or share link) to the ZIP containing this file
         "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
         # The name of the ZIP archive we expect after downloading
         "zip_filename": "musique_v1.0.zip",
     }
 
+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts golden context from question decomposition and supporting paragraphs."""
+        golden_context = []
+        paragraphs = item.get("paragraphs", [])
+
+        # Process each decomposition step
+        for step in item.get("question_decomposition", []):
+            # Add the supporting paragraph if available
+            support_idx = step.get("paragraph_support_idx")
+            if isinstance(support_idx, int) and 0 <= support_idx < len(paragraphs):
+                para = paragraphs[support_idx]
+                golden_context.append(f"{para['title']}: {para['paragraph_text']}")
+
+            # Add the step's question and answer
+            golden_context.append(f"Q: {step['question']}")
+            golden_context.append(f"A: {step['answer']}")
+            golden_context.append("")  # Empty line between steps
+
+        return "\n".join(golden_context)
+
+    def _process_item(
+        self,
+        item: dict[str, Any],
+        corpus_list: List[str],
+        question_answer_pairs: List[dict[str, Any]],
+        load_golden_context: bool = False,
+    ) -> None:
+        """Processes a single item and adds it to the corpus and QA pairs."""
+        # Add paragraphs to corpus
+        paragraphs = item.get("paragraphs", [])
+        for paragraph in paragraphs:
+            corpus_list.append(paragraph["paragraph_text"])
+
+        # Create QA pair
+        qa_pair = {
+            "id": item.get("id", ""),
+            "question": item.get("question", ""),
+            "answer": item.get("answer", "").lower()
+            if isinstance(item.get("answer"), str)
+            else item.get("answer"),
+        }
+
+        if load_golden_context:
+            qa_pair["golden_context"] = self._get_golden_context(item)
+
+        question_answer_pairs.append(qa_pair)
+
     def load_corpus(
         self,
         limit: Optional[int] = None,
         seed: int = 42,
+        load_golden_context: bool = False,
         auto_download: bool = True,
     ) -> tuple[list[str], list[dict[str, Any]]]:
-        """
-        Loads the Musique QA dataset.
-
-        :param limit: If set, randomly sample 'limit' items.
-        :param seed: Random seed for sampling.
-        :param auto_download: If True, attempt to download + unzip the dataset
-                              from Google Drive if the .jsonl file is not present locally.
-        :return: (corpus_list, question_answer_pairs)
-        """
+        """Loads and processes the Musique QA dataset."""
         target_filename = self.dataset_info["filename"]
 
         # 1. Ensure the file is locally available; optionally download if missing
         if not os.path.exists(target_filename):
             if auto_download:
                 self._musique_download_file()
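For illustration, here is a hedged sketch of the Musique golden-context format produced by `_get_golden_context` for an invented decomposition item; the field names come from the diff above, the contents are made up.

```python
# Hedged sketch: an invented Musique-style item. Only the field names
# ("paragraphs", "question_decomposition", "paragraph_support_idx", ...)
# follow the diff above.
item = {
    "paragraphs": [
        {"title": "Author A", "paragraph_text": "Author A wrote Book B."},
        {"title": "Author A (early life)", "paragraph_text": "Author A was born in City C."},
    ],
    "question_decomposition": [
        {"question": "Who wrote Book B?", "answer": "Author A", "paragraph_support_idx": 0},
        {"question": "Where was Author A born?", "answer": "City C", "paragraph_support_idx": 1},
    ],
}

# _get_golden_context(item) interleaves each supporting paragraph with its
# decomposition step, separated by blank lines:
# Author A: Author A wrote Book B.
# Q: Who wrote Book B?
# A: Author A
#
# Author A (early life): Author A was born in City C.
# Q: Where was Author A born?
# A: City C
```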
@@ -62,29 +95,12 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):
         question_answer_pairs = []
 
         for item in data:
-            # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text'
-            paragraphs = item.get("paragraphs", [])
-            for paragraph in paragraphs:
-                corpus_list.append(paragraph["paragraph_text"])
-
-            question = item.get("question", "")
-            answer = item.get("answer", "")
-
-            question_answer_pairs.append(
-                {
-                    "id": item.get("id", ""),
-                    "question": question,
-                    "answer": answer.lower() if isinstance(answer, str) else answer,
-                }
-            )
+            self._process_item(item, corpus_list, question_answer_pairs, load_golden_context)
 
         return corpus_list, question_answer_pairs
 
     def _musique_download_file(self) -> None:
-        """
-        Download and unzip the Musique dataset if not already present locally.
-        Uses gdown for Google Drive links.
-        """
+        """Downloads and unzips the Musique dataset if not present locally."""
         url = self.dataset_info["download_url"]
         zip_filename = self.dataset_info["zip_filename"]
         target_filename = self.dataset_info["filename"]
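The body of `_musique_download_file` is not shown in this hunk; the following is a purely hypothetical sketch of what a gdown-based download-and-unzip helper, in the spirit of its docstring, might look like. The function name, arguments, and extraction target are assumptions, not the PR's code.

```python
# Hypothetical sketch only; the actual _musique_download_file body is not shown here.
import os
import zipfile

import gdown


def download_and_unzip(url: str, zip_filename: str, target_filename: str) -> None:
    """Fetch the ZIP from a Google Drive share link and extract it locally."""
    if os.path.exists(target_filename):
        return
    # fuzzy=True lets gdown resolve a Drive share link like the one in dataset_info
    gdown.download(url, output=zip_filename, quiet=False, fuzzy=True)
    with zipfile.ZipFile(zip_filename, "r") as zf:
        zf.extractall(".")  # the dev .jsonl is expected under data/ after extraction
```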
@@ -2,48 +2,27 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Any
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from typing import Optional, Any, List, Tuple
+from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
 
 
-class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):
+class TwoWikiMultihopAdapter(HotpotQAAdapter):
     dataset_info = {
         "filename": "2wikimultihop_dev.json",
-        "URL": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json",
+        "url": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json",
     }
 
-    def load_corpus(
-        self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[str], list[dict[str, Any]]]:
-        filename = self.dataset_info["filename"]
+    def __init__(self):
+        super().__init__()
+        self.metadata_field_name = "type"
 
-        if os.path.exists(filename):
-            with open(filename, "r", encoding="utf-8") as f:
-                corpus_json = json.load(f)
-        else:
-            response = requests.get(self.dataset_info["URL"])
-            response.raise_for_status()
-            corpus_json = response.json()
+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts and formats the golden context from supporting facts and adds evidence if available."""
+        golden_context = super()._get_golden_context(item)
 
-            with open(filename, "w", encoding="utf-8") as f:
-                json.dump(corpus_json, f, ensure_ascii=False, indent=4)
+        if "evidences" in item:
+            golden_context += "\nEvidence fact triplets:"
+            for subject, relation, obj in item["evidences"]:
+                golden_context += f"\n • {subject} - {relation} - {obj}"
 
-        if limit is not None and 0 < limit < len(corpus_json):
-            random.seed(seed)
-            corpus_json = random.sample(corpus_json, limit)
-
-        corpus_list = []
-        question_answer_pairs = []
-        for dict in corpus_json:
-            for title, sentences in dict["context"]:
-                corpus_list.append(" ".join(sentences))
-
-            question_answer_pairs.append(
-                {
-                    "question": dict["question"],
-                    "answer": dict["answer"].lower(),
-                    "type": dict["type"],
-                }
-            )
-
-        return corpus_list, question_answer_pairs
+        return golden_context
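Finally, a hedged sketch of the combined 2WikiMultihop golden context, showing how the override appends evidence triplets to the parent's supporting-fact lines; the item contents are invented, only the key names follow the diff above.

```python
# Hedged sketch: an invented 2WikiMultihopQA-style item. It reuses the HotpotQA
# fields ("context", "supporting_facts") and adds the dataset's "evidences" triplets.
item = {
    "context": [["Person P", ["Person P was born in Country X.", "Person P is a painter."]]],
    "supporting_facts": [["Person P", 0]],
    "evidences": [["Person P", "country of birth", "Country X"]],
}

# TwoWikiMultihopAdapter._get_golden_context(item) would first produce the
# parent's supporting-fact line, then append the evidence triplets:
# Person P: Person P was born in Country X.
# Evidence fact triplets:
#  • Person P - country of birth - Country X
```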