feat: retrieve golden contexts [COG-1364] (#579)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->
• Added load_golden_context parameter to BaseBenchmarkAdapter's abstract
load_corpus method, establishing a common interface for retrieving
supporting evidence
• Refactored HotpotQAAdapter with a modular design: introduced
_get_metadata_field_name method to handle dataset-specific fields
(making it extensible for child classes), and implemented golden context
retrieval functionality.
• Refactored TwoWikiMultihopAdapter to inherit from HotpotQAAdapter,
overriding only the necessary methods while reusing parent's
functionality
• Added golden context support to MusiqueQAAdapter with its
decomposition-based format
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Introduced an option to include additional context during corpus
loading, enhancing the quality and flexibility of generated QA pairs.
- **Refactor**
- Streamlined and modularized the processing workflow across different
adapters for improved consistency and maintainability.
- Updated metadata extraction to refine the display of contextual
information.
- Shifted focus in the `TwoWikiMultihopAdapter` from corpus loading to
context extraction.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
lxobr 2025-02-27 13:25:47 +01:00 committed by GitHub
parent 4c3c811c1e
commit 4b7c21d7d8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 122 additions and 87 deletions

View file

@ -4,5 +4,7 @@ from typing import List, Optional
class BaseBenchmarkAdapter(ABC): class BaseBenchmarkAdapter(ABC):
@abstractmethod @abstractmethod
def load_corpus(self, limit: Optional[int] = None, seed: int = 42) -> List[str]: def load_corpus(
self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
) -> List[str]:
pass pass

View file

@ -2,7 +2,7 @@ import requests
import os import os
import json import json
import random import random
from typing import Optional, Any from typing import Optional, Any, List, Tuple
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
@ -14,9 +14,55 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
# distractor test: "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json" delete file after changing the url # distractor test: "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json" delete file after changing the url
} }
def __init__(self):
super().__init__()
self.metadata_field_name = "level"
def _is_valid_supporting_fact(self, sentences: List[str], sentence_idx: Any) -> bool:
"""Validates if a supporting fact index is valid for the given sentences."""
return sentences and isinstance(sentence_idx, int) and 0 <= sentence_idx < len(sentences)
def _get_golden_context(self, item: dict[str, Any]) -> str:
"""Extracts and formats the golden context from supporting facts."""
# Create a mapping of title to sentences for easy lookup
context_dict = {title: sentences for (title, sentences) in item["context"]}
# Get all supporting facts in order
golden_contexts = []
for title, sentence_idx in item["supporting_facts"]:
sentences = context_dict.get(title, [])
if not self._is_valid_supporting_fact(sentences, sentence_idx):
continue
golden_contexts.append(f"{title}: {sentences[sentence_idx]}")
return "\n".join(golden_contexts)
def _process_item(
self,
item: dict[str, Any],
corpus_list: List[str],
question_answer_pairs: List[dict[str, Any]],
load_golden_context: bool = False,
) -> None:
"""Processes a single item and adds it to the corpus and QA pairs."""
for title, sentences in item["context"]:
corpus_list.append(" ".join(sentences))
qa_pair = {
"question": item["question"],
"answer": item["answer"].lower(),
self.metadata_field_name: item[self.metadata_field_name],
}
if load_golden_context:
qa_pair["golden_context"] = self._get_golden_context(item)
question_answer_pairs.append(qa_pair)
def load_corpus( def load_corpus(
self, limit: Optional[int] = None, seed: int = 42 self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
) -> tuple[list[str], list[dict[str, Any]]]: ) -> Tuple[List[str], List[dict[str, Any]]]:
"""Loads and processes the HotpotQA corpus, optionally with golden context."""
filename = self.dataset_info["filename"] filename = self.dataset_info["filename"]
if os.path.exists(filename): if os.path.exists(filename):
@ -36,16 +82,8 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
corpus_list = [] corpus_list = []
question_answer_pairs = [] question_answer_pairs = []
for item in corpus_json:
for title, sentences in item["context"]:
corpus_list.append(" ".join(sentences))
question_answer_pairs.append( for item in corpus_json:
{ self._process_item(item, corpus_list, question_answer_pairs, load_golden_context)
"question": item["question"],
"answer": item["answer"].lower(),
"level": item["level"],
}
)
return corpus_list, question_answer_pairs return corpus_list, question_answer_pairs

View file

@ -1,7 +1,7 @@
import os import os
import json import json
import random import random
from typing import Optional, Any from typing import Optional, Any, List
import zipfile import zipfile
import gdown import gdown
@ -10,38 +10,71 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseB
class MusiqueQAAdapter(BaseBenchmarkAdapter): class MusiqueQAAdapter(BaseBenchmarkAdapter):
""" """Adapter for the Musique QA dataset with local file loading and optional download."""
Adapter to load and process the Musique QA dataset from a local .jsonl file.
Optionally downloads and unzips the dataset if it does not exist locally.
"""
dataset_info = { dataset_info = {
# Name of the final file we want to load
"filename": "data/musique_ans_v1.0_dev.jsonl", "filename": "data/musique_ans_v1.0_dev.jsonl",
# A Google Drive URL (or share link) to the ZIP containing this file
"download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing", "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
# The name of the ZIP archive we expect after downloading
"zip_filename": "musique_v1.0.zip", "zip_filename": "musique_v1.0.zip",
} }
def _get_golden_context(self, item: dict[str, Any]) -> str:
"""Extracts golden context from question decomposition and supporting paragraphs."""
golden_context = []
paragraphs = item.get("paragraphs", [])
# Process each decomposition step
for step in item.get("question_decomposition", []):
# Add the supporting paragraph if available
support_idx = step.get("paragraph_support_idx")
if isinstance(support_idx, int) and 0 <= support_idx < len(paragraphs):
para = paragraphs[support_idx]
golden_context.append(f"{para['title']}: {para['paragraph_text']}")
# Add the step's question and answer
golden_context.append(f"Q: {step['question']}")
golden_context.append(f"A: {step['answer']}")
golden_context.append("") # Empty line between steps
return "\n".join(golden_context)
def _process_item(
self,
item: dict[str, Any],
corpus_list: List[str],
question_answer_pairs: List[dict[str, Any]],
load_golden_context: bool = False,
) -> None:
"""Processes a single item and adds it to the corpus and QA pairs."""
# Add paragraphs to corpus
paragraphs = item.get("paragraphs", [])
for paragraph in paragraphs:
corpus_list.append(paragraph["paragraph_text"])
# Create QA pair
qa_pair = {
"id": item.get("id", ""),
"question": item.get("question", ""),
"answer": item.get("answer", "").lower()
if isinstance(item.get("answer"), str)
else item.get("answer"),
}
if load_golden_context:
qa_pair["golden_context"] = self._get_golden_context(item)
question_answer_pairs.append(qa_pair)
def load_corpus( def load_corpus(
self, self,
limit: Optional[int] = None, limit: Optional[int] = None,
seed: int = 42, seed: int = 42,
load_golden_context: bool = False,
auto_download: bool = True, auto_download: bool = True,
) -> tuple[list[str], list[dict[str, Any]]]: ) -> tuple[list[str], list[dict[str, Any]]]:
""" """Loads and processes the Musique QA dataset."""
Loads the Musique QA dataset.
:param limit: If set, randomly sample 'limit' items.
:param seed: Random seed for sampling.
:param auto_download: If True, attempt to download + unzip the dataset
from Google Drive if the .jsonl file is not present locally.
:return: (corpus_list, question_answer_pairs)
"""
target_filename = self.dataset_info["filename"] target_filename = self.dataset_info["filename"]
# 1. Ensure the file is locally available; optionally download if missing
if not os.path.exists(target_filename): if not os.path.exists(target_filename):
if auto_download: if auto_download:
self._musique_download_file() self._musique_download_file()
@ -62,29 +95,12 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):
question_answer_pairs = [] question_answer_pairs = []
for item in data: for item in data:
# Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text' self._process_item(item, corpus_list, question_answer_pairs, load_golden_context)
paragraphs = item.get("paragraphs", [])
for paragraph in paragraphs:
corpus_list.append(paragraph["paragraph_text"])
question = item.get("question", "")
answer = item.get("answer", "")
question_answer_pairs.append(
{
"id": item.get("id", ""),
"question": question,
"answer": answer.lower() if isinstance(answer, str) else answer,
}
)
return corpus_list, question_answer_pairs return corpus_list, question_answer_pairs
def _musique_download_file(self) -> None: def _musique_download_file(self) -> None:
""" """Downloads and unzips the Musique dataset if not present locally."""
Download and unzip the Musique dataset if not already present locally.
Uses gdown for Google Drive links.
"""
url = self.dataset_info["download_url"] url = self.dataset_info["download_url"]
zip_filename = self.dataset_info["zip_filename"] zip_filename = self.dataset_info["zip_filename"]
target_filename = self.dataset_info["filename"] target_filename = self.dataset_info["filename"]

View file

@ -2,48 +2,27 @@ import requests
import os import os
import json import json
import random import random
from typing import Optional, Any from typing import Optional, Any, List, Tuple
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
class TwoWikiMultihopAdapter(BaseBenchmarkAdapter): class TwoWikiMultihopAdapter(HotpotQAAdapter):
dataset_info = { dataset_info = {
"filename": "2wikimultihop_dev.json", "filename": "2wikimultihop_dev.json",
"URL": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json", "url": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json",
} }
def load_corpus( def __init__(self):
self, limit: Optional[int] = None, seed: int = 42 super().__init__()
) -> tuple[list[str], list[dict[str, Any]]]: self.metadata_field_name = "type"
filename = self.dataset_info["filename"]
if os.path.exists(filename): def _get_golden_context(self, item: dict[str, Any]) -> str:
with open(filename, "r", encoding="utf-8") as f: """Extracts and formats the golden context from supporting facts and adds evidence if available."""
corpus_json = json.load(f) golden_context = super()._get_golden_context(item)
else:
response = requests.get(self.dataset_info["URL"])
response.raise_for_status()
corpus_json = response.json()
with open(filename, "w", encoding="utf-8") as f: if "evidences" in item:
json.dump(corpus_json, f, ensure_ascii=False, indent=4) golden_context += "\nEvidence fact triplets:"
for subject, relation, obj in item["evidences"]:
golden_context += f"\n{subject} - {relation} - {obj}"
if limit is not None and 0 < limit < len(corpus_json): return golden_context
random.seed(seed)
corpus_json = random.sample(corpus_json, limit)
corpus_list = []
question_answer_pairs = []
for dict in corpus_json:
for title, sentences in dict["context"]:
corpus_list.append(" ".join(sentences))
question_answer_pairs.append(
{
"question": dict["question"],
"answer": dict["answer"].lower(),
"type": dict["type"],
}
)
return corpus_list, question_answer_pairs