feat: retrieve golden contexts [COG-1364] (#579)
## Description

- Added a `load_golden_context` parameter to `BaseBenchmarkAdapter`'s abstract `load_corpus` method, establishing a common interface for retrieving supporting evidence.
- Refactored `HotpotQAAdapter` into a modular design: introduced a `metadata_field_name` attribute for dataset-specific metadata fields (making the adapter extensible for child classes) and implemented golden context extraction.
- Refactored `TwoWikiMultihopAdapter` to inherit from `HotpotQAAdapter`, overriding only the necessary methods while reusing the parent's functionality.
- Added golden context support to `MusiqueQAAdapter` using its decomposition-based format.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced an option to include additional context during corpus loading, enhancing the quality and flexibility of generated QA pairs.
- **Refactor**
  - Streamlined and modularized the processing workflow across different adapters for improved consistency and maintainability.
  - Updated metadata extraction to refine the display of contextual information.
  - Shifted focus in the `TwoWikiMultihopAdapter` from corpus loading to context extraction.
parent 4c3c811c1e
commit 4b7c21d7d8
4 changed files with 122 additions and 87 deletions
@@ -4,5 +4,7 @@ from typing import List, Optional
 
 class BaseBenchmarkAdapter(ABC):
     @abstractmethod
-    def load_corpus(self, limit: Optional[int] = None, seed: int = 42) -> List[str]:
+    def load_corpus(
+        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
+    ) -> List[str]:
         pass
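As a minimal usage sketch (not part of this diff), a caller could exercise the new flag as shown below; the class, method, and parameter names come from the hunks in this commit, while the `limit` value and variable names are illustrative.

```python
# Minimal usage sketch, not part of this diff. load_corpus will download the
# dataset file on first use if it is not cached locally.
from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter

adapter = HotpotQAAdapter()
corpus, qa_pairs = adapter.load_corpus(limit=5, seed=42, load_golden_context=True)

# When load_golden_context=True, each QA pair also carries a "golden_context"
# string assembled from the item's supporting facts.
print(qa_pairs[0].get("golden_context", ""))
```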
@@ -2,7 +2,7 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Any
+from typing import Optional, Any, List, Tuple
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
@@ -14,9 +14,55 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
         # distractor test: "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json" delete file after changing the url
     }
 
+    def __init__(self):
+        super().__init__()
+        self.metadata_field_name = "level"
+
+    def _is_valid_supporting_fact(self, sentences: List[str], sentence_idx: Any) -> bool:
+        """Validates if a supporting fact index is valid for the given sentences."""
+        return sentences and isinstance(sentence_idx, int) and 0 <= sentence_idx < len(sentences)
+
+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts and formats the golden context from supporting facts."""
+        # Create a mapping of title to sentences for easy lookup
+        context_dict = {title: sentences for (title, sentences) in item["context"]}
+
+        # Get all supporting facts in order
+        golden_contexts = []
+        for title, sentence_idx in item["supporting_facts"]:
+            sentences = context_dict.get(title, [])
+            if not self._is_valid_supporting_fact(sentences, sentence_idx):
+                continue
+            golden_contexts.append(f"{title}: {sentences[sentence_idx]}")
+
+        return "\n".join(golden_contexts)
+
+    def _process_item(
+        self,
+        item: dict[str, Any],
+        corpus_list: List[str],
+        question_answer_pairs: List[dict[str, Any]],
+        load_golden_context: bool = False,
+    ) -> None:
+        """Processes a single item and adds it to the corpus and QA pairs."""
+        for title, sentences in item["context"]:
+            corpus_list.append(" ".join(sentences))
+
+        qa_pair = {
+            "question": item["question"],
+            "answer": item["answer"].lower(),
+            self.metadata_field_name: item[self.metadata_field_name],
+        }
+
+        if load_golden_context:
+            qa_pair["golden_context"] = self._get_golden_context(item)
+
+        question_answer_pairs.append(qa_pair)
+
     def load_corpus(
-        self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[str], list[dict[str, Any]]]:
+        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
+    ) -> Tuple[List[str], List[dict[str, Any]]]:
+        """Loads and processes the HotpotQA corpus, optionally with golden context."""
         filename = self.dataset_info["filename"]
 
         if os.path.exists(filename):
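To make the HotpotQA format concrete, here is a hedged sketch of what `_get_golden_context` returns for an invented HotpotQA-style item; only the field names follow the diff above.

```python
# Hedged sketch: the item below is invented. HotpotQA items carry "context" as
# (title, sentences) pairs and "supporting_facts" as (title, sentence_idx)
# pairs, which is all _get_golden_context relies on.
from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter

item = {
    "context": [
        ("Scott Derrickson", ["Scott Derrickson is an American director.", "He was born in 1966."]),
        ("Ed Wood", ["Ed Wood was an American filmmaker."]),
    ],
    "supporting_facts": [["Scott Derrickson", 0], ["Ed Wood", 0]],
}

print(HotpotQAAdapter()._get_golden_context(item))
# Scott Derrickson: Scott Derrickson is an American director.
# Ed Wood: Ed Wood was an American filmmaker.
```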
@@ -36,16 +82,8 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
 
         corpus_list = []
         question_answer_pairs = []
-        for item in corpus_json:
-            for title, sentences in item["context"]:
-                corpus_list.append(" ".join(sentences))
-
-            question_answer_pairs.append(
-                {
-                    "question": item["question"],
-                    "answer": item["answer"].lower(),
-                    "level": item["level"],
-                }
-            )
+        for item in corpus_json:
+            self._process_item(item, corpus_list, question_answer_pairs, load_golden_context)
 
         return corpus_list, question_answer_pairs
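With `load_golden_context=True`, each entry appended by `_process_item` takes roughly the following shape; this is a sketch with invented values, only the keys follow the diff above.

```python
# Sketch of one qa_pair produced by HotpotQAAdapter._process_item; values invented.
qa_pair = {
    "question": "Were Scott Derrickson and Ed Wood of the same nationality?",
    "answer": "yes",
    "level": "hard",  # self.metadata_field_name is "level" for HotpotQA
    "golden_context": "Scott Derrickson: ...\nEd Wood: ...",  # only when requested
}
```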
@@ -1,7 +1,7 @@
 import os
 import json
 import random
-from typing import Optional, Any
+from typing import Optional, Any, List
 import zipfile
 
 import gdown
@@ -10,38 +10,71 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
 class MusiqueQAAdapter(BaseBenchmarkAdapter):
-    """
-    Adapter to load and process the Musique QA dataset from a local .jsonl file.
-    Optionally downloads and unzips the dataset if it does not exist locally.
-    """
+    """Adapter for the Musique QA dataset with local file loading and optional download."""
 
     dataset_info = {
         # Name of the final file we want to load
         "filename": "data/musique_ans_v1.0_dev.jsonl",
         # A Google Drive URL (or share link) to the ZIP containing this file
         "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
         # The name of the ZIP archive we expect after downloading
         "zip_filename": "musique_v1.0.zip",
     }
 
+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts golden context from question decomposition and supporting paragraphs."""
+        golden_context = []
+        paragraphs = item.get("paragraphs", [])
+
+        # Process each decomposition step
+        for step in item.get("question_decomposition", []):
+            # Add the supporting paragraph if available
+            support_idx = step.get("paragraph_support_idx")
+            if isinstance(support_idx, int) and 0 <= support_idx < len(paragraphs):
+                para = paragraphs[support_idx]
+                golden_context.append(f"{para['title']}: {para['paragraph_text']}")
+
+            # Add the step's question and answer
+            golden_context.append(f"Q: {step['question']}")
+            golden_context.append(f"A: {step['answer']}")
+            golden_context.append("")  # Empty line between steps
+
+        return "\n".join(golden_context)
+
+    def _process_item(
+        self,
+        item: dict[str, Any],
+        corpus_list: List[str],
+        question_answer_pairs: List[dict[str, Any]],
+        load_golden_context: bool = False,
+    ) -> None:
+        """Processes a single item and adds it to the corpus and QA pairs."""
+        # Add paragraphs to corpus
+        paragraphs = item.get("paragraphs", [])
+        for paragraph in paragraphs:
+            corpus_list.append(paragraph["paragraph_text"])
+
+        # Create QA pair
+        qa_pair = {
+            "id": item.get("id", ""),
+            "question": item.get("question", ""),
+            "answer": item.get("answer", "").lower()
+            if isinstance(item.get("answer"), str)
+            else item.get("answer"),
+        }
+
+        if load_golden_context:
+            qa_pair["golden_context"] = self._get_golden_context(item)
+
+        question_answer_pairs.append(qa_pair)
+
     def load_corpus(
         self,
         limit: Optional[int] = None,
         seed: int = 42,
+        load_golden_context: bool = False,
         auto_download: bool = True,
     ) -> tuple[list[str], list[dict[str, Any]]]:
-        """
-        Loads the Musique QA dataset.
-
-        :param limit: If set, randomly sample 'limit' items.
-        :param seed: Random seed for sampling.
-        :param auto_download: If True, attempt to download + unzip the dataset
-                              from Google Drive if the .jsonl file is not present locally.
-        :return: (corpus_list, question_answer_pairs)
-        """
+        """Loads and processes the Musique QA dataset."""
         target_filename = self.dataset_info["filename"]
 
         # 1. Ensure the file is locally available; optionally download if missing
         if not os.path.exists(target_filename):
             if auto_download:
                 self._musique_download_file()
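For illustration, here is a hedged sketch of the Musique golden-context format produced by `_get_golden_context` for an invented decomposition item; the field names come from the diff above, the contents are made up.

```python
# Hedged sketch: an invented Musique-style item. Only the field names
# ("paragraphs", "question_decomposition", "paragraph_support_idx", ...)
# follow the diff above.
item = {
    "paragraphs": [
        {"title": "Author A", "paragraph_text": "Author A wrote Book B."},
        {"title": "Author A (early life)", "paragraph_text": "Author A was born in City C."},
    ],
    "question_decomposition": [
        {"question": "Who wrote Book B?", "answer": "Author A", "paragraph_support_idx": 0},
        {"question": "Where was Author A born?", "answer": "City C", "paragraph_support_idx": 1},
    ],
}

# _get_golden_context(item) interleaves each supporting paragraph with its
# decomposition step, separated by blank lines:
# Author A: Author A wrote Book B.
# Q: Who wrote Book B?
# A: Author A
#
# Author A (early life): Author A was born in City C.
# Q: Where was Author A born?
# A: City C
```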
@@ -62,29 +95,12 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):
         question_answer_pairs = []
 
         for item in data:
-            # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text'
-            paragraphs = item.get("paragraphs", [])
-            for paragraph in paragraphs:
-                corpus_list.append(paragraph["paragraph_text"])
-
-            question = item.get("question", "")
-            answer = item.get("answer", "")
-
-            question_answer_pairs.append(
-                {
-                    "id": item.get("id", ""),
-                    "question": question,
-                    "answer": answer.lower() if isinstance(answer, str) else answer,
-                }
-            )
+            self._process_item(item, corpus_list, question_answer_pairs, load_golden_context)
 
         return corpus_list, question_answer_pairs
 
     def _musique_download_file(self) -> None:
-        """
-        Download and unzip the Musique dataset if not already present locally.
-        Uses gdown for Google Drive links.
-        """
+        """Downloads and unzips the Musique dataset if not present locally."""
         url = self.dataset_info["download_url"]
         zip_filename = self.dataset_info["zip_filename"]
         target_filename = self.dataset_info["filename"]
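The body of `_musique_download_file` is not shown in this hunk; the following is a purely hypothetical sketch of what a gdown-based download-and-unzip helper, in the spirit of its docstring, might look like. The function name, arguments, and extraction target are assumptions, not the PR's code.

```python
# Hypothetical sketch only; the actual _musique_download_file body is not shown here.
import os
import zipfile

import gdown


def download_and_unzip(url: str, zip_filename: str, target_filename: str) -> None:
    """Fetch the ZIP from a Google Drive share link and extract it locally."""
    if os.path.exists(target_filename):
        return
    # fuzzy=True lets gdown resolve a Drive share link like the one in dataset_info
    gdown.download(url, output=zip_filename, quiet=False, fuzzy=True)
    with zipfile.ZipFile(zip_filename, "r") as zf:
        zf.extractall(".")  # the dev .jsonl is expected under data/ after extraction
```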
@@ -2,48 +2,27 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Any
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from typing import Optional, Any, List, Tuple
+from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
 
 
-class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):
+class TwoWikiMultihopAdapter(HotpotQAAdapter):
     dataset_info = {
         "filename": "2wikimultihop_dev.json",
-        "URL": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json",
+        "url": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json",
     }
 
-    def load_corpus(
-        self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[str], list[dict[str, Any]]]:
-        filename = self.dataset_info["filename"]
+    def __init__(self):
+        super().__init__()
+        self.metadata_field_name = "type"
 
-        if os.path.exists(filename):
-            with open(filename, "r", encoding="utf-8") as f:
-                corpus_json = json.load(f)
-        else:
-            response = requests.get(self.dataset_info["URL"])
-            response.raise_for_status()
-            corpus_json = response.json()
+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts and formats the golden context from supporting facts and adds evidence if available."""
+        golden_context = super()._get_golden_context(item)
 
-            with open(filename, "w", encoding="utf-8") as f:
-                json.dump(corpus_json, f, ensure_ascii=False, indent=4)
+        if "evidences" in item:
+            golden_context += "\nEvidence fact triplets:"
+            for subject, relation, obj in item["evidences"]:
+                golden_context += f"\n • {subject} - {relation} - {obj}"
 
-        if limit is not None and 0 < limit < len(corpus_json):
-            random.seed(seed)
-            corpus_json = random.sample(corpus_json, limit)
-
-        corpus_list = []
-        question_answer_pairs = []
-        for dict in corpus_json:
-            for title, sentences in dict["context"]:
-                corpus_list.append(" ".join(sentences))
-
-            question_answer_pairs.append(
-                {
-                    "question": dict["question"],
-                    "answer": dict["answer"].lower(),
-                    "type": dict["type"],
-                }
-            )
-
-        return corpus_list, question_answer_pairs
+        return golden_context
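Finally, a hedged sketch of the combined 2WikiMultihop golden context, showing how the override appends evidence triplets to the parent's supporting-fact lines; the item contents are invented, only the key names follow the diff above.

```python
# Hedged sketch: an invented 2WikiMultihopQA-style item. It reuses the HotpotQA
# fields ("context", "supporting_facts") and adds the dataset's "evidences" triplets.
item = {
    "context": [["Person P", ["Person P was born in Country X.", "Person P is a painter."]]],
    "supporting_facts": [["Person P", 0]],
    "evidences": [["Person P", "country of birth", "Country X"]],
}

# TwoWikiMultihopAdapter._get_golden_context(item) would first produce the
# parent's supporting-fact line, then append the evidence triplets:
# Person P: Person P was born in Country X.
# Evidence fact triplets:
#  • Person P - country of birth - Country X
```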