cognee/evals/eval_framework/benchmark_adapters/musique_adapter.py
lxobr bb8cb692e0
Cog 1293 corpus builder custom cognify tasks (#527)

## Description
- Enable custom tasks in corpus building
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


## Summary by CodeRabbit

- **New Features**
  - Introduced a configurable option to specify the task retrieval strategy during corpus building.
  - Enhanced the workflow with integrated task fetching, featuring a default retrieval mechanism.
  - Updated evaluation configuration to support customizable task selection for more flexible operations.
  - Added a new abstract base class for defining various task retrieval strategies.
  - Introduced a new enumeration to map task getter types to their corresponding classes (see the sketch after this list).
  
- **Dependencies**
  - Added a new dependency for downloading files from Google Drive.
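
A minimal sketch of what the task getter abstraction described above might look like. All names here (`BaseTaskGetter`, `DefaultTaskGetter`, `TaskGetters`) are illustrative guesses at the shape of the change, not the PR's actual code:

```python
from abc import ABC, abstractmethod
from enum import Enum
from typing import Callable, List


class BaseTaskGetter(ABC):
    """Strategy interface: decides which cognify tasks run during corpus building."""

    @abstractmethod
    async def get_tasks(self) -> List[Callable]:
        """Return the list of tasks to execute."""
        raise NotImplementedError


class DefaultTaskGetter(BaseTaskGetter):
    """Default strategy: fall back to the standard cognify task list."""

    async def get_tasks(self) -> List[Callable]:
        return []  # placeholder; the real default would assemble cognify's tasks


class TaskGetters(Enum):
    """Maps a task getter type (e.g. "Default") to its implementing class."""

    DEFAULT = ("Default", DefaultTaskGetter)

    def __new__(cls, getter_name: str, getter_class: type):
        obj = object.__new__(cls)
        obj._value_ = getter_name
        obj.getter_class = getter_class
        return obj
```

An eval config could then resolve something like `TaskGetters("Default").getter_class` to instantiate the chosen strategy (the config key is hypothetical).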
2025-02-12 16:44:08 +01:00

121 lines
4.7 KiB
Python

import os
import json
import random
import zipfile
from typing import Optional, Any

import gdown  # pip install gdown

from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
class MusiqueQAAdapter(BaseBenchmarkAdapter):
    """
    Adapter to load and process the Musique QA dataset from a local .jsonl file.
    Optionally downloads and unzips the dataset if it does not exist locally.
    """

    dataset_info = {
        # Name of the final file we want to load
        "filename": "musique_ans_v1.0_dev.jsonl",
        # A Google Drive URL (or share link) to the ZIP containing this file
        "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
        # The name of the ZIP archive we expect after downloading
        "zip_filename": "musique_v1.0.zip",
    }

    def load_corpus(
        self,
        limit: Optional[int] = None,
        seed: int = 42,
        auto_download: bool = True,
    ) -> tuple[list[str], list[dict[str, Any]]]:
        """
        Loads the Musique QA dataset.

        :param limit: If set, randomly sample 'limit' items.
        :param seed: Random seed for sampling.
        :param auto_download: If True, attempt to download + unzip the dataset
            from Google Drive if the .jsonl file is not present locally.
        :return: (corpus_list, question_answer_pairs)
        """
        target_filename = self.dataset_info["filename"]

        # 1. Ensure the file is locally available; optionally download if missing
        if not os.path.exists(target_filename):
            if auto_download:
                self._musique_download_file()
            else:
                raise FileNotFoundError(
                    f"Expected dataset file not found: {target_filename}\n"
                    "Set auto_download=True or manually place the file."
                )

        # 2. Read the JSONL file
        with open(target_filename, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]

        # 3. (Optional) sample a subset of items
        if limit is not None and 0 < limit < len(data):
            random.seed(seed)
            data = random.sample(data, limit)

        # 4. Build up corpus_list and question_answer_pairs
        corpus_list = []
        question_answer_pairs = []
        for item in data:
            # Each 'paragraphs' entry is a list of dicts; concatenate their 'paragraph_text'
            paragraphs = item.get("paragraphs", [])
            combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
            corpus_list.append(combined_paragraphs)

            # Adjust keys to match the actual JSON structure if needed;
            # some variants expose "answer_aliases" instead of "answer".
            question = item.get("question", "")
            answer = item.get("answer", "")
            question_answer_pairs.append(
                {
                    "id": item.get("id", ""),
                    "question": question,
                    "answer": answer.lower() if isinstance(answer, str) else answer,
                }
            )

        return corpus_list, question_answer_pairs

    def _musique_download_file(self) -> None:
        """
        Download and unzip the Musique dataset if not already present locally.
        Uses gdown for Google Drive links.
        """
        url = self.dataset_info["download_url"]
        zip_filename = self.dataset_info["zip_filename"]
        target_filename = self.dataset_info["filename"]

        if os.path.exists(target_filename):
            print(f"File '{target_filename}' is already present. Skipping download.")
            return

        print(f"Attempting to download from Google Drive: {url}")
        # Using gdown to download the ZIP from a Google Drive link
        gdown.download(url=url, output=zip_filename, quiet=False, fuzzy=True)

        # Unzip the downloaded file
        if os.path.exists(zip_filename):
            print(f"Unzipping {zip_filename} ...")
            with zipfile.ZipFile(zip_filename, "r") as zip_ref:
                zip_ref.extractall()  # Extract to current directory
            # Optionally remove the ZIP after extraction
            os.remove(zip_filename)
        else:
            raise FileNotFoundError(f"Failed to download the zip file: {zip_filename}")

        # Optional check: ensure the final .jsonl appeared
        if not os.path.exists(target_filename):
            raise FileNotFoundError(
                f"After unzipping, '{target_filename}' not found. "
                "Check the contents of the extracted files."
            )
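

# Minimal usage sketch: load a small sample and print a question.
# auto_download=True assumes network access to Google Drive.
if __name__ == "__main__":
    adapter = MusiqueQAAdapter()
    corpus, qa_pairs = adapter.load_corpus(limit=5, seed=42, auto_download=True)
    print(f"Loaded {len(corpus)} contexts; first question: {qa_pairs[0]['question']}")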