cognee/evals/eval_framework/benchmark_adapters/musique_adapter.py
lxobr bb8cb692e0
Cog 1293 corpus builder custom cognify tasks (#527)

## Description
- Enable custom tasks in corpus building
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


## Summary by CodeRabbit

- **New Features**
  - Introduced a configurable option to specify the task retrieval strategy during corpus building.
  - Enhanced the workflow with integrated task fetching, featuring a default retrieval mechanism.
  - Updated evaluation configuration to support customizable task selection for more flexible operations.
  - Added a new abstract base class for defining various task retrieval strategies.
  - Introduced a new enumeration to map task getter types to their corresponding classes (see the sketch after this list).
  
- **Dependencies**
  - Added a new dependency for downloading files from Google Drive.
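
A minimal sketch of what the task getter abstraction described above might look like. All names here (`BaseTaskGetter`, `DefaultTaskGetter`, `TaskGetters`) are illustrative guesses at the shape of the change, not the PR's actual code:

```python
from abc import ABC, abstractmethod
from enum import Enum
from typing import Callable, List


class BaseTaskGetter(ABC):
    """Strategy interface: decides which cognify tasks run during corpus building."""

    @abstractmethod
    async def get_tasks(self) -> List[Callable]:
        """Return the list of tasks to execute."""
        raise NotImplementedError


class DefaultTaskGetter(BaseTaskGetter):
    """Default strategy: fall back to the standard cognify task list."""

    async def get_tasks(self) -> List[Callable]:
        return []  # placeholder; the real default would assemble cognify's tasks


class TaskGetters(Enum):
    """Maps a task getter type (e.g. "Default") to its implementing class."""

    DEFAULT = ("Default", DefaultTaskGetter)

    def __new__(cls, getter_name: str, getter_class: type):
        obj = object.__new__(cls)
        obj._value_ = getter_name
        obj.getter_class = getter_class
        return obj
```

An eval config could then resolve something like `TaskGetters("Default").getter_class` to instantiate the chosen strategy (the config key is hypothetical).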
2025-02-12 16:44:08 +01:00

121 lines
4.7 KiB
Python

import os
import json
import random
import zipfile
from typing import Optional, Any

import gdown  # pip install gdown

from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
class MusiqueQAAdapter(BaseBenchmarkAdapter):
    """
    Adapter to load and process the Musique QA dataset from a local .jsonl file.
    Optionally downloads and unzips the dataset if it does not exist locally.
    """

    dataset_info = {
        # Name of the final file we want to load
        "filename": "musique_ans_v1.0_dev.jsonl",
        # A Google Drive URL (or share link) to the ZIP containing this file
        "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
        # The name of the ZIP archive we expect after downloading
        "zip_filename": "musique_v1.0.zip",
    }

    def load_corpus(
        self,
        limit: Optional[int] = None,
        seed: int = 42,
        auto_download: bool = True,
    ) -> tuple[list[str], list[dict[str, Any]]]:
        """
        Loads the Musique QA dataset.

        :param limit: If set, randomly sample 'limit' items.
        :param seed: Random seed for sampling.
        :param auto_download: If True, attempt to download + unzip the dataset
            from Google Drive if the .jsonl file is not present locally.
        :return: (corpus_list, question_answer_pairs)
        """
        target_filename = self.dataset_info["filename"]

        # 1. Ensure the file is locally available; optionally download if missing
        if not os.path.exists(target_filename):
            if auto_download:
                self._musique_download_file()
            else:
                raise FileNotFoundError(
                    f"Expected dataset file not found: {target_filename}\n"
                    "Set auto_download=True or manually place the file."
                )

        # 2. Read the JSONL file
        with open(target_filename, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]

        # 3. (Optional) sample a subset of items
        if limit is not None and 0 < limit < len(data):
            random.seed(seed)
            data = random.sample(data, limit)

        # 4. Build up corpus_list and question_answer_pairs
        corpus_list = []
        question_answer_pairs = []
        for item in data:
            # Each 'paragraphs' entry is a list of dicts; concatenate their 'paragraph_text'
            paragraphs = item.get("paragraphs", [])
            combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
            corpus_list.append(combined_paragraphs)

            # Adjust keys to match the actual JSON structure if needed;
            # some variants expose "answer_aliases" instead of "answer".
            question = item.get("question", "")
            answer = item.get("answer", "")
            question_answer_pairs.append(
                {
                    "id": item.get("id", ""),
                    "question": question,
                    "answer": answer.lower() if isinstance(answer, str) else answer,
                }
            )

        return corpus_list, question_answer_pairs

    def _musique_download_file(self) -> None:
        """
        Download and unzip the Musique dataset if not already present locally.
        Uses gdown for Google Drive links.
        """
        url = self.dataset_info["download_url"]
        zip_filename = self.dataset_info["zip_filename"]
        target_filename = self.dataset_info["filename"]

        if os.path.exists(target_filename):
            print(f"File '{target_filename}' is already present. Skipping download.")
            return

        print(f"Attempting to download from Google Drive: {url}")
        # Using gdown to download the ZIP from a Google Drive link
        gdown.download(url=url, output=zip_filename, quiet=False, fuzzy=True)

        # Unzip the downloaded file
        if os.path.exists(zip_filename):
            print(f"Unzipping {zip_filename} ...")
            with zipfile.ZipFile(zip_filename, "r") as zip_ref:
                zip_ref.extractall()  # Extract to current directory
            # Optionally remove the ZIP after extraction
            os.remove(zip_filename)
        else:
            raise FileNotFoundError(f"Failed to download the zip file: {zip_filename}")

        # Optional check: ensure the final .jsonl appeared
        if not os.path.exists(target_filename):
            raise FileNotFoundError(
                f"After unzipping, '{target_filename}' not found. "
                "Check the contents of the extracted files."
            )
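

# Minimal usage sketch: load a small sample and print a question.
# auto_download=True assumes network access to Google Drive.
if __name__ == "__main__":
    adapter = MusiqueQAAdapter()
    corpus, qa_pairs = adapter.load_corpus(limit=5, seed=42, auto_download=True)
    print(f"Loaded {len(corpus)} contexts; first question: {qa_pairs[0]['question']}")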