* QA eval dataset as argument, with hotpot and 2wikimultihop as options; JSON schema validation for datasets
* Load dataset file by filename; outsource utilities
* Restructure metric selection
* Add comprehensiveness, diversity, and empowerment metrics
* Add promptfoo as an option
* Refactor RAG solution in eval
* Implement LLM-as-a-judge metrics in a uniform way
* Use requests.get instead of wget
* Clean up promptfoo config template
* Get promptfoo path instead of hardcoding it
* Add LLM-as-a-judge prompts
* Minor refactor, fixes, and logger usage
82 lines · 2.6 KiB · Python
import json
from pathlib import Path

import requests
from jsonschema import ValidationError, validate

from cognee.root_dir import get_absolute_path

# Supported QA eval datasets, keyed by the names accepted as arguments.
qa_datasets = {
    "hotpotqa": {
        "filename": "hotpot_dev_fullwiki_v1.json",
        "URL": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json",
    },
    "2wikimultihop": {
        "filename": "data/dev.json",
        "URL": "https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1",
    },
}

# Each dataset must be a JSON array of objects carrying at least an answer,
# a question, and a context list; extra fields are allowed.
qa_json_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "answer": {"type": "string"},
            "question": {"type": "string"},
            "context": {"type": "array"},
        },
        "required": ["answer", "question", "context"],
        "additionalProperties": True,
    },
}

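# For illustration, a HotpotQA-style record that passes the schema above
# (the concrete values here are an assumption, not taken from this file):
#
#     {
#         "question": "Which magazine was started first, Arthur's Magazine or First for Women?",
#         "answer": "Arthur's Magazine",
#         "context": [["Arthur's Magazine", ["Arthur's Magazine (1844-1846) was an American literary periodical."]]],
#         "_id": "...",  # extra fields are tolerated via additionalProperties
#     }
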
def download_qa_dataset(dataset_name: str, filepath: Path):
    if dataset_name not in qa_datasets:
        raise ValueError(f"{dataset_name} is not a supported dataset.")

    url = qa_datasets[dataset_name]["URL"]

    # The 2wikimultihop archive is hosted on Dropbox and is not fetched
    # automatically here; the caller must download and unzip it manually.
    if dataset_name == "2wikimultihop":
        raise Exception(
            "Please download the 2wikimultihop dataset (data.zip) manually from "
            "https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1 "
            "and unzip it."
        )

    # Stream the response in 8 KiB chunks so large dataset files are never
    # held in memory in full.
    response = requests.get(url, stream=True)

    if response.status_code == 200:
        with open(filepath, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Dataset {dataset_name} downloaded and saved to {filepath}")
    else:
        print(f"Failed to download {dataset_name}. Status code: {response.status_code}")

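# A minimal direct-use sketch (the target path below is hypothetical;
# load_qa_dataset derives the real one from cognee's data root):
#
#     download_qa_dataset("hotpotqa", Path(".data/hotpot_dev_fullwiki_v1.json"))
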
def load_qa_dataset(dataset_name_or_filename: str) -> list[dict]:
    if dataset_name_or_filename in qa_datasets:
        # Known dataset name: resolve its file under cognee's data root and
        # download it on first use.
        dataset_name = dataset_name_or_filename
        filename = qa_datasets[dataset_name]["filename"]

        data_root_dir = Path(get_absolute_path("../.data"))
        data_root_dir.mkdir(parents=True, exist_ok=True)

        filepath = data_root_dir / filename
        if not filepath.exists():
            download_qa_dataset(dataset_name, filepath)
    else:
        # Otherwise treat the argument as a path to a local dataset file.
        filename = dataset_name_or_filename
        filepath = Path(filename)

    with open(filepath, "r") as file:
        dataset = json.load(file)

    try:
        validate(instance=dataset, schema=qa_json_schema)
    except ValidationError as e:
        raise ValidationError(f"Invalid QA dataset: {e.message}") from e

    return dataset
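# Usage sketch (illustrative): a registry name resolves to a file under the
# data root and is downloaded on first use; anything else is treated as a
# path to a local JSON file. Both are schema-validated before being returned.
#
#     dataset = load_qa_dataset("hotpotqa")        # by registry name
#     dataset = load_qa_dataset("my_own_qa.json")  # or by local filename (hypothetical)
#     print(len(dataset), dataset[0]["question"])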