Delete cognee/modules/cognify/dataset.py
This commit is contained in:
parent
8e9040815f
commit
36ada5974d
1 changed file with 0 additions and 84 deletions
|
|
@@ -1,84 +0,0 @@
|
|||
import random
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
from dspy.datasets.dataset import Dataset
|
||||
|
||||
|
||||
class HotPotQA(Dataset):
    """HotPotQA (``fullwiki`` configuration) restricted to hard examples.

    The official Hugging Face ``train`` split is filtered to rows whose
    ``level`` is ``'hard'``, shuffled with a fixed seed and divided 75/25
    into ``_train`` and an unofficial ``_dev``. The official ``validation``
    split is stored as ``_test``.
    """

    def __init__(self, *args, only_hard_examples=True, keep_details='dev_titles', unofficial_dev=True, **kwargs) -> None:
        """Load HotPotQA and populate ``_train``, ``_dev`` and ``_test``.

        Args:
            *args: Forwarded unchanged to ``Dataset.__init__``.
            only_hard_examples: Must be truthy; easy examples are not
                supported.
            keep_details: Selects which fields each training-split example
                keeps. ``True`` keeps ``id``, ``question``, ``answer``,
                ``type``, ``context`` and ``gold_titles``; ``'dev_titles'``
                keeps ``question`` and ``answer`` everywhere plus
                ``gold_titles`` on the dev portion only; any other value
                keeps just ``question`` and ``answer``.
            unofficial_dev: When truthy, the last 25% of the shuffled hard
                training examples become ``_dev``; otherwise ``_dev`` is
                ``None``.
            **kwargs: Forwarded unchanged to ``Dataset.__init__``.

        Raises:
            AssertionError: If ``only_hard_examples`` is falsy, or if an
                example of the official validation split is not ``'hard'``.
        """
        super().__init__(*args, **kwargs)
        # Kept as an assert so callers still see AssertionError. Note that
        # asserts are skipped when Python runs with -O.
        assert only_hard_examples, (
            "Care must be taken when adding support for easy examples. "
            "Dev must be all hard to match official dev, but training can be flexible."
        )

        hf_official_train = load_dataset("hotpot_qa", 'fullwiki', split='train')
        hf_official_dev = load_dataset("hotpot_qa", 'fullwiki', split='validation')

        def with_gold_titles(example):
            # Collapse the raw supporting-facts record into the set of its
            # titles, stored under 'gold_titles'.
            if 'supporting_facts' in example:
                facts = example.pop('supporting_facts')
                example['gold_titles'] = set(facts['title'])
            return example

        # The retained fields depend only on keep_details, so choose them
        # once instead of once per row.
        if keep_details is True:
            keys = ['id', 'question', 'answer', 'type', 'supporting_facts', 'context']
        elif keep_details == 'dev_titles':
            keys = ['question', 'answer', 'supporting_facts']
        else:
            keys = ['question', 'answer']

        official_train = [
            with_gold_titles({k: raw_example[k] for k in keys})
            for raw_example in hf_official_train
            if raw_example['level'] == 'hard'
        ]

        # Fixed seed keeps the train/dev partition reproducible across runs.
        rng = random.Random(0)
        rng.shuffle(official_train)

        split_at = len(official_train) * 75 // 100
        self._train = official_train[:split_at]
        self._dev = official_train[split_at:] if unofficial_dev else None

        # In 'dev_titles' mode the titles are only wanted on the dev portion,
        # so strip them from the training examples.
        if keep_details == 'dev_titles':
            for example in self._train:
                del example['gold_titles']

        test_keys = ['id', 'question', 'answer', 'type', 'supporting_facts']
        test = []
        for raw_example in hf_official_dev:
            assert raw_example['level'] == 'hard'
            test.append(with_gold_titles({k: raw_example[k] for k in test_keys}))

        self._test = test
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: build a small sample and print a few questions so
    # the output can be compared with the reference listing below.
    from dsp.utils import dotdict

    data_args = dotdict(
        train_seed=1,
        train_size=16,
        eval_seed=2023,
        dev_size=200 * 5,
        test_size=0,
    )
    dataset = HotPotQA(**data_args)

    print(dataset)
    for idx in (0, 15):
        print(dataset.train[idx].question)

    print(len(dataset.train), len(dataset.dev), len(dataset.test))

    for idx in (0, 340, 937):
        print(dataset.dev[idx].question)

"""
What was the population of the city where Woodward Avenue ends in 2010?
Where did the star , who is also an executive producer, of the Mick begin her carrer?
16 1000 0
Both London and German have seen attacks during war, there was one specific type of attack that Germany called the blitz, what did London call a similar attack?
Pre-Madonna was a collection of demos by the singer who was a leading presence during the emergence of what network?
Alan Mills composed the classic folk song that tells the story of what?
"""
|
||||
Loading…
Add table
Reference in a new issue