Delete cognee/modules/cognify/dataset.py

2024-11-14 14:49:45 +01:00 · 2024-11-14 14:49:45 +01:00 · 36ada5974d
commit 36ada5974d
parent 8e9040815f
1 changed files with 0 additions and 84 deletions
--- a/cognee/modules/cognify/dataset.py
+++ b/cognee/modules/cognify/dataset.py
@ -1,84 +0,0 @@
-import random
-
-from datasets import load_dataset
-
-from dspy.datasets.dataset import Dataset
-
-
-class HotPotQA(Dataset):
-    def __init__(self, *args, only_hard_examples=True, keep_details='dev_titles', unofficial_dev=True, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        assert only_hard_examples, "Care must be taken when adding support for easy examples." \
-                                   "Dev must be all hard to match official dev, but training can be flexible."
-        
-        hf_official_train = load_dataset("hotpot_qa", 'fullwiki', split='train')
-        hf_official_dev = load_dataset("hotpot_qa", 'fullwiki', split='validation')
-
-        official_train = []
-        for raw_example in hf_official_train:
-            if raw_example['level'] == 'hard':
-                if keep_details is True:
-                    keys = ['id', 'question', 'answer', 'type', 'supporting_facts', 'context']
-                elif keep_details == 'dev_titles':
-                    keys = ['question', 'answer', 'supporting_facts']
-                else:
-                    keys = ['question', 'answer']
-
-                example = {k: raw_example[k] for k in keys}
-                
-                if 'supporting_facts' in example:
-                    example['gold_titles'] = set(example['supporting_facts']['title'])
-                    del example['supporting_facts']
-
-                official_train.append(example)
-
-        rng = random.Random(0)
-        rng.shuffle(official_train)
-
-        self._train = official_train[:len(official_train)*75//100]
-
-        if unofficial_dev:
-            self._dev = official_train[len(official_train)*75//100:]
-        else:
-            self._dev = None
-
-        for example in self._train:
-            if keep_details == 'dev_titles':
-                del example['gold_titles']
-        
-        test = []
-        for raw_example in hf_official_dev:
-            assert raw_example['level'] == 'hard'
-            example = {k: raw_example[k] for k in ['id', 'question', 'answer', 'type', 'supporting_facts']}
-            if 'supporting_facts' in example:
-                example['gold_titles'] = set(example['supporting_facts']['title'])
-                del example['supporting_facts']
-            test.append(example)
-
-        self._test = test
-
-
-if __name__ == '__main__':
-    from dsp.utils import dotdict
-
-    data_args = dotdict(train_seed=1, train_size=16, eval_seed=2023, dev_size=200*5, test_size=0)
-    dataset = HotPotQA(**data_args)
-
-    print(dataset)
-    print(dataset.train[0].question)
-    print(dataset.train[15].question)
-
-    print(len(dataset.train), len(dataset.dev), len(dataset.test))
-
-    print(dataset.dev[0].question)
-    print(dataset.dev[340].question)
-    print(dataset.dev[937].question)
-
-"""
-What was the population of the city where Woodward Avenue ends in 2010?
-Where did the star , who is also an executive producer, of the Mick begin her carrer? 
-16 1000 0
-Both London and German have seen attacks during war, there was one specific type of attack that Germany called the blitz, what did London call a similar attack?
-Pre-Madonna was a collection of demos by the singer who was a leading presence during the emergence of what network?
-Alan Mills composed the classic folk song that tells the story of what? 
-"""