Merge pull request #215 from topoteretes/clean_dspy

Remove dspy logic that confuses
2024-11-14 14:51:51 +01:00 · 2024-11-14 14:51:51 +01:00 · 535d8281b4
commit 535d8281b4
parent c1007091d1 bc2e17592d
4 changed files with 0 additions and 306 deletions
--- a/cognee/modules/cognify/dataset.py
+++ b/cognee/modules/cognify/dataset.py
@@ -1,84 +0,0 @@
-import random
-
-from datasets import load_dataset
-
-from dspy.datasets.dataset import Dataset
-
-
-class HotPotQA(Dataset):
-    def __init__(self, *args, only_hard_examples=True, keep_details='dev_titles', unofficial_dev=True, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        assert only_hard_examples, "Care must be taken when adding support for easy examples." \
-                                   "Dev must be all hard to match official dev, but training can be flexible."
-        
-        hf_official_train = load_dataset("hotpot_qa", 'fullwiki', split='train')
-        hf_official_dev = load_dataset("hotpot_qa", 'fullwiki', split='validation')
-
-        official_train = []
-        for raw_example in hf_official_train:
-            if raw_example['level'] == 'hard':
-                if keep_details is True:
-                    keys = ['id', 'question', 'answer', 'type', 'supporting_facts', 'context']
-                elif keep_details == 'dev_titles':
-                    keys = ['question', 'answer', 'supporting_facts']
-                else:
-                    keys = ['question', 'answer']
-
-                example = {k: raw_example[k] for k in keys}
-                
-                if 'supporting_facts' in example:
-                    example['gold_titles'] = set(example['supporting_facts']['title'])
-                    del example['supporting_facts']
-
-                official_train.append(example)
-
-        rng = random.Random(0)
-        rng.shuffle(official_train)
-
-        self._train = official_train[:len(official_train)*75//100]
-
-        if unofficial_dev:
-            self._dev = official_train[len(official_train)*75//100:]
-        else:
-            self._dev = None
-
-        for example in self._train:
-            if keep_details == 'dev_titles':
-                del example['gold_titles']
-        
-        test = []
-        for raw_example in hf_official_dev:
-            assert raw_example['level'] == 'hard'
-            example = {k: raw_example[k] for k in ['id', 'question', 'answer', 'type', 'supporting_facts']}
-            if 'supporting_facts' in example:
-                example['gold_titles'] = set(example['supporting_facts']['title'])
-                del example['supporting_facts']
-            test.append(example)
-
-        self._test = test
-
-
-if __name__ == '__main__':
-    from dsp.utils import dotdict
-
-    data_args = dotdict(train_seed=1, train_size=16, eval_seed=2023, dev_size=200*5, test_size=0)
-    dataset = HotPotQA(**data_args)
-
-    print(dataset)
-    print(dataset.train[0].question)
-    print(dataset.train[15].question)
-
-    print(len(dataset.train), len(dataset.dev), len(dataset.test))
-
-    print(dataset.dev[0].question)
-    print(dataset.dev[340].question)
-    print(dataset.dev[937].question)
-
-"""
-What was the population of the city where Woodward Avenue ends in 2010?
-Where did the star , who is also an executive producer, of the Mick begin her carrer? 
-16 1000 0
-Both London and German have seen attacks during war, there was one specific type of attack that Germany called the blitz, what did London call a similar attack?
-Pre-Madonna was a collection of demos by the singer who was a leading presence during the emergence of what network?
-Alan Mills composed the classic folk song that tells the story of what? 
-"""
--- a/cognee/modules/cognify/evaluate.py
+++ b/cognee/modules/cognify/evaluate.py
@@ -1,65 +0,0 @@
-import dspy
-from dspy.evaluate.evaluate import Evaluate
-from dspy.primitives.example import Example
-from cognee.modules.data.extraction.knowledge_graph.extract_knowledge_graph_module import ExtractKnowledgeGraph
-from cognee.root_dir import get_absolute_path
-from cognee.shared.data_models import Answer
-from cognee.infrastructure.llm import get_llm_config
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.modules.cognify.dataset import HotPotQA
-
-def evaluate():
-    dataset = HotPotQA(
-        train_seed = 1,
-        train_size = 16,
-        eval_seed = 2023,
-        dev_size = 8,
-        test_size = 0,
-        keep_details = True,
-    )
-
-    #Evaluate
-    evaluate_examples = [
-        Example(
-            base = None,
-            question = None,
-            context = "\r\n".join("".join(sentences) for sentences in example.context["sentences"]),
-            answer = example.answer,
-        ) for example in dataset.dev
-    ]
-
-    devset = [example.with_inputs("context", "question") for example in evaluate_examples]
-
-    evaluate_on_hotpotqa = Evaluate(devset = devset, num_threads = 1, display_progress = True, display_table = 5, max_tokens = 4096)
-
-    llm_config = get_llm_config()
-    gpt4 = dspy.OpenAI(model = llm_config.llm_model, api_key = llm_config.llm_api_key, model_type = "chat", max_tokens = 4096)
-    compiled_extract_knowledge_graph = ExtractKnowledgeGraph(lm = gpt4)
-    compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
-
-    def evaluate_answer(example, graph_prediction, trace = None):
-        llm_client = get_llm_client()
-
-        try:
-            answer_prediction = llm_client.create_structured_output(
-                text_input = example.question,
-                system_prompt = f"""Answer the question by looking at the provided knowledge graph.
-                Use only the graph to answer the question and be very brief.
-                This is the knowledge graph:
-                {graph_prediction.graph.model_dump(mode = "json")}""",
-                response_model = Answer,
-            )
-        except:
-            return False
-
-        return dsp.answer_match(example.answer, [answer_prediction.answer], frac = 0.8) or \
-            dsp.passage_match([example.answer], [answer_prediction.answer])
-
-    gpt4 = dspy.OpenAI(model = llm_config.llm_model, api_key = llm_config.llm_api_key, model_type = "chat", max_tokens = 4096)
-    dspy.settings.configure(lm = gpt4)
-
-    evaluate_on_hotpotqa(compiled_extract_knowledge_graph, metric = evaluate_answer)
-
-
-if __name__ == "__main__":
-    evaluate()
--- a/cognee/modules/cognify/test.py
+++ b/cognee/modules/cognify/test.py
@@ -1,89 +0,0 @@
-import dspy
-from cognee.modules.data.extraction.knowledge_graph.extract_knowledge_graph_module import ExtractKnowledgeGraph
-from cognee.root_dir import get_absolute_path
-from cognee.infrastructure.llm import get_llm_config
-
-def run():
-    llm_config = get_llm_config()
-    gpt4 = dspy.OpenAI(model = llm_config.llm_model, api_key = llm_config.llm_api_key, model_type = "chat", max_tokens = 4096)
-    compiled_extract_knowledge_graph = ExtractKnowledgeGraph(lm = gpt4)
-    compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
-
-    text = """The 1985 FA Charity Shield (also known as the General Motors FA
-              Charity Shield for sponsorship reasons) was the 63rd FA Charity Shield,
-              an annual football match played between the winners of the previous
-              season's First Division and FA Cup competitions. The match was played on
-              10 August 1985 at Wembley Stadium and contested by Everton,
-              who had won the 1984\u201385 First Division, and Manchester United,
-              who had won the 1984\u201385 FA Cup. Everton won 2\u20130 with goals from
-              Trevor Steven and Adrian Heath. Trevor Steven put Everton into the lead
-              when he swept home from six yards after a cross from the left in the first half.
-              The second goal came in the second half when Manchester United goalkeeper
-              Gary Bailey dropped a cross from the left to allow Adrian Heath to tip the
-              ball past him into the left corner of the net.\r\nThe 1995 FA Charity Shield
-              (also known as the Littlewoods FA Charity Shield for sponsorship reasons) was the
-              73rd FA Charity Shield, an annual football match played between the winners of
-              the previous season's Premier League and FA Cup competitions. The match was
-              played on 13 August 1995 at Wembley Stadium and contested by Blackburn Rovers,
-              who had won the Premier League and FA Cup winners Everton. It was Blackburn's
-              second successive Charity Shield appearance, while Everton were appearing in
-              their eleventh and their first since 1987. Everton won the match 1\u20130
-              with a goal from Vinny Samways when he caught Tim Flowers off his line and
-              lifted the ball over him from the left of the penalty area and into the right
-              corner of the net. Dave Watson lifted the trophy for Everton.\r\nThe 1972 FA
-              Charity Shield was contested between Manchester City and Aston Villa.\r\nThe
-              1997 FA Charity Shield (known as the Littlewoods FA Charity Shield for
-              sponsorship reasons) was the 75th FA Charity Shield, an annual football match
-              played between the winners of the previous season's Premier League and
-              FA Cup competitions. The match was played on 3 August 1997 at Wembley Stadium
-              and contested by Manchester United, who had won the 1996\u201397 FA Premier League,
-              and Chelsea, who had won the 1996\u201397 FA Cup. Manchester United won the match
-              4\u20132 on penalties after the match had finished at 1\u20131 after 90 minutes.
-              \r\nThe 1956 FA Charity Shield was the 34th FA Charity Shield, an annual football
-              match held between the winners of the previous season's Football League and
-              FA Cup competitions. The match was contested by Manchester United, who had won
-              the 1955\u201356 Football League, and Manchester City, who had won the
-              1955\u201356 FA Cup, at Maine Road, Manchester, on 24 October 1956. Manchester
-              United won the match 1\u20130, Dennis Viollet scoring the winning goal.
-              Manchester United goalkeeper David Gaskell made his debut for the club during
-              the game, taking the place of injured goalkeeper Ray Wood, and, at the age of
-              16 years and 19 days, became the youngest player ever to play for the club.
-              \r\nThe 1937 FA Charity Shield was the 24th FA Charity Shield, a football match
-              between the winners of the previous season's First Division and FA Cup competitions.
-              The match was contested by league champions Manchester City and FA Cup winners
-              Sunderland, and was played at Maine Road, the home ground of Manchester City.
-              Manchester City won the game, 2\u20130.\r\nThe 2000 FA Charity Shield (also known
-              as the One 2 One FA Charity Shield for sponsorship reasons) was the
-              78th FA Charity Shield, an annual football match played between the winners
-              of the previous season's Premier League and FA Cup competitions. The match
-              was played between Manchester United, who won the 1999\u20132000 Premier League,
-              and Chelsea, who won the 1999\u20132000 FA Cup, and resulted in a 2\u20130 Chelsea win.
-              The goals were scored by Jimmy Floyd Hasselbaink and Mario Melchiot. Roy Keane
-              was sent off for a challenge on Gustavo Poyet and was the last person to be
-              sent off at the old Wembley Stadium.\r\nThe 2001 FA Charity Shield (also known
-              as the One 2 One FA Charity Shield for sponsorship reasons) was the 79th FA Charity Shield,
-              an annual football match played between the winners of the previous season's
-              Premier League and FA Cup. The match was contested between Liverpool, winners of
-              the 2000\u201301 FA Cup and Manchester United, who won the 2000\u201301 Premier
-              League on 12 August 2001. It was the first Shield match to be held at the
-              Millennium Stadium following the closure of Wembley Stadium for reconstruction.
-              \r\nAston Villa Football Club ( ; nicknamed Villa, The Villa, The Villans
-              and The Lions) is a professional football club in Aston, Birmingham, that plays
-              in the Championship, the second level of English football. Founded in 1874,
-              they have played at their current home ground, Villa Park, since 1897. Aston Villa
-              were one of the founder members of the Football League in 1888 and of the
-              Premier League in 1992.\r\nThe 1996 FA Charity Shield (also known as the
-              Littlewoods FA Charity Shield for sponsorship reasons) was the 74th FA Charity Shield,
-              an annual football match played between the winners of the previous season's Premier
-              League and FA Cup competitions. The match was played on 11 August 1996 at Wembley
-              Stadium and contested by Manchester United, who had won the Double of Premier League
-              and FA Cup in 1995\u201396, and Newcastle United, who had finished as runners-up
-              in the Premier League. Manchester United won the match 4\u20130 with goals from
-              Eric Cantona, Nicky Butt, David Beckham and Roy Keane."""
-
-    prediction = compiled_extract_knowledge_graph(context = text, question = "")
-
-    print(prediction.graph)
-
-if __name__ == "__main__":
-    run()
--- a/cognee/modules/cognify/train.py
+++ b/cognee/modules/cognify/train.py
@@ -1,68 +0,0 @@
-import dspy
-from dspy.teleprompt import BootstrapFewShot
-from dspy.primitives.example import Example
-from cognee.modules.data.extraction.knowledge_graph.extract_knowledge_graph import ExtractKnowledgeGraph
-from cognee.root_dir import get_absolute_path
-from cognee.infrastructure.files.storage import LocalStorage
-from cognee.shared.data_models import Answer
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.modules.cognify.dataset import HotPotQA
-from cognee.infrastructure.llm import get_llm_config
-
-def train():
-    colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = "http://20.102.90.50:2017/wiki17_abstracts")
-
-    dspy.configure(rm = colbertv2_wiki17_abstracts)
-
-    def evaluate_answer(example, graph_prediction, trace = None):
-        llm_client = get_llm_client()
-
-        try:
-            answer_prediction = llm_client.create_structured_output(
-                text_input = example.question,
-                system_prompt = f"""Answer the question by looking at the provided knowledge graph.
-                Use only the graph to answer the question and be very brief.
-                This is the knowledge graph:
-                {graph_prediction.graph.model_dump(mode = "json")}""",
-                response_model = Answer,
-            )
-        except:
-            return False
-
-        return dsp.answer_match(example.answer, [answer_prediction.answer], frac = 0.8) or \
-            dsp.passage_match([example.answer], [answer_prediction.answer])
-
-    optimizer = BootstrapFewShot(metric = evaluate_answer)
-
-    dataset = HotPotQA(
-        train_seed = 1,
-        train_size = 16,
-        eval_seed = 2023,
-        dev_size = 8,
-        test_size = 0,
-        keep_details = True,
-    )
-
-    # Train
-    train_examples = [
-        Example(
-            base = None,
-            question = example.question,
-            context = "\r\n".join("".join(sentences) for sentences in example.context["sentences"]),
-            answer = example.answer,
-        ) for example in dataset.train
-    ]
-
-    trainset = [example.with_inputs("context", "question") for example in train_examples]
-
-    llm_config = get_llm_config()
-    gpt4 = dspy.OpenAI(model = llm_config.llm_model, api_key = llm_config.llm_api_key, model_type = "chat", max_tokens = 4096)
-
-    compiled_extract_knowledge_graph = optimizer.compile(ExtractKnowledgeGraph(lm = gpt4), trainset = trainset)
-
-    # Save program
-    LocalStorage.ensure_directory_exists(get_absolute_path("./programs/extract_knowledge_graph"))
-    compiled_extract_knowledge_graph.save(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
-
-if __name__ == "__main__":
-    train()