Merge pull request #215 from topoteretes/clean_dspy

Remove dspy logic that confuses
2024-11-14 14:51:51 +01:00 · 2024-11-14 14:51:51 +01:00 · 535d8281b4
commit 535d8281b4
parent c1007091d1 bc2e17592d
4 changed files with 0 additions and 306 deletions
--- a/cognee/modules/cognify/dataset.py
+++ b/cognee/modules/cognify/dataset.py
@ -1,84 +0,0 @@
 import random
 from datasets import load_dataset
 from dspy.datasets.dataset import Dataset
 class HotPotQA(Dataset):
    """HotPotQA ("fullwiki") restricted to hard questions, wrapped as a dspy ``Dataset``.

    The hard examples of the official train split are shuffled with a fixed
    seed and divided 75/25 into ``_train`` and an unofficial ``_dev``; the
    official validation split becomes ``_test``.
    """

    def __init__(self, *args, only_hard_examples=True, keep_details='dev_titles', unofficial_dev=True, **kwargs) -> None:
        """Load both Hugging Face splits and build the train/dev/test example lists.

        Args:
            *args, **kwargs: Forwarded unchanged to ``Dataset.__init__``.
            only_hard_examples: Must be truthy; easy examples are not supported.
            keep_details: ``True`` keeps id/question/answer/type/context plus
                gold titles; ``'dev_titles'`` keeps question/answer and leaves
                gold titles on the dev examples only; any other value keeps
                just question and answer.
            unofficial_dev: When truthy, the tail 25% of the shuffled hard
                training examples become ``_dev``; otherwise ``_dev`` is ``None``.
        """
        super().__init__(*args, **kwargs)
        assert only_hard_examples, "Care must be taken when adding support for easy examples." \
                                   "Dev must be all hard to match official dev, but training can be flexible."

        def to_example(record, fields):
            # Keep only `fields`; supporting facts are reduced to the set of their titles.
            projected = {field: record[field] for field in fields}
            if 'supporting_facts' in projected:
                projected['gold_titles'] = set(projected.pop('supporting_facts')['title'])
            return projected

        hf_official_train = load_dataset("hotpot_qa", 'fullwiki', split='train')
        hf_official_dev = load_dataset("hotpot_qa", 'fullwiki', split='validation')

        # The field selection depends only on `keep_details`, so pick it once.
        if keep_details is True:
            train_fields = ['id', 'question', 'answer', 'type', 'supporting_facts', 'context']
        elif keep_details == 'dev_titles':
            train_fields = ['question', 'answer', 'supporting_facts']
        else:
            train_fields = ['question', 'answer']

        hard_examples = [
            to_example(record, train_fields)
            for record in hf_official_train
            if record['level'] == 'hard'
        ]
        # Fixed seed keeps the train/dev partition reproducible across runs.
        random.Random(0).shuffle(hard_examples)

        split_at = len(hard_examples) * 75 // 100
        self._train = hard_examples[:split_at]
        self._dev = hard_examples[split_at:] if unofficial_dev else None

        # In 'dev_titles' mode the gold titles stay on dev examples only.
        if keep_details == 'dev_titles':
            for train_example in self._train:
                del train_example['gold_titles']

        test_fields = ['id', 'question', 'answer', 'type', 'supporting_facts']
        held_out = []
        for record in hf_official_dev:
            assert record['level'] == 'hard'
            held_out.append(to_example(record, test_fields))
        self._test = held_out
 if __name__ == '__main__':
    from dsp.utils import dotdict

    # Smoke run: build a small sample, then print a few questions and the split sizes.
    data_args = dotdict(train_seed=1, train_size=16, eval_seed=2023, dev_size=200*5, test_size=0)
    dataset = HotPotQA(**data_args)
    print(dataset)

    for train_index in (0, 15):
        print(dataset.train[train_index].question)

    print(len(dataset.train), len(dataset.dev), len(dataset.test))

    for dev_index in (0, 340, 937):
        print(dataset.dev[dev_index].question)
 """
 What was the population of the city where Woodward Avenue ends in 2010?
 Where did the star , who is also an executive producer, of the Mick begin her carrer? 
 16 1000 0
 Both London and German have seen attacks during war, there was one specific type of attack that Germany called the blitz, what did London call a similar attack?
 Pre-Madonna was a collection of demos by the singer who was a leading presence during the emergence of what network?
 Alan Mills composed the classic folk song that tells the story of what? 
 """
--- a/cognee/modules/cognify/evaluate.py
+++ b/cognee/modules/cognify/evaluate.py
@ -1,65 +0,0 @@
 import dspy
 from dspy.evaluate.evaluate import Evaluate
 from dspy.primitives.example import Example
 from cognee.modules.data.extraction.knowledge_graph.extract_knowledge_graph_module import ExtractKnowledgeGraph
 from cognee.root_dir import get_absolute_path
 from cognee.shared.data_models import Answer
 from cognee.infrastructure.llm import get_llm_config
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.modules.cognify.dataset import HotPotQA
 def evaluate():
    """Score the saved knowledge-graph extraction program on HotPotQA dev examples.

    Builds a small HotPotQA sample (16 train / 8 dev), converts every dev
    example into a dspy ``Example`` with ``context`` and ``question`` inputs,
    loads the compiled ``ExtractKnowledgeGraph`` program from disk and runs
    dspy's ``Evaluate`` over it with an LLM-backed answer-matching metric.
    """
    # The metric below relies on the matching helpers of `dsp`; this module
    # only imports `dspy`, so the name has to be brought into scope here.
    import dsp

    dataset = HotPotQA(
        train_seed = 1,
        train_size = 16,
        eval_seed = 2023,
        dev_size = 8,
        test_size = 0,
        keep_details = True,
    )

    # Evaluate
    evaluate_examples = [
        Example(
            base = None,
            # The metric asks the LLM this question, so it must be carried over
            # from the dataset (same construction as the training examples).
            question = example.question,
            context = "\r\n".join("".join(sentences) for sentences in example.context["sentences"]),
            answer = example.answer,
        ) for example in dataset.dev
    ]

    devset = [example.with_inputs("context", "question") for example in evaluate_examples]

    evaluate_on_hotpotqa = Evaluate(devset = devset, num_threads = 1, display_progress = True, display_table = 5, max_tokens = 4096)

    llm_config = get_llm_config()
    # One client serves both the compiled program and the global dspy settings.
    gpt4 = dspy.OpenAI(model = llm_config.llm_model, api_key = llm_config.llm_api_key, model_type = "chat", max_tokens = 4096)

    compiled_extract_knowledge_graph = ExtractKnowledgeGraph(lm = gpt4)
    compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))

    def evaluate_answer(example, graph_prediction, trace = None):
        """Return a truthy value when the answer derived from the predicted graph matches the gold answer.

        The LLM is asked ``example.question`` with the predicted graph as its
        only context; any failure while producing that answer scores False.
        """
        llm_client = get_llm_client()

        try:
            answer_prediction = llm_client.create_structured_output(
                text_input = example.question,
                system_prompt = f"""Answer the question by looking at the provided knowledge graph.
                Use only the graph to answer the question and be very brief.
                This is the knowledge graph:
                {graph_prediction.graph.model_dump(mode = "json")}""",
                response_model = Answer,
            )
        except Exception:
            # Best effort: a failed LLM call counts as a wrong answer, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return False

        return dsp.answer_match(example.answer, [answer_prediction.answer], frac = 0.8) or \
            dsp.passage_match([example.answer], [answer_prediction.answer])

    dspy.settings.configure(lm = gpt4)

    evaluate_on_hotpotqa(compiled_extract_knowledge_graph, metric = evaluate_answer)
 # Run the evaluation when this file is executed directly as a script.
 if __name__ == "__main__":
    evaluate()
--- a/cognee/modules/cognify/test.py
+++ b/cognee/modules/cognify/test.py
@ -1,89 +0,0 @@
 import dspy
 from cognee.modules.data.extraction.knowledge_graph.extract_knowledge_graph_module import ExtractKnowledgeGraph
 from cognee.root_dir import get_absolute_path
 from cognee.infrastructure.llm import get_llm_config
 def run():
    """Run the saved knowledge-graph extraction program on a fixed sample text and print the graph."""
    # Sample input: several FA Charity Shield passages joined by "\r\n".
    passages = """The 1985 FA Charity Shield (also known as the General Motors FA
              Charity Shield for sponsorship reasons) was the 63rd FA Charity Shield,
              an annual football match played between the winners of the previous
              season's First Division and FA Cup competitions. The match was played on
              10 August 1985 at Wembley Stadium and contested by Everton,
              who had won the 1984\u201385 First Division, and Manchester United,
              who had won the 1984\u201385 FA Cup. Everton won 2\u20130 with goals from
              Trevor Steven and Adrian Heath. Trevor Steven put Everton into the lead
              when he swept home from six yards after a cross from the left in the first half.
              The second goal came in the second half when Manchester United goalkeeper
              Gary Bailey dropped a cross from the left to allow Adrian Heath to tip the
              ball past him into the left corner of the net.\r\nThe 1995 FA Charity Shield
              (also known as the Littlewoods FA Charity Shield for sponsorship reasons) was the
              73rd FA Charity Shield, an annual football match played between the winners of
              the previous season's Premier League and FA Cup competitions. The match was
              played on 13 August 1995 at Wembley Stadium and contested by Blackburn Rovers,
              who had won the Premier League and FA Cup winners Everton. It was Blackburn's
              second successive Charity Shield appearance, while Everton were appearing in
              their eleventh and their first since 1987. Everton won the match 1\u20130
              with a goal from Vinny Samways when he caught Tim Flowers off his line and
              lifted the ball over him from the left of the penalty area and into the right
              corner of the net. Dave Watson lifted the trophy for Everton.\r\nThe 1972 FA
              Charity Shield was contested between Manchester City and Aston Villa.\r\nThe
              1997 FA Charity Shield (known as the Littlewoods FA Charity Shield for
              sponsorship reasons) was the 75th FA Charity Shield, an annual football match
              played between the winners of the previous season's Premier League and
              FA Cup competitions. The match was played on 3 August 1997 at Wembley Stadium
              and contested by Manchester United, who had won the 1996\u201397 FA Premier League,
              and Chelsea, who had won the 1996\u201397 FA Cup. Manchester United won the match
              4\u20132 on penalties after the match had finished at 1\u20131 after 90 minutes.
              \r\nThe 1956 FA Charity Shield was the 34th FA Charity Shield, an annual football
              match held between the winners of the previous season's Football League and
              FA Cup competitions. The match was contested by Manchester United, who had won
              the 1955\u201356 Football League, and Manchester City, who had won the
              1955\u201356 FA Cup, at Maine Road, Manchester, on 24 October 1956. Manchester
              United won the match 1\u20130, Dennis Viollet scoring the winning goal.
              Manchester United goalkeeper David Gaskell made his debut for the club during
              the game, taking the place of injured goalkeeper Ray Wood, and, at the age of
              16 years and 19 days, became the youngest player ever to play for the club.
              \r\nThe 1937 FA Charity Shield was the 24th FA Charity Shield, a football match
              between the winners of the previous season's First Division and FA Cup competitions.
              The match was contested by league champions Manchester City and FA Cup winners
              Sunderland, and was played at Maine Road, the home ground of Manchester City.
              Manchester City won the game, 2\u20130.\r\nThe 2000 FA Charity Shield (also known
              as the One 2 One FA Charity Shield for sponsorship reasons) was the
              78th FA Charity Shield, an annual football match played between the winners
              of the previous season's Premier League and FA Cup competitions. The match
              was played between Manchester United, who won the 1999\u20132000 Premier League,
              and Chelsea, who won the 1999\u20132000 FA Cup, and resulted in a 2\u20130 Chelsea win.
              The goals were scored by Jimmy Floyd Hasselbaink and Mario Melchiot. Roy Keane
              was sent off for a challenge on Gustavo Poyet and was the last person to be
              sent off at the old Wembley Stadium.\r\nThe 2001 FA Charity Shield (also known
              as the One 2 One FA Charity Shield for sponsorship reasons) was the 79th FA Charity Shield,
              an annual football match played between the winners of the previous season's
              Premier League and FA Cup. The match was contested between Liverpool, winners of
              the 2000\u201301 FA Cup and Manchester United, who won the 2000\u201301 Premier
              League on 12 August 2001. It was the first Shield match to be held at the
              Millennium Stadium following the closure of Wembley Stadium for reconstruction.
              \r\nAston Villa Football Club ( ; nicknamed Villa, The Villa, The Villans
              and The Lions) is a professional football club in Aston, Birmingham, that plays
              in the Championship, the second level of English football. Founded in 1874,
              they have played at their current home ground, Villa Park, since 1897. Aston Villa
              were one of the founder members of the Football League in 1888 and of the
              Premier League in 1992.\r\nThe 1996 FA Charity Shield (also known as the
              Littlewoods FA Charity Shield for sponsorship reasons) was the 74th FA Charity Shield,
              an annual football match played between the winners of the previous season's Premier
              League and FA Cup competitions. The match was played on 11 August 1996 at Wembley
              Stadium and contested by Manchester United, who had won the Double of Premier League
              and FA Cup in 1995\u201396, and Newcastle United, who had finished as runners-up
              in the Premier League. Manchester United won the match 4\u20130 with goals from
              Eric Cantona, Nicky Butt, David Beckham and Roy Keane."""

    # Build the LM client from the project's LLM configuration.
    config = get_llm_config()
    language_model = dspy.OpenAI(
        model = config.llm_model,
        api_key = config.llm_api_key,
        model_type = "chat",
        max_tokens = 4096,
    )

    # Restore the previously saved program state before running it.
    program_path = get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json")
    extractor = ExtractKnowledgeGraph(lm = language_model)
    extractor.load(program_path)

    # The question is left empty; only the context drives the extraction here.
    result = extractor(context = passages, question = "")

    print(result.graph)
 # Run the extraction demo when this file is executed directly as a script.
 if __name__ == "__main__":
    run()
--- a/cognee/modules/cognify/train.py
+++ b/cognee/modules/cognify/train.py
@ -1,68 +0,0 @@
 import dspy
 from dspy.teleprompt import BootstrapFewShot
 from dspy.primitives.example import Example
 from cognee.modules.data.extraction.knowledge_graph.extract_knowledge_graph import ExtractKnowledgeGraph
 from cognee.root_dir import get_absolute_path
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.shared.data_models import Answer
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.modules.cognify.dataset import HotPotQA
 from cognee.infrastructure.llm import get_llm_config
 def train():
    """Compile the knowledge-graph extraction program with few-shot bootstrapping and save it.

    Configures a ColBERTv2 retriever, builds training examples from a small
    HotPotQA sample (16 train / 8 dev), optimizes ``ExtractKnowledgeGraph``
    with ``BootstrapFewShot`` using an LLM-backed answer-matching metric, and
    writes the compiled program to ``./programs/extract_knowledge_graph``.
    """
    # The metric below relies on the matching helpers of `dsp`; this module
    # only imports `dspy`, so the name has to be brought into scope here.
    import dsp

    # NOTE(review): the retriever endpoint is a hard-coded IP address; consider
    # moving it into configuration.
    colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = "http://20.102.90.50:2017/wiki17_abstracts")

    dspy.configure(rm = colbertv2_wiki17_abstracts)

    def evaluate_answer(example, graph_prediction, trace = None):
        """Return a truthy value when the answer derived from the predicted graph matches the gold answer.

        The LLM is asked ``example.question`` with the predicted graph as its
        only context; any failure while producing that answer scores False.
        """
        llm_client = get_llm_client()

        try:
            answer_prediction = llm_client.create_structured_output(
                text_input = example.question,
                system_prompt = f"""Answer the question by looking at the provided knowledge graph.
                Use only the graph to answer the question and be very brief.
                This is the knowledge graph:
                {graph_prediction.graph.model_dump(mode = "json")}""",
                response_model = Answer,
            )
        except Exception:
            # Best effort: a failed LLM call counts as a wrong answer, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return False

        return dsp.answer_match(example.answer, [answer_prediction.answer], frac = 0.8) or \
            dsp.passage_match([example.answer], [answer_prediction.answer])

    optimizer = BootstrapFewShot(metric = evaluate_answer)

    dataset = HotPotQA(
        train_seed = 1,
        train_size = 16,
        eval_seed = 2023,
        dev_size = 8,
        test_size = 0,
        keep_details = True,
    )

    # Train
    train_examples = [
        Example(
            base = None,
            question = example.question,
            # Each context entry is a list of sentences; join sentences per
            # entry, then join the entries with CRLF.
            context = "\r\n".join("".join(sentences) for sentences in example.context["sentences"]),
            answer = example.answer,
        ) for example in dataset.train
    ]

    trainset = [example.with_inputs("context", "question") for example in train_examples]

    llm_config = get_llm_config()
    gpt4 = dspy.OpenAI(model = llm_config.llm_model, api_key = llm_config.llm_api_key, model_type = "chat", max_tokens = 4096)

    compiled_extract_knowledge_graph = optimizer.compile(ExtractKnowledgeGraph(lm = gpt4), trainset = trainset)

    # Save program
    LocalStorage.ensure_directory_exists(get_absolute_path("./programs/extract_knowledge_graph"))
    compiled_extract_knowledge_graph.save(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
 # Run the training/compilation when this file is executed directly as a script.
 if __name__ == "__main__":
    train()