From 2e010f8dd1c892ef63143153618320057f62b4ea Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Fri, 17 Jan 2025 14:16:48 +0100 Subject: [PATCH] Incremental eval of cognee pipeline (#445) * QA eval dataset as argument, with hotpot and 2wikimultihop as options. Json schema validation for datasets. * Load dataset file by filename, outsource utilities * restructure metric selection * Add comprehensiveness, diversity and empowerment metrics * add promptfoo as an option * refactor RAG solution in eval * LLM as a judge metrics implemented in a uniform way * Use requests.get instead of wget * clean up promptfoo config template * minor fixes * get promptfoo path instead of hardcoding * minor fixes * Add LLM as a judge prompts * Support 4 different rag options in eval * Minor refactor and logger usage * feat: make tasks a configurable argument in the cognify function * Run eval on a set of parameters and save results as json and png * fix: add data points task * script for running all param combinations * enable context provider to get tasks as param * bugfix in simple rag * Incremental eval of cognee pipeline * potential fix: single asyncio run * temp fix: exclude insights * Remove insights, have single asyncio run, refactor * minor fixes * handle pipeline slices in utils * include all options in params json --------- Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com> Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com> --- evals/eval_on_hotpot.py | 38 ++++++++++++++++++++++---- evals/official_hotpot_metrics.py | 3 -- evals/qa_context_provider_utils.py | 44 +++++++++++++++++++++++------- evals/qa_eval_parameters.json | 3 +- evals/qa_eval_utils.py | 2 +- 5 files changed, 69 insertions(+), 21 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index adef0d160..c6bb86baa 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -9,7 +9,7 @@ from
cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt from evals.qa_dataset_utils import load_qa_dataset from evals.qa_metrics_utils import get_metrics -from evals.qa_context_provider_utils import qa_context_providers +from evals.qa_context_provider_utils import qa_context_providers, create_cognee_context_getter logger = logging.getLogger(__name__) @@ -94,14 +94,29 @@ async def eval_on_QA_dataset( return results -if __name__ == "__main__": +async def incremental_eval_on_QA_dataset( + dataset_name_or_filename: str, num_samples, metric_name_list +): + pipeline_slice_names = ["base", "extract_chunks", "extract_graph", "summarize"] + + incremental_results = {} + for pipeline_slice_name in pipeline_slice_names: + results = await eval_on_QA_dataset( + dataset_name_or_filename, pipeline_slice_name, num_samples, metric_name_list + ) + incremental_results[pipeline_slice_name] = results + + return incremental_results + + +async def main(): parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, required=True, help="Which dataset to evaluate on") parser.add_argument( "--rag_option", type=str, - choices=qa_context_providers.keys(), + choices=list(qa_context_providers.keys()) + ["cognee_incremental"], required=True, help="RAG option to use for providing context", ) @@ -110,7 +125,18 @@ if __name__ == "__main__": args = parser.parse_args() - avg_scores = asyncio.run( - eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metrics) - ) + if args.rag_option == "cognee_incremental": + avg_scores = await incremental_eval_on_QA_dataset( + args.dataset, args.num_samples, args.metrics + ) + + else: + avg_scores = await eval_on_QA_dataset( + args.dataset, args.rag_option, args.num_samples, args.metrics + ) + logger.info(f"{avg_scores}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/evals/official_hotpot_metrics.py 
b/evals/official_hotpot_metrics.py index 70444f7be..c09ab2a9d 100644 --- a/evals/official_hotpot_metrics.py +++ b/evals/official_hotpot_metrics.py @@ -4,11 +4,8 @@ These are the official evaluation metrics for HotpotQA taken from https://hotpot import re import string -import sys from collections import Counter -import ujson as json - def normalize_answer(s): def remove_articles(text): diff --git a/evals/qa_context_provider_utils.py b/evals/qa_context_provider_utils.py index b4c5daa7f..0591d7c92 100644 --- a/evals/qa_context_provider_utils.py +++ b/evals/qa_context_provider_utils.py @@ -3,35 +3,49 @@ from cognee.api.v1.search import SearchType from cognee.infrastructure.databases.vector import get_vector_engine from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search from cognee.tasks.completion.graph_query_completion import retrieved_edges_to_string +from functools import partial +from cognee.api.v1.cognify.cognify_v2 import get_default_tasks async def get_raw_context(instance: dict) -> str: return instance["context"] -async def cognify_instance(instance: dict): +async def cognify_instance(instance: dict, task_indices: list[int] = None): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - for title, sentences in instance["context"]: await cognee.add("\n".join(sentences), dataset_name="QA") - await cognee.cognify("QA") + all_cognify_tasks = await get_default_tasks() + if task_indices: + selected_tasks = [all_cognify_tasks[ind] for ind in task_indices] + else: + selected_tasks = all_cognify_tasks + await cognee.cognify("QA", tasks=selected_tasks) -async def get_context_with_cognee(instance: dict) -> str: - await cognify_instance(instance) +async def get_context_with_cognee( + instance: dict, + task_indices: list[int] = None, + search_types: list[SearchType] = [SearchType.SUMMARIES, SearchType.CHUNKS], +) -> str: + await cognify_instance(instance, task_indices) - # TODO: Fix insights - # insights = await 
cognee.search(SearchType.INSIGHTS, query_text=instance["question"]) - summaries = await cognee.search(SearchType.SUMMARIES, query_text=instance["question"]) - # search_results = insights + summaries - search_results = summaries + search_results = [] + for search_type in search_types: + search_results += await cognee.search(search_type, query_text=instance["question"]) search_results_str = "\n".join([context_item["text"] for context_item in search_results]) return search_results_str +def create_cognee_context_getter( + task_indices=None, search_types=[SearchType.SUMMARIES, SearchType.CHUNKS] +): + return partial(get_context_with_cognee, task_indices=task_indices, search_types=search_types) + + async def get_context_with_simple_rag(instance: dict) -> str: await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -57,9 +71,19 @@ async def get_context_with_brute_force_triplet_search(instance: dict) -> str: return search_results_str +valid_pipeline_slices = { + "base": [0, 1, 5], + "extract_chunks": [0, 1, 2, 5], + "extract_graph": [0, 1, 2, 3, 5], + "summarize": [0, 1, 2, 3, 4, 5], +} + qa_context_providers = { "no_rag": get_raw_context, "cognee": get_context_with_cognee, "simple_rag": get_context_with_simple_rag, "brute_force": get_context_with_brute_force_triplet_search, +} | { + name: create_cognee_context_getter(task_indices=slice) + for name, slice in valid_pipeline_slices.items() } diff --git a/evals/qa_eval_parameters.json b/evals/qa_eval_parameters.json index 539e5f32c..6ae07089a 100644 --- a/evals/qa_eval_parameters.json +++ b/evals/qa_eval_parameters.json @@ -3,8 +3,9 @@ "hotpotqa" ], "rag_option": [ - "no_rag", + "cognee_incremental", "cognee", + "no_rag", "simple_rag", "brute_force" ], diff --git a/evals/qa_eval_utils.py b/evals/qa_eval_utils.py index 395711c6a..16817ef28 100644 --- a/evals/qa_eval_utils.py +++ b/evals/qa_eval_utils.py @@ -44,7 +44,7 @@ def save_results_as_image(results, out_path): for num_samples, table_data in 
num_samples_data.items(): df = pd.DataFrame.from_dict(table_data, orient="index") df.index.name = f"Dataset: {dataset}, Num Samples: {num_samples}" - image_path = Path(out_path) / Path(f"table_{dataset}_{num_samples}.png") + image_path = out_path / Path(f"table_{dataset}_{num_samples}.png") save_table_as_image(df, image_path)