Merge pull request #464 from topoteretes/cog-1069-update-notebooks-evals

COG-1069: update notebooks evals
commit bd4980c2e1
Hande 2025-01-27 08:49:24 +01:00, committed by GitHub
3 changed files with 1747 additions and 752 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -618,76 +618,339 @@
"cell_type": "markdown",
"id": "e519e30c0423c2a",
"metadata": {},
"source": "## Let's add evals"
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b22ae3d868fa5606",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-19T18:01:11.387716Z",
"start_time": "2024-12-19T18:01:11.278042Z"
}
},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'deepeval'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m eval_on_hotpotQA\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m answer_with_cognee\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m answer_without_cognee\n",
"File \u001b[0;32m~/cognee/evals/eval_on_hotpot.py:7\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mstatistics\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpathlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Path\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdeepeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwget\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdeepeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EvaluationDataset\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'deepeval'"
]
}
],
"source": [
"from evals.eval_on_hotpot import eval_on_hotpotQA\n",
"from evals.eval_on_hotpot import answer_with_cognee\n",
"from evals.eval_on_hotpot import answer_without_cognee\n",
"from evals.eval_on_hotpot import eval_answers\n",
"from cognee.base_config import get_base_config\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import wget\n",
"import json\n",
"import statistics"
"## Let's add evals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "728355d390e3a01b",
"id": "3845443e",
"metadata": {},
"outputs": [],
"source": [
"!pip install \"cognee[deepeval]\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a2c3c70",
"metadata": {},
"outputs": [],
"source": [
"from evals.eval_on_hotpot import deepeval_answers, answer_qa_instance\n",
"from evals.qa_dataset_utils import load_qa_dataset\n",
"from evals.qa_metrics_utils import get_metrics\n",
"from evals.qa_context_provider_utils import qa_context_providers\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import statistics\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53a609d8",
"metadata": {},
"outputs": [],
"source": [
"answer_provider = answer_with_cognee # For native LLM answers use answer_without_cognee\n",
"num_samples = 10 # With cognee, it takes ~1m10s per sample\n",
"dataset_name_or_filename = \"hotpotqa\"\n",
"dataset = load_qa_dataset(dataset_name_or_filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7351ab8f",
"metadata": {},
"outputs": [],
"source": [
"context_provider_name = \"cognee\"\n",
"context_provider = qa_context_providers[context_provider_name]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9346115b",
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)\n",
"instances = dataset if not num_samples else random.sample(dataset, num_samples)\n",
"\n",
"base_config = get_base_config()\n",
"data_root_dir = base_config.data_root_directory\n",
"out_path = \"out\" \n",
"if not Path(out_path).exists():\n",
" Path(out_path).mkdir()\n",
"contexts_filename = out_path / Path(\n",
" f\"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json\"\n",
" )\n",
"\n",
"if not Path(data_root_dir).exists():\n",
" Path(data_root_dir).mkdir()\n",
"\n",
"filepath = data_root_dir / Path(\"hotpot_dev_fullwiki_v1.json\")\n",
"if not filepath.exists():\n",
" url = \"http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json\"\n",
" wget.download(url, out=data_root_dir)\n",
"\n",
"with open(filepath, \"r\") as file:\n",
" dataset = json.load(file)\n",
"\n",
"instances = dataset if not num_samples else dataset[:num_samples]\n",
"answers = []\n",
"for instance in tqdm(instances, desc=\"Getting answers\"):\n",
" answer = answer_provider(instance)\n",
" answer = await answer_qa_instance(instance, context_provider, contexts_filename)\n",
" answers.append(answer)"
]
},
{
"cell_type": "markdown",
"id": "1e7d872d",
"metadata": {},
"source": [
"#### Define Metrics for Evaluation and Calculate Score\n",
"**Options**: \n",
"- **Correctness**: Is the actual output factually correct based on the expected output?\n",
"- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?\n",
"- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?\n",
"- **Empowerment**: How well does the answer help the reader understand and make informed judgements about the topic?\n",
"- **Directness**: How specifically and clearly does the answer address the question?\n",
"- **F1 Score**: the harmonic mean of the precision and recall, using word-level Exact Match\n",
"- **EM Score**: the rate at which the predicted strings exactly match their references, ignoring white spaces and capitalization."
]
},
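{
"cell_type": "markdown",
"id": "3f1e4d2a",
"metadata": {},
"source": [
"For intuition, the sketch below computes word-level F1 and EM on a toy prediction/reference pair. It is illustrative only: the scores reported in this notebook come from `get_metrics`, whose normalization rules may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b6c7d8e",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only; not the get_metrics implementation.\n",
"from collections import Counter\n",
"\n",
"def toy_em(prediction: str, reference: str) -> float:\n",
"    # Exact match after trimming whitespace and lowercasing\n",
"    return float(prediction.strip().lower() == reference.strip().lower())\n",
"\n",
"def toy_f1(prediction: str, reference: str) -> float:\n",
"    # Word-level F1: harmonic mean of precision and recall over shared tokens\n",
"    pred_tokens = prediction.lower().split()\n",
"    ref_tokens = reference.lower().split()\n",
"    num_same = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())\n",
"    if num_same == 0:\n",
"        return 0.0\n",
"    precision = num_same / len(pred_tokens)\n",
"    recall = num_same / len(ref_tokens)\n",
"    return 2 * precision * recall / (precision + recall)\n",
"\n",
"print(toy_em(\"Barack Obama\", \"barack obama\"))  # 1.0\n",
"print(toy_f1(\"the 44th president Barack Obama\", \"Barack Obama\"))  # ~0.571"
]
},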
{
"cell_type": "markdown",
"id": "c81e2b46",
"metadata": {},
"source": [
"##### Calculate `\"Correctness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae728344",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Correctness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "764aac6d",
"metadata": {},
"outputs": [],
"source": [
"Correctness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Correctness)"
]
},
{
"cell_type": "markdown",
"id": "6d3bbdc5",
"metadata": {},
"source": [
"##### Calculating `\"Comprehensiveness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9793ef78",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Comprehensiveness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9add448a",
"metadata": {},
"outputs": [],
"source": [
"Comprehensiveness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Comprehensiveness)"
]
},
{
"cell_type": "markdown",
"id": "bce2fa25",
"metadata": {},
"source": [
"##### Calculating `\"Diversity\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f60a179e",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Diversity\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ccbd0ab",
"metadata": {},
"outputs": [],
"source": [
"Diversity = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Diversity)"
]
},
{
"cell_type": "markdown",
"id": "191cab63",
"metadata": {},
"source": [
"##### Calculating`\"Empowerment\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66bec0bf",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Empowerment\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b043a8f",
"metadata": {},
"outputs": [],
"source": [
"Empowerment = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Empowerment)"
]
},
{
"cell_type": "markdown",
"id": "2cac3be9",
"metadata": {},
"source": [
"##### Calculating `\"Directness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "adaa17c0",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Directness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a8f97c9",
"metadata": {},
"outputs": [],
"source": [
"Directness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Directness)"
]
},
{
"cell_type": "markdown",
"id": "1ad6feb8",
"metadata": {},
"source": [
"##### Calculating `\"F1\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdc48259",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"F1\"]\n",
"eval_metrics = get_metrics(metric_name_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c43c17c8",
"metadata": {},
"outputs": [],
"source": [
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8bfcc46d",
"metadata": {},
"outputs": [],
"source": [
"F1_score = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(F1_score)"
]
},
{
"cell_type": "markdown",
"id": "2583f948",
"metadata": {},
"source": [
"##### Calculating `\"EM\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90a8f630",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"EM\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d1b1ea1",
"metadata": {},
"outputs": [],
"source": [
"EM = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(EM)"
]
},
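{
"cell_type": "markdown",
"id": "6a7b8c9d",
"metadata": {},
"source": [
"The per-metric cells above all follow the same pattern. As a sketch, the same interfaces can be driven in a single loop; this assumes `get_metrics` and `deepeval_answers` behave exactly as in the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b8c9d0e",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: score all metrics in one pass, reusing the interfaces shown above.\n",
"all_scores = {}\n",
"for name in [\n",
"    \"Correctness\", \"Comprehensiveness\", \"Diversity\",\n",
"    \"Empowerment\", \"Directness\", \"F1\", \"EM\",\n",
"]:\n",
"    eval_metrics = get_metrics([name])\n",
"    eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"])\n",
"    all_scores[name] = statistics.mean(\n",
"        [result.metrics_data[0].score for result in eval_results.test_results]\n",
"    )\n",
"print(all_scores)"
]
},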
{
"cell_type": "markdown",
"id": "288ab570",
@@ -700,7 +963,7 @@
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "cognee-c83GrcRT-py3.11",
"language": "python",
"name": "python3"
},
@@ -714,7 +977,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.10"
}
},
"nbformat": 4,