update notebooks with latest eval
parent b2f7f733d9
commit 343de01d5a
3 changed files with 1745 additions and 752 deletions
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -618,76 +618,339 @@
"cell_type": "markdown",
|
||||
"id": "e519e30c0423c2a",
|
||||
"metadata": {},
|
||||
"source": "## Let's add evals"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "b22ae3d868fa5606",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-12-19T18:01:11.387716Z",
|
||||
"start_time": "2024-12-19T18:01:11.278042Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'deepeval'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m eval_on_hotpotQA\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m answer_with_cognee\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m answer_without_cognee\n",
"File \u001b[0;32m~/cognee/evals/eval_on_hotpot.py:7\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mstatistics\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpathlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Path\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdeepeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwget\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdeepeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EvaluationDataset\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'deepeval'"
]
}
],
"source": [
"from evals.eval_on_hotpot import eval_on_hotpotQA\n",
"from evals.eval_on_hotpot import answer_with_cognee\n",
"from evals.eval_on_hotpot import answer_without_cognee\n",
"from evals.eval_on_hotpot import eval_answers\n",
"from cognee.base_config import get_base_config\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import wget\n",
"import json\n",
"import statistics"
"## Let's add evals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "728355d390e3a01b",
"id": "3845443e",
"metadata": {},
"outputs": [],
"source": [
"!pip install \"cognee[deepeval]\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a2c3c70",
"metadata": {},
"outputs": [],
"source": [
"from evals.eval_on_hotpot import deepeval_answers, answer_qa_instance\n",
"from evals.qa_dataset_utils import load_qa_dataset\n",
"from evals.qa_metrics_utils import get_metrics\n",
"from evals.qa_context_provider_utils import qa_context_providers\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import statistics\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53a609d8",
"metadata": {},
"outputs": [],
"source": [
"answer_provider = answer_with_cognee # For native LLM answers use answer_without_cognee\n",
"num_samples = 10 # With cognee, it takes ~1m10s per sample\n",
"dataset_name_or_filename = \"hotpotqa\"\n",
"dataset = load_qa_dataset(dataset_name_or_filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7351ab8f",
"metadata": {},
"outputs": [],
"source": [
"context_provider_name = \"cognee\"\n",
"context_provider = qa_context_providers[context_provider_name]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9346115b",
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)\n",
"instances = dataset if not num_samples else random.sample(dataset, num_samples)\n",
"\n",
"base_config = get_base_config()\n",
"data_root_dir = base_config.data_root_directory\n",
"out_path = \"out\" \n",
"if not Path(out_path).exists():\n",
" Path(out_path).mkdir()\n",
"contexts_filename = out_path / Path(\n",
" f\"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json\"\n",
" )\n",
"\n",
"if not Path(data_root_dir).exists():\n",
" Path(data_root_dir).mkdir()\n",
"\n",
"filepath = data_root_dir / Path(\"hotpot_dev_fullwiki_v1.json\")\n",
"if not filepath.exists():\n",
" url = \"http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json\"\n",
" wget.download(url, out=data_root_dir)\n",
"\n",
"with open(filepath, \"r\") as file:\n",
" dataset = json.load(file)\n",
"\n",
"instances = dataset if not num_samples else dataset[:num_samples]\n",
"answers = []\n",
"for instance in tqdm(instances, desc=\"Getting answers\"):\n",
" answer = answer_provider(instance)\n",
" answer = await answer_qa_instance(instance, context_provider, contexts_filename)\n",
" answers.append(answer)"
]
},
{
"cell_type": "markdown",
"id": "1e7d872d",
"metadata": {},
"source": [
"#### Define Metrics for Evaluation and Calculate Score\n",
"**Options**: \n",
"- **Correctness**: Is the actual output factually correct based on the expected output?\n",
"- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?\n",
"- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?\n",
"- **Empowerment**: How well does the answer help the reader understand and make informed judgements about the topic?\n",
"- **Directness**: How specifically and clearly does the answer address the question?\n",
"- **F1 Score**: the harmonic mean of the precision and recall, using word-level Exact Match\n",
"- **EM Score**: the rate at which the predicted strings exactly match their references, ignoring white spaces and capitalization."
]
},
{
"cell_type": "markdown",
"id": "c81e2b46",
"metadata": {},
"source": [
"##### Calculate `\"Correctness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae728344",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Correctness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "764aac6d",
"metadata": {},
"outputs": [],
"source": [
"Correctness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Correctness)"
]
},
{
"cell_type": "markdown",
"id": "6d3bbdc5",
"metadata": {},
"source": [
"##### Calculating `\"Comprehensiveness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9793ef78",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Comprehensiveness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9add448a",
"metadata": {},
"outputs": [],
"source": [
"Comprehensiveness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Comprehensiveness)"
]
},
{
"cell_type": "markdown",
"id": "bce2fa25",
"metadata": {},
"source": [
"##### Calculating `\"Diversity\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f60a179e",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Diversity\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ccbd0ab",
"metadata": {},
"outputs": [],
"source": [
"Diversity = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Diversity)"
]
},
{
"cell_type": "markdown",
"id": "191cab63",
"metadata": {},
"source": [
"##### Calculating`\"Empowerment\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66bec0bf",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Empowerment\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b043a8f",
"metadata": {},
"outputs": [],
"source": [
"Empowerment = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Empowerment)"
]
},
{
"cell_type": "markdown",
"id": "2cac3be9",
"metadata": {},
"source": [
"##### Calculating `\"Directness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "adaa17c0",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Directness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a8f97c9",
"metadata": {},
"outputs": [],
"source": [
"Directness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Directness)"
]
},
{
"cell_type": "markdown",
"id": "1ad6feb8",
"metadata": {},
"source": [
"##### Calculating `\"F1\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdc48259",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"F1\"]\n",
"eval_metrics = get_metrics(metric_name_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c43c17c8",
"metadata": {},
"outputs": [],
"source": [
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8bfcc46d",
"metadata": {},
"outputs": [],
"source": [
"F1_score = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(F1_score)"
]
},
{
"cell_type": "markdown",
"id": "2583f948",
"metadata": {},
"source": [
"##### Calculating `\"EM\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90a8f630",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"EM\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d1b1ea1",
"metadata": {},
"outputs": [],
"source": [
"EM = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(EM)"
]
},
{
"cell_type": "markdown",
"id": "288ab570",
@@ -700,7 +963,7 @@
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "cognee-c83GrcRT-py3.11",
"language": "python",
"name": "python3"
},
@@ -714,7 +977,7 @@
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.8"
|
||||
"version": "3.11.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
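
Of the metric options listed in the metrics cell above, F1 and EM are the only two that are not LLM-judged, so they can be illustrated directly. The helper below is a self-contained toy version of word-level F1 and exact match as those two bullets describe them; it is not the implementation behind get_metrics in evals.qa_metrics_utils, whose tokenization and normalization may differ.

# Toy illustration of the word-level F1 and EM described in the metrics cell.
# Not the evals.qa_metrics_utils implementation; normalization details are assumed.
from collections import Counter


def word_f1(predicted: str, expected: str) -> float:
    pred_tokens = predicted.lower().split()
    gold_tokens = expected.lower().split()
    if not pred_tokens or not gold_tokens:
        return float(pred_tokens == gold_tokens)
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)


def exact_match(predicted: str, expected: str) -> float:
    # Ignore whitespace and capitalization, as the EM bullet describes.
    return float("".join(predicted.split()).lower() == "".join(expected.split()).lower())


print(word_f1("Paris France", "Paris"))      # partial word overlap -> ~0.667
print(exact_match("New  York", "new york"))  # 1.0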