Merge pull request #464 from topoteretes/cog-1069-update-notebooks-evals

COG-1069: update notebooks evals
commit bd4980c2e1
Hande 2025-01-27 08:49:24 +01:00, committed by GitHub
3 changed files with 1747 additions and 752 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -618,76 +618,339 @@
"cell_type": "markdown",
"id": "e519e30c0423c2a",
"metadata": {},
"source": "## Let's add evals"
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b22ae3d868fa5606",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-19T18:01:11.387716Z",
"start_time": "2024-12-19T18:01:11.278042Z"
}
},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'deepeval'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m eval_on_hotpotQA\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m answer_with_cognee\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mevals\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_on_hotpot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m answer_without_cognee\n",
"File \u001b[0;32m~/cognee/evals/eval_on_hotpot.py:7\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mstatistics\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpathlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Path\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdeepeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwget\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdeepeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EvaluationDataset\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'deepeval'"
]
}
],
"source": [
"from evals.eval_on_hotpot import eval_on_hotpotQA\n",
"from evals.eval_on_hotpot import answer_with_cognee\n",
"from evals.eval_on_hotpot import answer_without_cognee\n",
"from evals.eval_on_hotpot import eval_answers\n",
"from cognee.base_config import get_base_config\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import wget\n",
"import json\n",
"import statistics"
"## Let's add evals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "728355d390e3a01b",
"id": "3845443e",
"metadata": {},
"outputs": [],
"source": [
"!pip install \"cognee[deepeval]\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a2c3c70",
"metadata": {},
"outputs": [],
"source": [
"from evals.eval_on_hotpot import deepeval_answers, answer_qa_instance\n",
"from evals.qa_dataset_utils import load_qa_dataset\n",
"from evals.qa_metrics_utils import get_metrics\n",
"from evals.qa_context_provider_utils import qa_context_providers\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import statistics\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53a609d8",
"metadata": {},
"outputs": [],
"source": [
"answer_provider = answer_with_cognee # For native LLM answers use answer_without_cognee\n",
"num_samples = 10 # With cognee, it takes ~1m10s per sample\n",
"dataset_name_or_filename = \"hotpotqa\"\n",
"dataset = load_qa_dataset(dataset_name_or_filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7351ab8f",
"metadata": {},
"outputs": [],
"source": [
"context_provider_name = \"cognee\"\n",
"context_provider = qa_context_providers[context_provider_name]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9346115b",
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)\n",
"instances = dataset if not num_samples else random.sample(dataset, num_samples)\n",
"\n",
"base_config = get_base_config()\n",
"data_root_dir = base_config.data_root_directory\n",
"out_path = \"out\" \n",
"if not Path(out_path).exists():\n",
" Path(out_path).mkdir()\n",
"contexts_filename = out_path / Path(\n",
" f\"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json\"\n",
" )\n",
"\n",
"if not Path(data_root_dir).exists():\n",
" Path(data_root_dir).mkdir()\n",
"\n",
"filepath = data_root_dir / Path(\"hotpot_dev_fullwiki_v1.json\")\n",
"if not filepath.exists():\n",
" url = \"http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json\"\n",
" wget.download(url, out=data_root_dir)\n",
"\n",
"with open(filepath, \"r\") as file:\n",
" dataset = json.load(file)\n",
"\n",
"instances = dataset if not num_samples else dataset[:num_samples]\n",
"answers = []\n",
"for instance in tqdm(instances, desc=\"Getting answers\"):\n",
" answer = answer_provider(instance)\n",
" answer = await answer_qa_instance(instance, context_provider, contexts_filename)\n",
" answers.append(answer)"
]
},
{
"cell_type": "markdown",
"id": "1e7d872d",
"metadata": {},
"source": [
"#### Define Metrics for Evaluation and Calculate Score\n",
"**Options**: \n",
"- **Correctness**: Is the actual output factually correct based on the expected output?\n",
"- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?\n",
"- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?\n",
"- **Empowerment**: How well does the answer help the reader understand and make informed judgements about the topic?\n",
"- **Directness**: How specifically and clearly does the answer address the question?\n",
"- **F1 Score**: the harmonic mean of the precision and recall, using word-level Exact Match\n",
"- **EM Score**: the rate at which the predicted strings exactly match their references, ignoring white spaces and capitalization."
]
},
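{
"cell_type": "markdown",
"id": "3f1e4d2a",
"metadata": {},
"source": [
"For intuition, the sketch below computes word-level F1 and EM on a toy prediction/reference pair. It is illustrative only: the scores reported in this notebook come from `get_metrics`, whose normalization rules may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b6c7d8e",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only; not the get_metrics implementation.\n",
"from collections import Counter\n",
"\n",
"def toy_em(prediction: str, reference: str) -> float:\n",
"    # Exact match after trimming whitespace and lowercasing\n",
"    return float(prediction.strip().lower() == reference.strip().lower())\n",
"\n",
"def toy_f1(prediction: str, reference: str) -> float:\n",
"    # Word-level F1: harmonic mean of precision and recall over shared tokens\n",
"    pred_tokens = prediction.lower().split()\n",
"    ref_tokens = reference.lower().split()\n",
"    num_same = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())\n",
"    if num_same == 0:\n",
"        return 0.0\n",
"    precision = num_same / len(pred_tokens)\n",
"    recall = num_same / len(ref_tokens)\n",
"    return 2 * precision * recall / (precision + recall)\n",
"\n",
"print(toy_em(\"Barack Obama\", \"barack obama\"))  # 1.0\n",
"print(toy_f1(\"the 44th president Barack Obama\", \"Barack Obama\"))  # ~0.571"
]
},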
{
"cell_type": "markdown",
"id": "c81e2b46",
"metadata": {},
"source": [
"##### Calculate `\"Correctness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae728344",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Correctness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "764aac6d",
"metadata": {},
"outputs": [],
"source": [
"Correctness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Correctness)"
]
},
{
"cell_type": "markdown",
"id": "6d3bbdc5",
"metadata": {},
"source": [
"##### Calculating `\"Comprehensiveness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9793ef78",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Comprehensiveness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9add448a",
"metadata": {},
"outputs": [],
"source": [
"Comprehensiveness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Comprehensiveness)"
]
},
{
"cell_type": "markdown",
"id": "bce2fa25",
"metadata": {},
"source": [
"##### Calculating `\"Diversity\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f60a179e",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Diversity\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ccbd0ab",
"metadata": {},
"outputs": [],
"source": [
"Diversity = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Diversity)"
]
},
{
"cell_type": "markdown",
"id": "191cab63",
"metadata": {},
"source": [
"##### Calculating`\"Empowerment\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66bec0bf",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Empowerment\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b043a8f",
"metadata": {},
"outputs": [],
"source": [
"Empowerment = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Empowerment)"
]
},
{
"cell_type": "markdown",
"id": "2cac3be9",
"metadata": {},
"source": [
"##### Calculating `\"Directness\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "adaa17c0",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"Directness\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a8f97c9",
"metadata": {},
"outputs": [],
"source": [
"Directness = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(Directness)"
]
},
{
"cell_type": "markdown",
"id": "1ad6feb8",
"metadata": {},
"source": [
"##### Calculating `\"F1\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdc48259",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"F1\"]\n",
"eval_metrics = get_metrics(metric_name_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c43c17c8",
"metadata": {},
"outputs": [],
"source": [
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8bfcc46d",
"metadata": {},
"outputs": [],
"source": [
"F1_score = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(F1_score)"
]
},
{
"cell_type": "markdown",
"id": "2583f948",
"metadata": {},
"source": [
"##### Calculating `\"EM\"`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90a8f630",
"metadata": {},
"outputs": [],
"source": [
"metric_name_list = [\"EM\"]\n",
"eval_metrics = get_metrics(metric_name_list)\n",
"eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d1b1ea1",
"metadata": {},
"outputs": [],
"source": [
"EM = statistics.mean(\n",
" [result.metrics_data[0].score for result in eval_results.test_results]\n",
")\n",
"print(EM)"
]
},
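{
"cell_type": "markdown",
"id": "6a7b8c9d",
"metadata": {},
"source": [
"The per-metric cells above all follow the same pattern. As a sketch, the same interfaces can be driven in a single loop; this assumes `get_metrics` and `deepeval_answers` behave exactly as in the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b8c9d0e",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: score all metrics in one pass, reusing the interfaces shown above.\n",
"all_scores = {}\n",
"for name in [\n",
"    \"Correctness\", \"Comprehensiveness\", \"Diversity\",\n",
"    \"Empowerment\", \"Directness\", \"F1\", \"EM\",\n",
"]:\n",
"    eval_metrics = get_metrics([name])\n",
"    eval_results = await deepeval_answers(instances, answers, eval_metrics[\"deepeval_metrics\"])\n",
"    all_scores[name] = statistics.mean(\n",
"        [result.metrics_data[0].score for result in eval_results.test_results]\n",
"    )\n",
"print(all_scores)"
]
},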
{
"cell_type": "markdown",
"id": "288ab570",
@@ -700,7 +963,7 @@
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "cognee-c83GrcRT-py3.11",
"language": "python",
"name": "python3"
},
@@ -714,7 +977,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.10"
}
},
"nbformat": 4,