{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "i9i9uUZ3pWQE", "outputId": "84404bb8-5841-4f2f-dd87-7f909c6e95aa" }, "outputs": [], "source": [ "######## Installations - BE SURE TO MAKE YOUR OWN LOCAL VENV FIRST\n", "\n", "%pip install gdown pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KgzveXyAp35v" }, "outputs": [], "source": [ "######## Imports\n", "\n", "import csv\n", "import json\n", "import os\n", "import tarfile\n", "from datetime import datetime\n", "\n", "import gdown\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iwcBxgxgqES8", "outputId": "4af8de17-57bd-4857-f49c-2ee5d39ed248" }, "outputs": [], "source": [ "######## Download the eval dataset from the official Google Drive source\n", "\n", "file_id = '1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80'\n", "url = f'https://drive.google.com/uc?id={file_id}'\n", "folder_path = './longmemeval_data'\n", "file_path = os.path.join(folder_path, 'longmemeval_data.tar.gz')\n", "\n", "# If it doesn't exist, create a \"/.longmemeval_data/\" directory\n", "if not os.path.exists(folder_path):\n", " os.makedirs(folder_path)\n", "\n", "# Download the compressed dataset\n", "if not os.path.exists(file_path):\n", " gdown.download(url, file_path, quiet=False)\n", "else:\n", " print(f\"'{file_path}' already exists, skipping download.\")\n", "\n", "# Extract the tar.gz\n", "if not os.path.exists(os.path.join(folder_path, 'longmemeval_oracle.json')):\n", " with tarfile.open(file_path, 'r:gz') as tar:\n", " tar.extractall(path=folder_path)\n", "else:\n", " print(\"'longmemeval_oracle.json' already exists, so skipping extraction.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_RjEZnk5v530" }, "outputs": [], "source": [ "######## Load the eval dataset\n", "\n", "lme_dataset_option = os.path.join(\n", " folder_path, 'longmemeval_oracle.json'\n", ") # Can be _oracle, _s, or _m\n", "lme_dataset_df = pd.read_json(lme_dataset_option)\n", "lme_dataset_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "######## Method to save all of the snippets (or only firsts/lasts) of the specified multi-sessions to a CSV file\n", "\n", "\n", "def snippetize_lme_dataset(lme_filename, max_num_previous_messages=5):\n", " \"\"\"\n", " Creates a csv where each row is a \"snippet\" from longmemeval. A snippet is a message and set of previous messages.\n", " \"\"\"\n", "\n", " lme_dataset_option = os.path.join(folder_path, lme_filename)\n", " lme_dataset_df = pd.read_json(lme_dataset_option)\n", "\n", " all_snippets = []\n", " for index, row in lme_dataset_df.iterrows():\n", " question_id = row['question_id']\n", "\n", " # Extract the haystack_sessions and dates\n", " sessions = row['haystack_sessions']\n", " session_dates = row['haystack_dates']\n", "\n", " # Combine into list of dictionaries\n", " sessions_data = [\n", " {'session': session, 'date': datetime.strptime(date, '%Y/%m/%d (%a) %H:%M')}\n", " for session, date in zip(sessions, session_dates)\n", " ]\n", "\n", " # Sort by date from earliest to latest\n", " sessions_data.sort(key=lambda x: x['date'])\n", "\n", " all_snippets_this_session = []\n", "\n", " message_index_across_sessions = 0\n", " for session_index, session_and_date in enumerate(sessions_data):\n", " for message_index_within_session, message in enumerate(session_and_date['session']):\n", " num_previous_messages = min(\n", " max_num_previous_messages, message_index_across_sessions\n", " )\n", " previous_snippets = all_snippets_this_session[\n", " message_index_across_sessions - num_previous_messages :\n", " ]\n", " previous_messages_only = [\n", " {\n", " 'role': previous_snippet['message']['role'],\n", " 'content': previous_snippet['message']['content'],\n", " }\n", " for previous_snippet in previous_snippets\n", " ]\n", "\n", " snippet = {\n", " 'question_id': question_id,\n", " 'question_type': row['question_type'],\n", " 'multisession_index': index,\n", " 'session_index': session_index,\n", " 'message_index_within_session': message_index_within_session,\n", " 'message_index_across_sessions': message_index_across_sessions,\n", " 'session_date': session_and_date['date'],\n", " 'message': message,\n", " 'previous_messages': previous_messages_only,\n", " 'num_previous_messages': num_previous_messages,\n", " }\n", "\n", " if lme_filename == 'longmemeval_oracle.json':\n", " snippet['message_has_answer'] = message['has_answer']\n", "\n", " all_snippets_this_session.append(snippet)\n", " message_index_across_sessions += 1\n", "\n", " all_snippets.extend(all_snippets_this_session)\n", "\n", " snippetized_folder = os.path.join(folder_path, 'snippetized_data')\n", " if not os.path.exists(snippetized_folder):\n", " os.makedirs(snippetized_folder)\n", "\n", " filename = lme_filename.replace('.json', '_snippetized.csv')\n", " filepath = os.path.join(snippetized_folder, filename)\n", "\n", " with open(filepath, 'w', newline='') as csvfile:\n", " writer = csv.DictWriter(csvfile, fieldnames=all_snippets[0].keys())\n", " writer.writeheader()\n", " for snippet in all_snippets:\n", " snippet['message'] = json.dumps(snippet['message'])\n", " snippet['previous_messages'] = json.dumps(snippet['previous_messages'])\n", " writer.writerow(snippet)\n", "\n", "\n", "def snippetize_and_check(lme_filename):\n", " folder_path = './longmemeval_data/snippetized_data'\n", " file_path = os.path.join(folder_path, lme_filename.replace('.json', '_snippetized.csv'))\n", " if not os.path.exists(file_path):\n", " print(f'Snippetizing {lme_filename}...')\n", " snippetize_lme_dataset(lme_filename)\n", " else:\n", " print(f'Skipping snippetization for {lme_filename} because it already exists.')\n", "\n", " # Check first few rows of the csv\n", " df = pd.read_csv(file_path)\n", " display(df.head(10))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2025-03-20T17:17:33.985547Z", "start_time": "2025-03-20T17:17:33.569496Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Skipping snippetization for longmemeval_oracle.json because it already exists.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
question_idquestion_typemultisession_indexsession_indexmessage_index_within_sessionmessage_index_across_sessionssession_datemessageprevious_messagesnum_previous_messagesmessage_has_answer
0gpt4_2655b836temporal-reasoning00002023-04-10 14:47:00{\"role\": \"user\", \"content\": \"I'm thinking of g...[]0False
1gpt4_2655b836temporal-reasoning00112023-04-10 14:47:00{\"role\": \"assistant\", \"content\": \"Choosing the...[{\"role\": \"user\", \"content\": \"I'm thinking of ...1False
2gpt4_2655b836temporal-reasoning00222023-04-10 14:47:00{\"role\": \"user\", \"content\": \"I've been doing s...[{\"role\": \"user\", \"content\": \"I'm thinking of ...2True
3gpt4_2655b836temporal-reasoning00332023-04-10 14:47:00{\"role\": \"assistant\", \"content\": \"That's great...[{\"role\": \"user\", \"content\": \"I'm thinking of ...3False
4gpt4_2655b836temporal-reasoning00442023-04-10 14:47:00{\"role\": \"user\", \"content\": \"I'll definitely a...[{\"role\": \"user\", \"content\": \"I'm thinking of ...4False
5gpt4_2655b836temporal-reasoning00552023-04-10 14:47:00{\"role\": \"assistant\", \"content\": \"Advanced pai...[{\"role\": \"user\", \"content\": \"I'm thinking of ...5False
6gpt4_2655b836temporal-reasoning00662023-04-10 14:47:00{\"role\": \"user\", \"content\": \"I'll definitely a...[{\"role\": \"assistant\", \"content\": \"Choosing th...5False
7gpt4_2655b836temporal-reasoning00772023-04-10 14:47:00{\"role\": \"assistant\", \"content\": \"Congratulati...[{\"role\": \"user\", \"content\": \"I've been doing ...5False
8gpt4_2655b836temporal-reasoning00882023-04-10 14:47:00{\"role\": \"user\", \"content\": \"That's really hel...[{\"role\": \"assistant\", \"content\": \"That's grea...5False
9gpt4_2655b836temporal-reasoning00992023-04-10 14:47:00{\"role\": \"assistant\", \"content\": \"Congratulati...[{\"role\": \"user\", \"content\": \"I'll definitely ...5False
\n", "
" ], "text/plain": [ " question_id question_type multisession_index session_index \\\n", "0 gpt4_2655b836 temporal-reasoning 0 0 \n", "1 gpt4_2655b836 temporal-reasoning 0 0 \n", "2 gpt4_2655b836 temporal-reasoning 0 0 \n", "3 gpt4_2655b836 temporal-reasoning 0 0 \n", "4 gpt4_2655b836 temporal-reasoning 0 0 \n", "5 gpt4_2655b836 temporal-reasoning 0 0 \n", "6 gpt4_2655b836 temporal-reasoning 0 0 \n", "7 gpt4_2655b836 temporal-reasoning 0 0 \n", "8 gpt4_2655b836 temporal-reasoning 0 0 \n", "9 gpt4_2655b836 temporal-reasoning 0 0 \n", "\n", " message_index_within_session message_index_across_sessions \\\n", "0 0 0 \n", "1 1 1 \n", "2 2 2 \n", "3 3 3 \n", "4 4 4 \n", "5 5 5 \n", "6 6 6 \n", "7 7 7 \n", "8 8 8 \n", "9 9 9 \n", "\n", " session_date message \\\n", "0 2023-04-10 14:47:00 {\"role\": \"user\", \"content\": \"I'm thinking of g... \n", "1 2023-04-10 14:47:00 {\"role\": \"assistant\", \"content\": \"Choosing the... \n", "2 2023-04-10 14:47:00 {\"role\": \"user\", \"content\": \"I've been doing s... \n", "3 2023-04-10 14:47:00 {\"role\": \"assistant\", \"content\": \"That's great... \n", "4 2023-04-10 14:47:00 {\"role\": \"user\", \"content\": \"I'll definitely a... \n", "5 2023-04-10 14:47:00 {\"role\": \"assistant\", \"content\": \"Advanced pai... \n", "6 2023-04-10 14:47:00 {\"role\": \"user\", \"content\": \"I'll definitely a... \n", "7 2023-04-10 14:47:00 {\"role\": \"assistant\", \"content\": \"Congratulati... \n", "8 2023-04-10 14:47:00 {\"role\": \"user\", \"content\": \"That's really hel... \n", "9 2023-04-10 14:47:00 {\"role\": \"assistant\", \"content\": \"Congratulati... \n", "\n", " previous_messages num_previous_messages \\\n", "0 [] 0 \n", "1 [{\"role\": \"user\", \"content\": \"I'm thinking of ... 1 \n", "2 [{\"role\": \"user\", \"content\": \"I'm thinking of ... 2 \n", "3 [{\"role\": \"user\", \"content\": \"I'm thinking of ... 3 \n", "4 [{\"role\": \"user\", \"content\": \"I'm thinking of ... 4 \n", "5 [{\"role\": \"user\", \"content\": \"I'm thinking of ... 5 \n", "6 [{\"role\": \"assistant\", \"content\": \"Choosing th... 5 \n", "7 [{\"role\": \"user\", \"content\": \"I've been doing ... 5 \n", "8 [{\"role\": \"assistant\", \"content\": \"That's grea... 5 \n", "9 [{\"role\": \"user\", \"content\": \"I'll definitely ... 5 \n", "\n", " message_has_answer \n", "0 False \n", "1 False \n", "2 True \n", "3 False \n", "4 False \n", "5 False \n", "6 False \n", "7 False \n", "8 False \n", "9 False " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "lme_filename = 'longmemeval_oracle.json'\n", "snippetize_and_check(lme_filename)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lme_filename = 'longmemeval_s.json'\n", "snippetize_and_check(lme_filename)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lme_filename = 'longmemeval_m.json'\n", "snippetize_and_check(lme_filename)" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.0" } }, "nbformat": 4, "nbformat_minor": 0 }