{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "i9i9uUZ3pWQE", "outputId": "84404bb8-5841-4f2f-dd87-7f909c6e95aa" }, "outputs": [], "source": [ "######## Installations - BE SURE TO MAKE YOUR OWN LOCAL VENV FIRST\n", "\n", "%pip install gdown pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KgzveXyAp35v" }, "outputs": [], "source": [ "######## Imports\n", "\n", "import csv\n", "import json\n", "import os\n", "import tarfile\n", "from datetime import datetime\n", "\n", "import gdown\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iwcBxgxgqES8", "outputId": "4af8de17-57bd-4857-f49c-2ee5d39ed248" }, "outputs": [], "source": [ "######## Download the eval dataset from the official Google Drive source\n", "\n", "file_id = '1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80'\n", "url = f'https://drive.google.com/uc?id={file_id}'\n", "folder_path = './longmemeval_data'\n", "file_path = os.path.join(folder_path, 'longmemeval_data.tar.gz')\n", "\n", "# If it doesn't exist, create a \"/.longmemeval_data/\" directory\n", "if not os.path.exists(folder_path):\n", " os.makedirs(folder_path)\n", "\n", "# Download the compressed dataset\n", "if not os.path.exists(file_path):\n", " gdown.download(url, file_path, quiet=False)\n", "else:\n", " print(f\"'{file_path}' already exists, skipping download.\")\n", "\n", "# Extract the tar.gz\n", "if not os.path.exists(os.path.join(folder_path, 'longmemeval_oracle.json')):\n", " with tarfile.open(file_path, 'r:gz') as tar:\n", " tar.extractall(path=folder_path)\n", "else:\n", " print(\"'longmemeval_oracle.json' already exists, so skipping extraction.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_RjEZnk5v530" }, "outputs": [], "source": [ "######## Load the eval dataset\n", "\n", "lme_dataset_option = 
######## Methods to save all of the snippets of the specified multi-sessions to a CSV file


def _snippetized_csv_path(lme_filename):
    """Return the path of the snippet CSV for `lme_filename`, located under `folder_path`."""
    csv_name = lme_filename.replace('.json', '_snippetized.csv')
    return os.path.join(folder_path, 'snippetized_data', csv_name)


def snippetize_lme_dataset(lme_filename, max_num_previous_messages=5):
    """
    Creates a csv where each row is a "snippet" from longmemeval. A snippet is a message and set of previous messages.

    Args:
        lme_filename: Name of the dataset JSON file inside `folder_path`.
        max_num_previous_messages: Upper bound on how many preceding messages
            (counted across the sessions of the same question) are attached to
            each snippet.

    The CSV is written to `<folder_path>/snippetized_data/`, with `.json` in the
    file name replaced by `_snippetized.csv`. The `message` and
    `previous_messages` columns are stored as JSON strings. The
    `message_has_answer` column is only present for 'longmemeval_oracle.json'.
    A dataset that yields no messages produces a header-only CSV.
    """
    is_oracle = lme_filename == 'longmemeval_oracle.json'
    lme_dataset_df = pd.read_json(os.path.join(folder_path, lme_filename))

    all_snippets = []
    for index, row in lme_dataset_df.iterrows():
        # Pair every session with its parsed timestamp, earliest session first
        sessions_data = sorted(
            (
                {'session': session, 'date': datetime.strptime(date, '%Y/%m/%d (%a) %H:%M')}
                for session, date in zip(row['haystack_sessions'], row['haystack_dates'])
            ),
            key=lambda x: x['date'],
        )

        # Snippets of this question only, spanning all of its sessions
        question_snippets = []
        for session_index, session_and_date in enumerate(sessions_data):
            for message_index_within_session, message in enumerate(session_and_date['session']):
                # One snippet is appended per message, so the list length is the running index
                message_index_across_sessions = len(question_snippets)
                num_previous_messages = min(
                    max_num_previous_messages, message_index_across_sessions
                )
                window_start = message_index_across_sessions - num_previous_messages
                previous_messages_only = [
                    {
                        'role': previous_snippet['message']['role'],
                        'content': previous_snippet['message']['content'],
                    }
                    for previous_snippet in question_snippets[window_start:]
                ]

                snippet = {
                    'question_id': row['question_id'],
                    'question_type': row['question_type'],
                    'multisession_index': index,
                    'session_index': session_index,
                    'message_index_within_session': message_index_within_session,
                    'message_index_across_sessions': message_index_across_sessions,
                    'session_date': session_and_date['date'],
                    'message': message,
                    'previous_messages': previous_messages_only,
                    'num_previous_messages': num_previous_messages,
                }
                if is_oracle:
                    snippet['message_has_answer'] = message['has_answer']

                question_snippets.append(snippet)

        all_snippets.extend(question_snippets)

    # Fixed column order, so the header does not depend on a first snippet existing
    fieldnames = [
        'question_id',
        'question_type',
        'multisession_index',
        'session_index',
        'message_index_within_session',
        'message_index_across_sessions',
        'session_date',
        'message',
        'previous_messages',
        'num_previous_messages',
    ]
    if is_oracle:
        fieldnames.append('message_has_answer')

    filepath = _snippetized_csv_path(lme_filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # Write to a sibling file and rename at the end: snippetize_and_check treats
    # an existing CSV as complete, so a half-written file must never sit at `filepath`.
    partial_path = filepath + '.part'
    with open(partial_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for snippet in all_snippets:
            # Serialize the nested fields on a copy instead of mutating the snippet
            writer.writerow(
                {
                    **snippet,
                    'message': json.dumps(snippet['message']),
                    'previous_messages': json.dumps(snippet['previous_messages']),
                }
            )
    os.replace(partial_path, filepath)


def snippetize_and_check(lme_filename):
    """
    Snippetizes `lme_filename` unless its snippet CSV already exists, then
    displays the first 10 rows of that CSV.
    """
    # Same path the writer uses, so the cache check cannot drift from it
    file_path = _snippetized_csv_path(lme_filename)
    if not os.path.exists(file_path):
        print(f'Snippetizing {lme_filename}...')
        snippetize_lme_dataset(lme_filename)
    else:
        print(f'Skipping snippetization for {lme_filename} because it already exists.')

    # Check first few rows of the csv
    df = pd.read_csv(file_path)
    display(df.head(10))
| \n", " | question_id | \n", "question_type | \n", "multisession_index | \n", "session_index | \n", "message_index_within_session | \n", "message_index_across_sessions | \n", "session_date | \n", "message | \n", "previous_messages | \n", "num_previous_messages | \n", "message_has_answer | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"user\", \"content\": \"I'm thinking of g... | \n", "[] | \n", "0 | \n", "False | \n", "
| 1 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"assistant\", \"content\": \"Choosing the... | \n", "[{\"role\": \"user\", \"content\": \"I'm thinking of ... | \n", "1 | \n", "False | \n", "
| 2 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "2 | \n", "2 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"user\", \"content\": \"I've been doing s... | \n", "[{\"role\": \"user\", \"content\": \"I'm thinking of ... | \n", "2 | \n", "True | \n", "
| 3 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "3 | \n", "3 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"assistant\", \"content\": \"That's great... | \n", "[{\"role\": \"user\", \"content\": \"I'm thinking of ... | \n", "3 | \n", "False | \n", "
| 4 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "4 | \n", "4 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"user\", \"content\": \"I'll definitely a... | \n", "[{\"role\": \"user\", \"content\": \"I'm thinking of ... | \n", "4 | \n", "False | \n", "
| 5 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "5 | \n", "5 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"assistant\", \"content\": \"Advanced pai... | \n", "[{\"role\": \"user\", \"content\": \"I'm thinking of ... | \n", "5 | \n", "False | \n", "
| 6 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "6 | \n", "6 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"user\", \"content\": \"I'll definitely a... | \n", "[{\"role\": \"assistant\", \"content\": \"Choosing th... | \n", "5 | \n", "False | \n", "
| 7 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "7 | \n", "7 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"assistant\", \"content\": \"Congratulati... | \n", "[{\"role\": \"user\", \"content\": \"I've been doing ... | \n", "5 | \n", "False | \n", "
| 8 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "8 | \n", "8 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"user\", \"content\": \"That's really hel... | \n", "[{\"role\": \"assistant\", \"content\": \"That's grea... | \n", "5 | \n", "False | \n", "
| 9 | \n", "gpt4_2655b836 | \n", "temporal-reasoning | \n", "0 | \n", "0 | \n", "9 | \n", "9 | \n", "2023-04-10 14:47:00 | \n", "{\"role\": \"assistant\", \"content\": \"Congratulati... | \n", "[{\"role\": \"user\", \"content\": \"I'll definitely ... | \n", "5 | \n", "False | \n", "