{ "cells": [ { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:58:00.193158Z", "start_time": "2025-03-04T11:58:00.190238Z" } }, "cell_type": "code", "source": [ "import nest_asyncio\n", "nest_asyncio.apply()" ], "id": "2efba278d106bb5f", "outputs": [], "execution_count": 2 }, { "metadata": {}, "cell_type": "markdown", "source": [ "### Environment Configuration\n", "#### Setup required directories and environment variables.\n" ], "id": "ccbb2bc23aa456ee" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:59:33.879188Z", "start_time": "2025-03-04T11:59:33.873682Z" } }, "cell_type": "code", "source": [ "import pathlib\n", "import os\n", "import cognee\n", "\n", "notebook_dir = pathlib.Path().resolve()\n", "data_directory_path = str(notebook_dir / \".data_storage\")\n", "cognee_directory_path = str(notebook_dir / \".cognee_system\")\n", "\n", "cognee.config.data_root_directory(data_directory_path)\n", "cognee.config.system_root_directory(cognee_directory_path)\n", "\n", "BASE_URL = \"https://pokeapi.co/api/v2/\"\n", "os.environ[\"BUCKET_URL\"] = data_directory_path\n", "os.environ[\"DATA_WRITER__DISABLE_COMPRESSION\"] = \"true\"\n" ], "id": "662d554f96f211d9", "outputs": [], "execution_count": 8 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Initialize DLT Pipeline\n", "### Create the DLT pipeline to fetch Pokémon data.\n" ], "id": "36ae0be71f6e9167" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:58:03.982939Z", "start_time": "2025-03-04T11:58:03.819676Z" } }, "cell_type": "code", "source": [ "import dlt\n", "from pathlib import Path\n", "\n", "pipeline = dlt.pipeline(\n", " pipeline_name=\"pokemon_pipeline\",\n", " destination=\"filesystem\",\n", " dataset_name=\"pokemon_data\",\n", ")\n" ], "id": "25101ae5f016ce0c", "outputs": [], "execution_count": 4 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Fetch Pokémon List\n", "### Retrieve a list of Pokémon from the API.\n" ], "id": "9a87ce05a072c48b" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:58:03.990076Z", "start_time": "2025-03-04T11:58:03.987199Z" } }, "cell_type": "code", "source": [ "@dlt.resource(write_disposition=\"replace\")\n", "def pokemon_list(limit: int = 50):\n", " import requests\n", " response = requests.get(f\"{BASE_URL}pokemon\", params={\"limit\": limit})\n", " response.raise_for_status()\n", " yield response.json()[\"results\"]\n" ], "id": "3b6e60778c61e24a", "outputs": [], "execution_count": 5 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Fetch Pokémon Details\n", "### Fetch detailed information about each Pokémon.\n" ], "id": "9952767846194e97" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:58:03.996394Z", "start_time": "2025-03-04T11:58:03.994122Z" } }, "cell_type": "code", "source": [ "@dlt.transformer(data_from=pokemon_list)\n", "def pokemon_details(pokemons):\n", " \"\"\"Fetches detailed info for each Pokémon\"\"\"\n", " import requests\n", " for pokemon in pokemons:\n", " response = requests.get(pokemon[\"url\"])\n", " response.raise_for_status()\n", " yield response.json()\n" ], "id": "79ec9fef12267485", "outputs": [], "execution_count": 6 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Run Data Pipeline\n", "### Execute the pipeline and store Pokémon data.\n" ], "id": "41e05f660bf9e9d2" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:59:41.571015Z", "start_time": "2025-03-04T11:59:36.840744Z" } }, "cell_type": "code", "source": [ "info = pipeline.run([pokemon_list, pokemon_details])\n", "print(info)\n" ], "id": "20a3b2c7f404677f", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline pokemon_pipeline load step completed in 0.06 seconds\n", "1 load package(s) were loaded to destination filesystem and into dataset pokemon_data\n", "The filesystem destination used file:///Users/lazar/PycharmProjects/cognee/.data_storage location to store data\n", "Load package 1741089576.860229 is LOADED and contains no failed jobs\n" ] } ], "execution_count": 9 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Load Pokémon Abilities\n", "### Load Pokémon ability data from stored files.\n" ], "id": "937f10b8d1037743" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:59:44.377719Z", "start_time": "2025-03-04T11:59:44.363718Z" } }, "cell_type": "code", "source": [ "import json\n", "from cognee.low_level import DataPoint\n", "from uuid import uuid5, NAMESPACE_OID\n", "\n", "class Abilities(DataPoint):\n", " name: str = \"Abilities\"\n", " metadata: dict = {\"index_fields\": [\"name\"]}\n", "\n", "def load_abilities_data(jsonl_abilities):\n", " abilities_root = Abilities()\n", " pokemon_abilities = []\n", "\n", " for jsonl_ability in jsonl_abilities:\n", " with open(jsonl_ability, \"r\") as f:\n", " for line in f:\n", " ability = json.loads(line)\n", " ability[\"id\"] = uuid5(NAMESPACE_OID, ability[\"_dlt_id\"])\n", " ability[\"name\"] = ability[\"ability__name\"]\n", " ability[\"is_type\"] = abilities_root\n", " pokemon_abilities.append(ability)\n", "\n", " return abilities_root, pokemon_abilities\n" ], "id": "be73050036439ea1", "outputs": [], "execution_count": 10 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Load Pokémon Data\n", "### Load Pokémon details and associate them with abilities.\n" ], "id": "98c97f799f73df77" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:59:46.251306Z", "start_time": "2025-03-04T11:59:46.238283Z" } }, "cell_type": "code", "source": [ "from typing import List, Optional\n", "\n", "class Pokemons(DataPoint):\n", " name: str = \"Pokemons\"\n", " have: Abilities\n", " metadata: dict = {\"index_fields\": [\"name\"]}\n", "\n", "class PokemonAbility(DataPoint):\n", " name: str\n", " ability__name: str\n", " ability__url: str\n", " is_hidden: bool\n", " slot: int\n", " _dlt_load_id: str\n", " _dlt_id: str\n", " _dlt_parent_id: str\n", " _dlt_list_idx: str\n", " is_type: Abilities\n", " metadata: dict = {\"index_fields\": [\"ability__name\"]}\n", "\n", "class Pokemon(DataPoint):\n", " name: str\n", " base_experience: int\n", " height: int\n", " weight: int\n", " is_default: bool\n", " order: int\n", " location_area_encounters: str\n", " species__name: str\n", " species__url: str\n", " cries__latest: str\n", " cries__legacy: str\n", " sprites__front_default: str\n", " sprites__front_shiny: str\n", " sprites__back_default: Optional[str]\n", " sprites__back_shiny: Optional[str]\n", " _dlt_load_id: str\n", " _dlt_id: str\n", " is_type: Pokemons\n", " abilities: List[PokemonAbility]\n", " metadata: dict = {\"index_fields\": [\"name\"]}\n", "\n", "def load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root):\n", " pokemons = []\n", "\n", " for jsonl_pokemon in jsonl_pokemons:\n", " with open(jsonl_pokemon, \"r\") as f:\n", " for line in f:\n", " pokemon_data = json.loads(line)\n", " abilities = [\n", " ability for ability in pokemon_abilities\n", " if ability[\"_dlt_parent_id\"] == pokemon_data[\"_dlt_id\"]\n", " ]\n", " pokemon_data[\"external_id\"] = pokemon_data[\"id\"]\n", " pokemon_data[\"id\"] = uuid5(NAMESPACE_OID, str(pokemon_data[\"id\"]))\n", " pokemon_data[\"abilities\"] = [PokemonAbility(**ability) for ability in abilities]\n", " pokemon_data[\"is_type\"] = pokemon_root\n", " pokemons.append(Pokemon(**pokemon_data))\n", "\n", " return pokemons\n" ], "id": "7862951248df0bf5", "outputs": [], "execution_count": 11 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Process Pokémon Data\n", "### Load and associate Pokémon abilities.\n" ], "id": "676fa5a2b61c2107" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:59:47.365226Z", "start_time": "2025-03-04T11:59:47.356722Z" } }, "cell_type": "code", "source": [ "STORAGE_PATH = Path(\".data_storage/pokemon_data/pokemon_details\")\n", "jsonl_pokemons = sorted(STORAGE_PATH.glob(\"*.jsonl\"))\n", "\n", "ABILITIES_PATH = Path(\".data_storage/pokemon_data/pokemon_details__abilities\")\n", "jsonl_abilities = sorted(ABILITIES_PATH.glob(\"*.jsonl\"))\n", "\n", "abilities_root, pokemon_abilities = load_abilities_data(jsonl_abilities)\n", "pokemon_root = Pokemons(have=abilities_root)\n", "pokemons = load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root)\n" ], "id": "ad14cdecdccd71bb", "outputs": [], "execution_count": 12 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Initialize Cognee\n", "### Setup Cognee for data processing.\n" ], "id": "59dec67b2ae50f0f" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:59:49.244577Z", "start_time": "2025-03-04T11:59:48.618261Z" } }, "cell_type": "code", "source": [ "import asyncio\n", "from cognee.low_level import setup as cognee_setup\n", "\n", "async def initialize_cognee():\n", " await cognee.prune.prune_data()\n", " await cognee.prune.prune_system(metadata=True)\n", " await cognee_setup()\n", "\n", "await initialize_cognee()\n" ], "id": "d2e095ae576a02c1", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:cognee.infrastructure.databases.relational.sqlalchemy.SqlAlchemyAdapter:Database deleted successfully.INFO:cognee.infrastructure.databases.relational.sqlalchemy.SqlAlchemyAdapter:Database deleted successfully." ] } ], "execution_count": 13 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Process Pokémon Data\n", "### Add Pokémon data points to Cognee.\n" ], "id": "5f0b8090bc7b1fe6" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T11:59:57.744035Z", "start_time": "2025-03-04T11:59:50.574033Z" } }, "cell_type": "code", "source": [ "from cognee.modules.pipelines.tasks.Task import Task\n", "from cognee.tasks.storage import add_data_points\n", "from cognee.modules.pipelines import run_tasks\n", "\n", "tasks = [Task(add_data_points, task_config={\"batch_size\": 50})]\n", "results = run_tasks(\n", " tasks=tasks,\n", " data=pokemons,\n", " dataset_id=uuid5(NAMESPACE_OID, \"Pokemon\"),\n", " pipeline_name='pokemon_pipeline',\n", ")\n", "\n", "async for result in results:\n", " print(result)\n", "print(\"Done\")\n" ], "id": "ffa12fc1f5350d95", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:run_tasks(tasks: [Task], data):Pipeline run started: `fd2ed59d-b550-5b05-bbe6-7b708fe12483`INFO:run_tasks(tasks: [Task], data):Coroutine task started: `add_data_points`" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "User d347ea85-e512-4cae-b9d7-496fe1745424 has registered.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/lazar/PycharmProjects/cognee/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py:79: SAWarning: This declarative base already contains a class with the same class name and module name as cognee.infrastructure.databases.vector.pgvector.PGVectorAdapter.PGVectorDataPoint, and will be replaced in the string-lookup table.\n", " class PGVectorDataPoint(Base):\n", "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"/Users/lazar/PycharmProjects/cognee/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py:113: SAWarning: This declarative base already contains a class with the same class name and module name as cognee.infrastructure.databases.vector.pgvector.PGVectorAdapter.PGVectorDataPoint, and will be replaced in the string-lookup table.\n", " class PGVectorDataPoint(Base):\n", "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 8, column: 16, offset: 335} for query: '\\n UNWIND $nodes AS node\\n MERGE (n {id: node.node_id})\\n ON CREATE SET n += node.properties, n.updated_at = timestamp()\\n ON MATCH SET n += node.properties, n.updated_at = timestamp()\\n WITH n, node.node_id AS label\\n CALL apoc.create.addLabels(n, [label]) YIELD node AS labeledNode\\n RETURN ID(labeledNode) AS internal_id, labeledNode.id AS nodeId\\n 'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 1, column: 18, offset: 17} for query: 'MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 3, column: 16, offset: 43} for query: '\\n MATCH (n)-[r]->(m)\\n RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties\\n 'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 3, column: 33, offset: 60} for query: '\\n MATCH (n)-[r]->(m)\\n RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties\\n 'INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:run_tasks(tasks: [Task], data):Coroutine task completed: `add_data_points`INFO:run_tasks(tasks: [Task], data):Pipeline run completed: `fd2ed59d-b550-5b05-bbe6-7b708fe12483`" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Done\n" ] } ], "execution_count": 14 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Search Pokémon Data\n", "### Execute a search query using Cognee.\n" ], "id": "e0d98d9832a2797a" }, { "metadata": { "ExecuteTime": { "end_time": "2025-03-04T12:00:02.878871Z", "start_time": "2025-03-04T11:59:59.571965Z" } }, "cell_type": "code", "source": [ "from cognee.api.v1.search import SearchType\n", "\n", "search_results = await cognee.search(\n", " query_type=SearchType.GRAPH_COMPLETION,\n", " query_text=\"pokemons?\"\n", ")\n", "\n", "print(\"Search results:\")\n", "for result_text in search_results:\n", " print(result_text)" ], "id": "bb2476b6b0c2aff", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 1, column: 18, offset: 17} for query: 'MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 3, column: 16, offset: 43} for query: '\\n MATCH (n)-[r]->(m)\\n RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties\\n 'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 3, column: 33, offset: 60} for query: '\\n MATCH (n)-[r]->(m)\\n RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties\\n 'INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\u001B[92m13:00:02 - LiteLLM:INFO\u001B[0m: utils.py:2784 - \n", "LiteLLM completion() model= gpt-4o-mini; provider = openaiINFO:LiteLLM:\n", "LiteLLM completion() model= gpt-4o-mini; provider = openai" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Search results:\n", "The Pokemons mentioned are: golbat, jigglypuff, raichu, vulpix, and pikachu.\n" ] } ], "execution_count": 15 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "a4c2d3e9c15b017" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 }