cognee/notebooks/cognee_llama_index.ipynb
Igor Ilic 5fe7ff9883
refactor: Refactor search so graph completion is used by default (#505)

## Description
Refactor search so that the query type doesn't need to be provided, making it
simpler for new users.
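
A minimal sketch of the intended call after this change (illustrative only; it assumes graph completion becomes the default query type):

```python
import cognee

# Before: a search type had to be passed explicitly
# results = await cognee.search(query_type=SearchType.GRAPH_COMPLETION, query_text="...")

# After: the query type can be omitted and graph completion is used by default
results = await cognee.search(query_text="What are the main news discussed in the documents?")
```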

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


## Summary by CodeRabbit

- **Refactor**
  - Improved the search interface by standardizing parameter usage with explicit keyword arguments for specifying search types, enhancing clarity and consistency.
- **Tests**
  - Updated test cases and example integrations to align with the revised search parameters, ensuring consistent behavior and reliable validation of search outcomes.
2025-02-07 17:16:34 +01:00


{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cognee GraphRAG with LlamaIndex Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install llama-index-core"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data\n",
"\n",
"We will use a sample news article dataset retrieved from Diffbot, which Tomaz has conveniently made available on GitHub for easy access.\n",
"\n",
"The dataset contains 2,500 samples; for ease of experimentation, we will use 5 of these samples, which include the `title` and `text` of news articles."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from llama_index.core import Document\n",
"\n",
"news = pd.read_csv(\n",
" \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv\"\n",
")[:5]\n",
"\n",
"news.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare documents as required by LlamaIndex"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"documents = [Document(text=f\"{row['title']}: {row['text']}\") for i, row in news.iterrows()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set environment variables"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Setting environment variables\n",
"if \"GRAPHISTRY_USERNAME\" not in os.environ:\n",
" os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
"\n",
"if \"GRAPHISTRY_PASSWORD\" not in os.environ:\n",
" os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
"\n",
"if \"LLM_API_KEY\" not in os.environ:\n",
" os.environ[\"LLM_API_KEY\"] = \"\"\n",
"\n",
"# \"neo4j\" or \"networkx\"\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"] = \"networkx\"\n",
"# Not needed if using networkx\n",
"# os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
"# os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
"# os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
"\n",
"# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
"os.environ[\"VECTOR_DB_PROVIDER\"] = \"lancedb\"\n",
"# Not needed if using \"lancedb\" or \"pgvector\"\n",
"# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
"# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
"\n",
"# Relational Database provider \"sqlite\" or \"postgres\"\n",
"os.environ[\"DB_PROVIDER\"] = \"sqlite\"\n",
"\n",
"# Database name\n",
"os.environ[\"DB_NAME\"] = \"cognee_db\"\n",
"\n",
"# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
"# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
"# os.environ[\"DB_PORT\"]=\"5432\"\n",
"# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
"# os.environ[\"DB_PASSWORD\"]=\"cognee\""
]
},
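{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional alternative to setting these variables inline, the same configuration can be kept in a `.env` file and loaded here instead. The next cell is a small sketch of that approach and assumes the `python-dotenv` package is installed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional alternative: load the configuration above from a .env file.\n",
"# NOTE: this assumes the python-dotenv package is installed (pip install python-dotenv).\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()  # reads KEY=VALUE pairs from a local .env file into os.environ"
]
},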
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run Cognee with LlamaIndex Documents"
]
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"from typing import Union, BinaryIO\n",
"\n",
"from cognee.infrastructure.databases.vector.pgvector import (\n",
" create_db_and_tables as create_pgvector_db_and_tables,\n",
")\n",
"from cognee.infrastructure.databases.relational import (\n",
" create_db_and_tables as create_relational_db_and_tables,\n",
")\n",
"from cognee.modules.users.models import User\n",
"from cognee.modules.users.methods import get_default_user\n",
"from cognee.tasks.ingestion.ingest_data import ingest_data\n",
"import cognee\n",
"\n",
"# Create a clean slate for cognee -- reset data and system state\n",
"await cognee.prune.prune_data()\n",
"await cognee.prune.prune_system(metadata=True)\n",
"\n",
"\n",
"# Add the LlamaIndex documents, and make it available for cognify\n",
"async def add(\n",
" data: Union[BinaryIO, list[BinaryIO], str, list[str]],\n",
" dataset_name: str = \"main_dataset\",\n",
" user: User = None,\n",
"):\n",
" await create_relational_db_and_tables()\n",
" await create_pgvector_db_and_tables()\n",
"\n",
" if user is None:\n",
" user = await get_default_user()\n",
"\n",
" await ingest_data(data, dataset_name, user)\n",
"\n",
"\n",
"await add(documents)\n",
"\n",
"# Use LLMs and cognee to create knowledge graph\n",
"await cognee.cognify()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query Cognee for summaries related to data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cognee import SearchType\n",
"\n",
"# Query cognee for summaries\n",
"search_results = await cognee.search(\n",
" query_type=SearchType.SUMMARIES, query_text=\"What are the main news discussed in the document?\"\n",
")\n",
"# Display search results\n",
"print(\"\\n Summary of main news discussed:\\n\")\n",
"print(search_results[0][\"text\"])"
]
},
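{
"cell_type": "markdown",
"metadata": {},
"source": [
"With this release the query type no longer has to be provided: calling `cognee.search` with only `query_text` falls back to graph completion. The next cell is a small illustrative sketch of that default behavior."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Query cognee without specifying a query type -- graph completion is used by default\n",
"default_results = await cognee.search(\n",
"    query_text=\"What are the main news discussed in the document?\"\n",
")\n",
"\n",
"# Display the graph completion results\n",
"for result in default_results:\n",
"    print(result)"
]
},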
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Render Knowledge Graph generated from provided data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import graphistry\n",
"\n",
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
"from cognee.shared.utils import render_graph\n",
"\n",
"# Get graph\n",
"graphistry.login(\n",
" username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\")\n",
")\n",
"graph_engine = await get_graph_engine()\n",
"\n",
"graph_url = await render_graph(graph_engine.graph)\n",
"print(graph_url)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}