{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cognee GraphRAG with LlamaIndex Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: this notebook also imports `cognee`, `graphistry` and `pandas`;\n",
    "# make sure they are installed in the kernel environment as well.\n",
    "%pip install llama-index-core\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data\n",
    "\n",
    "We will use a sample news article dataset retrieved from Diffbot, which Tomaz has conveniently made available on GitHub for easy access.\n",
    "\n",
    "The dataset contains 2,500 samples; for ease of experimentation, we will use 5 of these samples, which include the `title` and `text` of news articles."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from llama_index.core import Document\n",
    "\n",
    "news = pd.read_csv(\n",
    "    \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv\"\n",
    ")[:5]\n",
    "\n",
    "news.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare documents as required by LlamaIndex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The row index from iterrows() is unused, so bind it to `_`.\n",
    "documents = [Document(text=f\"{row['title']}: {row['text']}\") for _, row in news.iterrows()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set environment variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Setting environment variables\n",
    "if \"GRAPHISTRY_USERNAME\" not in os.environ:\n",
    "    os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
    "\n",
    "if \"GRAPHISTRY_PASSWORD\" not in os.environ:\n",
    "    os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
    "\n",
    "if \"LLM_API_KEY\" not in os.environ:\n",
    "    os.environ[\"LLM_API_KEY\"] = \"\"\n",
    "\n",
    "# \"neo4j\" or \"networkx\"\n",
    "os.environ[\"GRAPH_DATABASE_PROVIDER\"] = \"networkx\"\n",
    "# Not needed if using networkx\n",
    "# os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
    "# os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
    "# os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
    "\n",
    "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
    "os.environ[\"VECTOR_DB_PROVIDER\"] = \"lancedb\"\n",
    "# Not needed if using \"lancedb\" or \"pgvector\"\n",
    "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
    "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
    "\n",
    "# Relational Database provider \"sqlite\" or \"postgres\"\n",
    "os.environ[\"DB_PROVIDER\"] = \"sqlite\"\n",
    "\n",
    "# Database name\n",
    "os.environ[\"DB_NAME\"] = \"cognee_db\"\n",
    "\n",
    "# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
    "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
    "# os.environ[\"DB_PORT\"]=\"5432\"\n",
    "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
    "# os.environ[\"DB_PASSWORD\"]=\"cognee\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run Cognee with LlamaIndex Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Union, BinaryIO\n",
    "\n",
    "from cognee.infrastructure.databases.vector.pgvector import (\n",
    "    create_db_and_tables as create_pgvector_db_and_tables,\n",
    ")\n",
    "from cognee.infrastructure.databases.relational import (\n",
    "    create_db_and_tables as create_relational_db_and_tables,\n",
    ")\n",
    "from cognee.modules.users.models import User\n",
    "from cognee.modules.users.methods import get_default_user\n",
    "from cognee.tasks.ingestion.ingest_data import ingest_data\n",
    "import cognee\n",
    "\n",
    "# Create a clean slate for cognee -- reset data and system state\n",
    "await cognee.prune.prune_data()\n",
    "await cognee.prune.prune_system(metadata=True)\n",
    "\n",
    "\n",
    "# Add the LlamaIndex documents, and make it available for cognify\n",
    "async def add(\n",
    "    data: Union[BinaryIO, list[BinaryIO], str, list[str]],\n",
    "    dataset_name: str = \"main_dataset\",\n",
    "    user: User = None,\n",
    "):\n",
    "    await create_relational_db_and_tables()\n",
    "    await create_pgvector_db_and_tables()\n",
    "\n",
    "    if user is None:\n",
    "        user = await get_default_user()\n",
    "\n",
    "    await ingest_data(data, dataset_name, user)\n",
    "\n",
    "\n",
    "await add(documents)\n",
    "\n",
    "# Use LLMs and cognee to create knowledge graph\n",
    "await cognee.cognify()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query Cognee for summaries related to data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from cognee import SearchType\n",
    "\n",
    "# Query cognee for summaries\n",
    "search_results = await cognee.search(\n",
    "    query_type=SearchType.SUMMARIES, query_text=\"What are the main news discussed in the document?\"\n",
    ")\n",
    "# Display search results\n",
    "print(\"\\n Summary of main news discussed:\\n\")\n",
    "print(search_results[0][\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Render Knowledge Graph generated from provided data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import graphistry\n",
    "\n",
    "from cognee.infrastructure.databases.graph import get_graph_engine\n",
    "from cognee.shared.utils import render_graph\n",
    "\n",
    "# Get graph\n",
    "graphistry.login(\n",
    "    username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\")\n",
    ")\n",
    "graph_engine = await get_graph_engine()\n",
    "\n",
    "graph_url = await render_graph(graph_engine.graph)\n",
    "print(graph_url)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}