{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cognee GraphRAG with LlamaIndex Documents\n",
    "\n",
    "This notebook loads a handful of news articles, wraps them as LlamaIndex `Document` objects, ingests them into cognee, builds a knowledge graph with `cognee.cognify()`, queries it for summaries, and renders the resulting graph with Graphistry."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: cognee and graphistry are imported further down and are not installed\n",
    "# by this cell; they must already be available in the kernel's environment.\n",
    "%pip install llama-index-core"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data\n",
    "\n",
    "We will use a sample news article dataset retrieved from Diffbot, which Tomaz has conveniently made available on GitHub for easy access.\n",
    "\n",
    "The dataset contains 2,500 samples; for ease of experimentation, we will use 5 of these samples, which include the `title` and `text` of news articles."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from llama_index.core import Document\n",
    "\n",
    "NEWS_URL = (\n",
    "    \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv\"\n",
    ")\n",
    "N_SAMPLES = 5  # number of articles used in this walkthrough\n",
    "\n",
    "# Parse only the first N_SAMPLES rows instead of parsing everything and slicing.\n",
    "news = pd.read_csv(NEWS_URL, nrows=N_SAMPLES)\n",
    "\n",
    "news.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare documents as required by LlamaIndex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One LlamaIndex Document per article, formatted as \"<title>: <text>\".\n",
    "documents = [\n",
    "    Document(text=f\"{row['title']}: {row['text']}\")\n",
    "    for _, row in news.iterrows()\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set environment variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Setting environment variables.\n",
    "# Values already exported in the environment are kept; otherwise an empty\n",
    "# placeholder is set, to be replaced with real credentials.\n",
    "os.environ.setdefault(\"GRAPHISTRY_USERNAME\", \"\")\n",
    "os.environ.setdefault(\"GRAPHISTRY_PASSWORD\", \"\")\n",
    "os.environ.setdefault(\"LLM_API_KEY\", \"\")\n",
    "\n",
    "# \"neo4j\" or \"networkx\"\n",
    "os.environ[\"GRAPH_DATABASE_PROVIDER\"] = \"networkx\"\n",
    "# Not needed if using networkx\n",
    "# os.environ[\"GRAPH_DATABASE_URL\"] = \"\"\n",
    "# os.environ[\"GRAPH_DATABASE_USERNAME\"] = \"\"\n",
    "# os.environ[\"GRAPH_DATABASE_PASSWORD\"] = \"\"\n",
    "\n",
    "# \"qdrant\", \"weaviate\" or \"lancedb\"\n",
    "os.environ[\"VECTOR_DB_PROVIDER\"] = \"lancedb\"\n",
    "# Not needed if using \"lancedb\"\n",
    "# os.environ[\"VECTOR_DB_URL\"] = \"\"\n",
    "# os.environ[\"VECTOR_DB_KEY\"] = \"\"\n",
    "\n",
    "# Database provider\n",
    "os.environ[\"DB_PROVIDER\"] = \"sqlite\"  # or \"postgres\"\n",
    "\n",
    "# Database name\n",
    "os.environ[\"DB_NAME\"] = \"cognee_db\"\n",
    "\n",
    "# Postgres specific parameters (Only if Postgres is run)\n",
    "# os.environ[\"DB_HOST\"] = \"127.0.0.1\"\n",
    "# os.environ[\"DB_PORT\"] = \"5432\"\n",
    "# os.environ[\"DB_USERNAME\"] = \"cognee\"\n",
    "# os.environ[\"DB_PASSWORD\"] = \"cognee\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run Cognee with LlamaIndex Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import BinaryIO, Optional, Union\n",
    "\n",
    "from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables\n",
    "from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables\n",
    "from cognee.infrastructure.databases.graph import get_graph_engine\n",
    "from cognee.shared.utils import render_graph\n",
    "from cognee.modules.users.models import User\n",
    "from cognee.modules.users.methods import get_default_user\n",
    "from cognee.tasks.ingestion.ingest_data_with_metadata import ingest_data_with_metadata\n",
    "import cognee\n",
    "\n",
    "# Create a clean slate for cognee -- reset data and system state\n",
    "await cognee.prune.prune_data()\n",
    "await cognee.prune.prune_system(metadata=True)\n",
    "\n",
    "\n",
    "# Add the LlamaIndex documents, and make them available for cognify\n",
    "async def add(\n",
    "    data: Union[BinaryIO, list[BinaryIO], str, list[str]],\n",
    "    dataset_name: str = \"main_dataset\",\n",
    "    user: Optional[User] = None,\n",
    "):\n",
    "    \"\"\"Ingest ``data`` into cognee under ``dataset_name``.\n",
    "\n",
    "    Calls cognee's relational and pgvector ``create_db_and_tables`` helpers,\n",
    "    resolves the default user when ``user`` is None, and then hands the data\n",
    "    to ``ingest_data_with_metadata``.\n",
    "\n",
    "    Args:\n",
    "        data: Content to ingest. This notebook passes the list of LlamaIndex\n",
    "            ``Document`` objects built above.\n",
    "        dataset_name: Name of the dataset the data is ingested into.\n",
    "        user: User the ingestion runs as; the result of ``get_default_user()``\n",
    "            is used when omitted.\n",
    "    \"\"\"\n",
    "    await create_relational_db_and_tables()\n",
    "    await create_pgvector_db_and_tables()\n",
    "\n",
    "    if user is None:\n",
    "        user = await get_default_user()\n",
    "\n",
    "    await ingest_data_with_metadata(data, dataset_name, user)\n",
    "\n",
    "\n",
    "await add(documents)\n",
    "\n",
    "# Use LLMs and cognee to create knowledge graph\n",
    "await cognee.cognify()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query Cognee for summaries related to data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from cognee import SearchType\n",
    "\n",
    "# Query cognee for summaries\n",
    "search_results = await cognee.search(\n",
    "    SearchType.SUMMARIES, query_text=\"What are the main news discussed in the document?\"\n",
    ")\n",
    "# Display search results\n",
    "print(\"\\n Summary of main news discussed:\\n\")\n",
    "if search_results:\n",
    "    print(search_results[0][\"text\"])\n",
    "else:\n",
    "    # An empty result would otherwise raise IndexError on search_results[0].\n",
    "    print(\"No summaries were returned for this query.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Render Knowledge Graph generated from provided data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import graphistry\n",
    "\n",
    "graphistry_username = os.getenv(\"GRAPHISTRY_USERNAME\")\n",
    "graphistry_password = os.getenv(\"GRAPHISTRY_PASSWORD\")\n",
    "\n",
    "# The environment-variable cell falls back to empty placeholders, so only\n",
    "# attempt the Graphistry login when both credentials were actually provided.\n",
    "if graphistry_username and graphistry_password:\n",
    "    # Get graph\n",
    "    graphistry.login(username=graphistry_username, password=graphistry_password)\n",
    "    graph_engine = await get_graph_engine()\n",
    "\n",
    "    graph_url = await render_graph(graph_engine.graph)\n",
    "    print(graph_url)\n",
    "else:\n",
    "    print(\"Set GRAPHISTRY_USERNAME and GRAPHISTRY_PASSWORD to render the knowledge graph.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}