<!-- .github/pull_request_template.md -->
## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cognee GraphRAG with LlamaIndex Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index-core\n"
   ]
  },
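  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The notebook also imports `cognee`, `pandas`, and `graphistry`. If they are not already present in your environment, install them as well; the package names below are the usual PyPI names and are an assumption about your setup."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assumed PyPI package names; skip this cell if the libraries are already installed\n",
    "%pip install cognee pandas graphistry\n"
   ]
  },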
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data\n",
    "\n",
    "We will use a sample news-article dataset retrieved from Diffbot, which Tomaz Bratanic has made available on GitHub.\n",
    "\n",
    "The full dataset contains 2,500 articles; to keep the experiment small, we use only the first 5, each of which provides the `title` and `text` of a news article."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from llama_index.core import Document\n",
    "\n",
    "news = pd.read_csv(\n",
    "    \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv\"\n",
    ")[:5]\n",
    "\n",
    "news.head()"
   ]
  },
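  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, take a quick look at what was loaded. This is a small sketch that only prints each sampled article's title and text length; it is not required for the rest of the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check: one line per sampled article with its title and text length\n",
    "for _, row in news.iterrows():\n",
    "    print(f\"{row['title']} ({len(row['text'])} characters)\")"
   ]
  },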
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare documents as required by LlamaIndex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = [Document(text=f\"{row['title']}: {row['text']}\") for _, row in news.iterrows()]"
   ]
  },
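  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick check, each `Document` now carries the combined title and body in its `text` attribute. The second snippet below is a sketch of an alternative construction that keeps the title in the `Document`'s `metadata` dict instead; it is not used by the rest of this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the first Document; .text holds the combined \"title: text\" string\n",
    "print(documents[0].text[:200])\n",
    "\n",
    "# Alternative sketch (unused below): keep the title as metadata instead of\n",
    "# prepending it to the text\n",
    "documents_with_metadata = [\n",
    "    Document(text=row[\"text\"], metadata={\"title\": row[\"title\"]})\n",
    "    for _, row in news.iterrows()\n",
    "]"
   ]
  },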
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set environment variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Setting environment variables\n",
    "if \"GRAPHISTRY_USERNAME\" not in os.environ:\n",
    "    os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
    "\n",
    "if \"GRAPHISTRY_PASSWORD\" not in os.environ:\n",
    "    os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
    "\n",
    "if \"LLM_API_KEY\" not in os.environ:\n",
    "    os.environ[\"LLM_API_KEY\"] = \"\"\n",
    "\n",
    "# \"neo4j\" or \"networkx\"\n",
    "os.environ[\"GRAPH_DATABASE_PROVIDER\"] = \"networkx\"\n",
    "# Not needed if using networkx\n",
    "# os.environ[\"GRAPH_DATABASE_URL\"] = \"\"\n",
    "# os.environ[\"GRAPH_DATABASE_USERNAME\"] = \"\"\n",
    "# os.environ[\"GRAPH_DATABASE_PASSWORD\"] = \"\"\n",
    "\n",
    "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
    "os.environ[\"VECTOR_DB_PROVIDER\"] = \"lancedb\"\n",
    "# Not needed if using \"lancedb\" or \"pgvector\"\n",
    "# os.environ[\"VECTOR_DB_URL\"] = \"\"\n",
    "# os.environ[\"VECTOR_DB_KEY\"] = \"\"\n",
    "\n",
    "# Relational database provider: \"sqlite\" or \"postgres\"\n",
    "os.environ[\"DB_PROVIDER\"] = \"sqlite\"\n",
    "\n",
    "# Database name\n",
    "os.environ[\"DB_NAME\"] = \"cognee_db\"\n",
    "\n",
    "# Postgres-specific parameters (only if Postgres or PGVector is used)\n",
    "# os.environ[\"DB_HOST\"] = \"127.0.0.1\"\n",
    "# os.environ[\"DB_PORT\"] = \"5432\"\n",
    "# os.environ[\"DB_USERNAME\"] = \"cognee\"\n",
    "# os.environ[\"DB_PASSWORD\"] = \"cognee\""
   ]
  },
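  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an alternative to exporting credentials in the shell or hard-coding them here, you can keep them in a local `.env` file. This is a small sketch that assumes `python-dotenv` is installed; it is not required if the variables above are already set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sketch: load GRAPHISTRY_USERNAME, GRAPHISTRY_PASSWORD, LLM_API_KEY, etc.\n",
    "# from a .env file in the working directory (assumes python-dotenv is installed)\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()"
   ]
  },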
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run Cognee with LlamaIndex Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Union, BinaryIO\n",
    "\n",
    "from cognee.infrastructure.databases.vector.pgvector import (\n",
    "    create_db_and_tables as create_pgvector_db_and_tables,\n",
    ")\n",
    "from cognee.infrastructure.databases.relational import (\n",
    "    create_db_and_tables as create_relational_db_and_tables,\n",
    ")\n",
    "from cognee.modules.users.models import User\n",
    "from cognee.modules.users.methods import get_default_user\n",
    "from cognee.tasks.ingestion.ingest_data import ingest_data\n",
    "import cognee\n",
    "\n",
    "# Create a clean slate for cognee -- reset data and system state\n",
    "await cognee.prune.prune_data()\n",
    "await cognee.prune.prune_system(metadata=True)\n",
    "\n",
    "\n",
    "# Add the LlamaIndex documents and make them available for cognify\n",
    "async def add(\n",
    "    data: Union[BinaryIO, list[BinaryIO], str, list[str]],\n",
    "    dataset_name: str = \"main_dataset\",\n",
    "    user: User = None,\n",
    "):\n",
    "    await create_relational_db_and_tables()\n",
    "    await create_pgvector_db_and_tables()\n",
    "\n",
    "    if user is None:\n",
    "        user = await get_default_user()\n",
    "\n",
    "    await ingest_data(data, dataset_name, user)\n",
    "\n",
    "\n",
    "await add(documents)\n",
    "\n",
    "# Use LLMs and cognee to create a knowledge graph\n",
    "await cognee.cognify()"
   ]
  },
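  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The custom `add()` wrapper above exists so that LlamaIndex `Document` objects can be ingested directly. If you only need the raw text, a simpler sketch is to extract each `Document`'s `text` and pass it to cognee's built-in `add()`; this assumes your cognee version accepts a list of plain strings and a `dataset_name` argument. The ingestion calls are left commented out so the same data is not processed twice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The built-in API works with plain text; extract it from the LlamaIndex Documents\n",
    "texts = [document.text for document in documents]\n",
    "\n",
    "# Alternative ingestion path (sketch, commented out to avoid duplicate ingestion):\n",
    "# await cognee.add(texts, dataset_name=\"main_dataset\")\n",
    "# await cognee.cognify()"
   ]
  },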
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query Cognee for summaries related to the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from cognee import SearchType\n",
    "\n",
    "# Query cognee for summaries\n",
    "search_results = await cognee.search(\n",
    "    query_type=SearchType.SUMMARIES, query_text=\"What are the main news discussed in the document?\"\n",
    ")\n",
    "\n",
    "# Display the first search result\n",
    "print(\"\\nSummary of the main news discussed:\\n\")\n",
    "print(search_results[0][\"text\"])"
   ]
  },
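  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`cognee.search` supports other query types besides `SUMMARIES`. The sketch below assumes `SearchType.INSIGHTS` (entity- and relationship-oriented results) is available in your cognee version, and prints every returned item rather than only the first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: the same search API with a different query type. SearchType.INSIGHTS is\n",
    "# assumed to exist in your cognee version; SUMMARIES and CHUNKS are other options.\n",
    "insight_results = await cognee.search(\n",
    "    query_type=SearchType.INSIGHTS, query_text=\"Which companies are mentioned in the news?\"\n",
    ")\n",
    "\n",
    "for result in insight_results:\n",
    "    print(result)"
   ]
  },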
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Render the knowledge graph generated from the provided data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import graphistry\n",
    "\n",
    "from cognee.infrastructure.databases.graph import get_graph_engine\n",
    "from cognee.shared.utils import render_graph\n",
    "\n",
    "# Log in to Graphistry and fetch the graph engine\n",
    "graphistry.login(\n",
    "    username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\")\n",
    ")\n",
    "graph_engine = await get_graph_engine()\n",
    "\n",
    "# Render the graph and print the Graphistry URL\n",
    "graph_url = await render_graph(graph_engine.graph)\n",
    "print(graph_url)"
   ]
  },
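  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you do not have Graphistry credentials, the sketch below draws the graph locally with `networkx` and `matplotlib` instead. It assumes `GRAPH_DATABASE_PROVIDER` is set to `networkx` (so `graph_engine.graph` is a networkx graph) and that `matplotlib` is installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Local fallback sketch: draw the graph with networkx and matplotlib instead of Graphistry.\n",
    "# Assumes the networkx graph provider is in use and matplotlib is installed.\n",
    "import networkx as nx\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.figure(figsize=(12, 8))\n",
    "nx.draw(graph_engine.graph, node_size=40, with_labels=False)\n",
    "plt.show()"
   ]
  }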
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}