cognee/notebooks/node_scores.ipynb
Boris e7644f4b3a
feat: migrate new UI to cognee (#966)
<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: Igor Ilic <igorilic03@gmail.com>
2025-06-18 20:56:44 +02:00

686 lines
72 KiB
Text
Vendored

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "fb1fc4002c4652fc",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:11:34.191932Z",
"start_time": "2025-04-22T20:11:28.743188Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\u001b[2m2025-06-18T18:22:18.419562\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mDeleted old log file: /Users/borisarzentar/Projects/Topoteretes/cognee/logs/2025-06-18_20-05-03.log\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n",
"\n",
"\u001b[2m2025-06-18T18:22:18.420076\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mLogging initialized \u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m \u001b[36mcognee_version\u001b[0m=\u001b[35m0.1.42-dev\u001b[0m \u001b[36mos_info\u001b[0m=\u001b[35m'Darwin 24.5.0 (Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:25 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T6020)'\u001b[0m \u001b[36mpython_version\u001b[0m=\u001b[35m3.11.5\u001b[0m \u001b[36mstructlog_version\u001b[0m=\u001b[35m25.4.0\u001b[0m\n",
"\n",
"\u001b[1mHTTP Request: GET https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json \"HTTP/1.1 200 OK\"\u001b[0m\n",
"/Users/borisarzentar/Projects/Topoteretes/cognee/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/Users/borisarzentar/Projects/Topoteretes/cognee/.venv/lib/python3.11/site-packages/dlt/helpers/dbt/__init__.py:3: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
" import pkg_resources\n"
]
}
],
"source": [
"import cognee"
]
},
{
"cell_type": "markdown",
"id": "6c18de8dad96c3f8",
"metadata": {},
"source": [
"# Basic setup"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "initial_id",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:11:04.996737Z",
"start_time": "2025-04-22T20:11:04.992873Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"# cognee knowledge graph will be created based on this text\n",
"text = \"\"\"\n",
"Natural language processing (NLP) is an interdisciplinary\n",
"subfield of computer science and information retrieval.\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a2989b7d8237bd7d",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:13:18.624544Z",
"start_time": "2025-04-22T20:13:15.107863Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Adding text to cognee:\n",
"Natural language processing (NLP) is an interdisciplinary\n",
"subfield of computer science and information retrieval.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\u001b[1mLangfuse client is disabled since no public_key was provided as a parameter or environment variable 'LANGFUSE_PUBLIC_KEY'. See our docs: https://langfuse.com/docs/sdk/python/low-level-sdk#initialize-client\u001b[0m\u001b[92m20:22:22 - LiteLLM:INFO\u001b[0m: utils.py:3101 - \n",
"LiteLLM completion() model= gpt-4o-mini; provider = openai\n",
"\u001b[1m\n",
"LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\u001b[92m20:22:23 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\n",
"\u001b[1mselected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\u001b[0m\n",
"\u001b[1mEmbeddingRateLimiter initialized: enabled=False, requests_limit=60, interval_seconds=60\u001b[0m\u001b[92m20:22:23 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\n",
"\u001b[1mselected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\u001b[0m\u001b[92m20:22:24 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.068461\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run started: `9ba851a9-f173-544b-a219-ac0af2f657ff`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks(tasks: [Task], data)\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.068921\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.069781\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"User 769fc99f-cb9b-4d3b-b4ef-c81aea922b27 has registered.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\u001b[2m2025-06-18T18:22:24.550667\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.551277\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.551779\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run completed: `9ba851a9-f173-544b-a219-ac0af2f657ff`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks(tasks: [Task], data)\u001b[0m]\u001b[0m"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Text added successfully.\n",
"\n"
]
}
],
"source": [
"# Echo the input, then add it to cognee so it is available for cognify\n",
"print(\"Adding text to cognee:\", text.strip(), sep=\"\\n\")\n",
"await cognee.add(text)\n",
"print(\"Text added successfully.\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3034ec43e0339d72",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:13:46.162905Z",
"start_time": "2025-04-22T20:13:27.466606Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\u001b[2m2025-06-18T18:22:24.559363\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mOntology file 'None' not found. No owl ontology will be attached to the graph.\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.571809\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run started: `8e988ccd-913a-5633-8ccb-e025246f9b94`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks(tasks: [Task], data)\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.572555\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `classify_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.572990\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `check_permissions_on_dataset`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.577489\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mAsync Generator task started: `extract_chunks_from_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:24.580717\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `extract_graph_from_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\u001b[92m20:22:24 - LiteLLM:INFO\u001b[0m: utils.py:3101 - \n",
"LiteLLM completion() model= gpt-4o-mini; provider = openai\n",
"\u001b[1m\n",
"LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\u001b[92m20:22:27 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\n",
"\u001b[1mselected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\u001b[0m\u001b[92m20:22:27 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\n",
"\u001b[1mselected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:27.425691\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'concept' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:27.426182\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'natural language processing' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:27.426553\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'field' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:27.426814\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'computer science' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:27.427082\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'information retrieval' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\u001b[92m20:22:27 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:28 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:28 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:29 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:30 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:30.877815\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `summarize_text`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\u001b[92m20:22:30 - LiteLLM:INFO\u001b[0m: utils.py:3101 - \n",
"LiteLLM completion() model= gpt-4o-mini; provider = openai\n",
"\u001b[1m\n",
"LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\u001b[92m20:22:32 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\n",
"\u001b[1mselected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\u001b[0m\u001b[92m20:22:32 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\n",
"\u001b[1mselected model name for cost calculation: openai/gpt-4o-mini-2024-07-18\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:32.124629\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `add_data_points`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\u001b[92m20:22:32 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:32 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:33 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:33 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:34 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:34 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:34.702737\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `add_data_points`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:34.703250\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `summarize_text`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:34.703582\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `extract_graph_from_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:34.703925\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mAsync Generator task completed: `extract_chunks_from_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:34.704270\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `check_permissions_on_dataset`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:34.704635\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `classify_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n",
"\u001b[2m2025-06-18T18:22:34.705024\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run completed: `8e988ccd-913a-5633-8ccb-e025246f9b94`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks(tasks: [Task], data)\u001b[0m]\u001b[0m"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cognify process complete.\n",
"\n"
]
}
],
"source": [
"# Use LLMs and cognee to create knowledge graph\n",
"await cognee.cognify()\n",
"print(\"Cognify process complete.\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3144acc7a837e75a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:15:01.401236Z",
"start_time": "2025-04-22T20:15:01.397148Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching cognee for insights with query: 'Tell me about NLP'\n"
]
}
],
"source": [
"query_text = \"Tell me about NLP\"\n",
"print(f\"Searching cognee for insights with query: '{query_text}'\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3c3eba3dc338dda2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:15:25.805Z",
"start_time": "2025-04-22T20:15:24.475476Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[92m20:22:35 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m\u001b[92m20:22:35 - LiteLLM:INFO\u001b[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/text-embedding-3-large\n",
"\u001b[1mselected model name for cost calculation: openai/text-embedding-3-large\u001b[0m"
]
}
],
"source": [
"from cognee.api.v1.search import SearchType\n",
"\n",
"# Run an INSIGHTS search against cognee using the query defined earlier\n",
"search_results = await cognee.search(\n",
"    query_type=SearchType.INSIGHTS,\n",
"    query_text=query_text,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3dd224c6791db5e0",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:15:48.466032Z",
"start_time": "2025-04-22T20:15:48.460739Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"({'id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'name': 'natural language processing', 'type': 'Entity', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'contains', 'source_node_id': 'ec72faa8-a238-52f7-b159-81272937347e', 'target_node_id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'updated_at': '2025-06-18 18:22:32'}, {'id': 'ec72faa8-a238-52f7-b159-81272937347e', 'name': '', 'type': 'DocumentChunk', 'created_at': 1750270944579, 'updated_at': 1750270944579, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['text']}, 'belongs_to_set': None, 'text': '\\nNatural language processing (NLP) is an interdisciplinary\\nsubfield of computer science and information retrieval.\\n', 'chunk_size': 36, 'chunk_index': 0, 'cut_type': 'paragraph_end'})\n",
"({'id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'name': 'natural language processing', 'type': 'Entity', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'is_related_to', 'source_node_id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'target_node_id': '02bdab9a-0981-518c-a0d4-1684e0329447', 'ontology_valid': False}, {'id': '02bdab9a-0981-518c-a0d4-1684e0329447', 'name': 'information retrieval', 'type': 'Entity', 'created_at': 1750270947427, 'updated_at': 1750270947427, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'The process of obtaining information system resources that are relevant to an information need.'})\n",
"({'id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'name': 'natural language processing', 'type': 'Entity', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'is_a_subfield_of', 'source_node_id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'target_node_id': '6218dbab-eb6a-5759-a864-b3419755ffe0', 'ontology_valid': False}, {'id': '6218dbab-eb6a-5759-a864-b3419755ffe0', 'name': 'computer science', 'type': 'Entity', 'created_at': 1750270947427, 'updated_at': 1750270947427, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'The study of computation, data processing, and information systems.'})\n",
"({'id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'name': 'natural language processing', 'type': 'Entity', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'is_a', 'source_node_id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'target_node_id': 'dd9713b7-dc20-5101-aad0-1c4216811147', 'updated_at': '2025-06-18 18:22:32'}, {'id': 'dd9713b7-dc20-5101-aad0-1c4216811147', 'name': 'concept', 'type': 'EntityType', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'concept'})\n",
"({'id': 'dd9713b7-dc20-5101-aad0-1c4216811147', 'name': 'concept', 'type': 'EntityType', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'concept'}, {'relationship_name': 'is_a', 'source_node_id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'target_node_id': 'dd9713b7-dc20-5101-aad0-1c4216811147', 'updated_at': '2025-06-18 18:22:32'}, {'id': 'bc338a39-64d6-549a-acec-da60846dd90d', 'name': 'natural language processing', 'type': 'Entity', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'An interdisciplinary subfield of computer science and information retrieval.'})\n"
]
}
],
"source": [
"# Print every search result; each one is a (node, relationship, node) tuple\n",
"for insight in search_results:\n",
"    print(insight)"
]
},
{
"cell_type": "markdown",
"id": "129615f70ac937ef",
"metadata": {},
"source": [
"## Assigning scores to nodes in the graph\n",
"In this section, we show how to assign scores to nodes in the graph. We will use the PageRank\n",
"algorithm for this purpose."
]
},
{
"cell_type": "markdown",
"id": "2d5cb30252b5993a",
"metadata": {},
"source": [
"First, we get the graph (knowledge_graph) from the cognee engine."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8d81b01a72d42529",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:28:17.193207Z",
"start_time": "2025-04-22T20:28:17.186961Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Graph engine loaded successfully.\n",
"\n",
"Graph object:\n",
"8 nodes\n",
"10 edges\n"
]
}
],
"source": [
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
"\n",
"# Fetch the graph engine and read the node and edge lists out of it\n",
"graph_engine = await get_graph_engine()\n",
"nodes, edges = await graph_engine.get_graph_data()\n",
"print(\"Graph engine loaded successfully.\\n\")\n",
"print(\"Graph object:\")\n",
"print(f\"{len(nodes)} nodes\")\n",
"print(f\"{len(edges)} edges\")"
]
},
{
"cell_type": "markdown",
"id": "253c3e7a55a627ae",
"metadata": {},
"source": [
"Then, we inspect the nodes and their data in the graph."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5c48132d2d16b777",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:20:19.362858Z",
"start_time": "2025-04-22T20:20:19.356823Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- ('ec72faa8-a238-52f7-b159-81272937347e', {'name': '', 'type': 'DocumentChunk', 'created_at': 1750270944579, 'updated_at': 1750270944579, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['text']}, 'belongs_to_set': None, 'text': '\\nNatural language processing (NLP) is an interdisciplinary\\nsubfield of computer science and information retrieval.\\n', 'chunk_size': 36, 'chunk_index': 0, 'cut_type': 'paragraph_end'})\n",
"--- ('998dd02b-b033-502f-a273-5bfe7d3b3eb1', {'name': 'text_a796439b56064944e6c73f7751917e91', 'type': 'TextDocument', 'created_at': 1750270944572, 'updated_at': 1750270944572, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'raw_data_location': '/Users/borisarzentar/Projects/Topoteretes/cognee/cognee/.data_storage/data/text_a796439b56064944e6c73f7751917e91.txt', 'external_metadata': '{}', 'mime_type': 'text/plain'})\n",
"--- ('02bdab9a-0981-518c-a0d4-1684e0329447', {'name': 'information retrieval', 'type': 'Entity', 'created_at': 1750270947427, 'updated_at': 1750270947427, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'The process of obtaining information system resources that are relevant to an information need.'})\n",
"--- ('0198571b-3e94-50ea-8b9f-19e3a31080c0', {'name': 'field', 'type': 'EntityType', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'field'})\n",
"--- ('6218dbab-eb6a-5759-a864-b3419755ffe0', {'name': 'computer science', 'type': 'Entity', 'created_at': 1750270947427, 'updated_at': 1750270947427, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'The study of computation, data processing, and information systems.'})\n",
"--- ('bc338a39-64d6-549a-acec-da60846dd90d', {'name': 'natural language processing', 'type': 'Entity', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'An interdisciplinary subfield of computer science and information retrieval.'})\n",
"--- ('dd9713b7-dc20-5101-aad0-1c4216811147', {'name': 'concept', 'type': 'EntityType', 'created_at': 1750270947426, 'updated_at': 1750270947426, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['name']}, 'belongs_to_set': None, 'description': 'concept'})\n",
"--- ('3230db46-ad6d-5a66-af88-357d1dd2a2e3', {'name': '', 'type': 'TextSummary', 'created_at': 1750270952124, 'updated_at': 1750270952124, 'ontology_valid': False, 'version': 1, 'topological_rank': 0, 'metadata': {'index_fields': ['text']}, 'belongs_to_set': None, 'text': 'Natural language processing (NLP) is a multidisciplinary branch of computer science and information retrieval.'})\n"
]
}
],
"source": [
"# Print the first 10 nodes in the graph with their data\n",
"for node in nodes[:10]:\n",
" print('---',node)"
]
},
{
"cell_type": "markdown",
"id": "b6d25c6c77bab8d5",
"metadata": {},
"source": [
"The node data consists of a few fields:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c99e319b3646b234",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:23:47.082650Z",
"start_time": "2025-04-22T20:23:47.077861Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['name', 'type', 'created_at', 'updated_at', 'ontology_valid', 'version', 'topological_rank', 'metadata', 'belongs_to_set', 'text'])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each node is an (id, data) tuple; show the data fields of the last node printed above.\n",
"# Indexing `nodes` directly keeps this cell independent of leaked loop variables,\n",
"# so it still works when it is re-run after other cells.\n",
"nodes[:10][-1][1].keys()"
]
},
{
"cell_type": "markdown",
"id": "674f15fac7d14059",
"metadata": {},
"source": [
"We can see what the graph looks like using the networkx drawing tools."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "a344f3b96685c122",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:30:45.374685Z",
"start_time": "2025-04-22T20:30:44.856090Z"
}
},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import networkx as nx\n",
"\n",
"# Load the cognee nodes and edges into a networkx multigraph so it can be drawn\n",
"graph = nx.MultiDiGraph()\n",
"graph.add_nodes_from(nodes)\n",
"graph.add_edges_from(edges)\n",
"\n",
"# A fixed seed keeps the spring layout reproducible between runs\n",
"pos = nx.spring_layout(graph, seed=42)\n",
"nx.draw(\n",
"    graph,\n",
"    pos,\n",
"    with_labels=False,\n",
"    node_size=100,\n",
"    node_color='white',\n",
"    edge_color='gray',\n",
"    edgecolors='black',\n",
"    alpha=0.5,\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "5201aeb75f7ddf00",
"metadata": {},
"source": [
"We can now add a new field: the PageRank score. We first compute the scores using the networkx library."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "97734d83d57d62ef",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:26:03.564879Z",
"start_time": "2025-04-22T20:26:03.557142Z"
}
},
"outputs": [],
"source": [
"# PageRank score for every node, keyed by node id.\n",
"# networkx is already imported as `nx` in the cell that builds `graph`.\n",
"ranks = nx.pagerank(graph)"
]
},
{
"cell_type": "markdown",
"id": "33507b44e7910b57",
"metadata": {},
"source": [
"We can take a look at the ranks of the first 10 nodes."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "70677c7fc481682a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:27:14.819317Z",
"start_time": "2025-04-22T20:27:14.807163Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ec72faa8-a238-52f7-b159-81272937347e /rank: 0.1263134587793514\n",
"998dd02b-b033-502f-a273-5bfe7d3b3eb1 /rank: 0.09511967865839628\n",
"02bdab9a-0981-518c-a0d4-1684e0329447 /rank: 0.12207032848939373\n",
"0198571b-3e94-50ea-8b9f-19e3a31080c0 /rank: 0.2757995171468045\n",
"6218dbab-eb6a-5759-a864-b3419755ffe0 /rank: 0.12207032848939373\n",
"bc338a39-64d6-549a-acec-da60846dd90d /rank: 0.09511967865839628\n",
"dd9713b7-dc20-5101-aad0-1c4216811147 /rank: 0.09522882980463071\n",
"3230db46-ad6d-5a66-af88-357d1dd2a2e3 /rank: 0.06827817997363327\n"
]
}
],
"source": [
"# Show the PageRank score of up to the first 10 nodes.\n",
"# Distinct loop names avoid rebinding `node`, which other cells use for (id, data) tuples.\n",
"for node_id, rank in list(ranks.items())[:10]:\n",
"    print(node_id, '/rank:', rank)"
]
},
{
"cell_type": "markdown",
"id": "50f6b51f62a65cc7",
"metadata": {},
"source": [
"Finally, we can get a feel for the distribution of the ranks."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "648db7aec21f8b3b",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:31:54.910597Z",
"start_time": "2025-04-22T20:31:54.285586Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, 'Page rank distribution')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Draw on the explicit Axes and finish with plt.show() so that only the figure is\n",
"# rendered (no stray `Text(...)` repr as the cell output).\n",
"fig, ax = plt.subplots(figsize=(10, 5))\n",
"ax.hist(list(ranks.values()), bins=60)\n",
"ax.set_title('Page rank distribution')\n",
"ax.set_xlabel('Page rank score')\n",
"ax.set_ylabel('Number of nodes')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "4e011e5c95dcfe5c",
"metadata": {},
"source": [
"As we can see, a few nodes have a much higher score than the rest, while most of them have low scores. Let's see\n",
"the data of the node with the highest score."
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b2e83e84cd9bf6a5",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:33:06.355984Z",
"start_time": "2025-04-22T20:33:06.352761Z"
}
},
"outputs": [],
"source": [
"# Order the (node id, score) pairs from the highest score to the lowest\n",
"sorted_ranks = sorted(ranks.items(), key=lambda pair: pair[1], reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "a8a49371bc7640ca",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:33:16.681400Z",
"start_time": "2025-04-22T20:33:16.665675Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"('0198571b-3e94-50ea-8b9f-19e3a31080c0', 0.2757995171468045)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_ranks[0]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e9a594c0fe27dea",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-22T20:33:40.483516Z",
"start_time": "2025-04-22T20:33:40.479188Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'field',\n",
" 'type': 'EntityType',\n",
" 'created_at': 1750270947426,\n",
" 'updated_at': 1750270947426,\n",
" 'ontology_valid': False,\n",
" 'version': 1,\n",
" 'topological_rank': 0,\n",
" 'metadata': {'index_fields': ['name']},\n",
" 'belongs_to_set': None,\n",
" 'description': 'field'}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graph.nodes[sorted_ranks[0][0]] # get the node data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95227d160bddb696",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}