cognee/notebooks/full_run.ipynb
2024-04-30 23:14:11 +02:00

465 lines
37 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "38135bf7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:NetworkXAdapter:File /Users/vasa/Projects/cognee/.cognee_system/databases/cognee_graph.pkl not found. Initializing an empty graph./Users/vasa/Projects/cognee/.venv/lib/python3.11/site-packages/dlt/common/configuration/container.py:94: DeprecationWarning: currentThread() is deprecated, use current_thread() instead\n",
" if m := re.match(r\"dlt-pool-(\\d+)-\", threading.currentThread().getName()):\n",
"/Users/vasa/Projects/cognee/.venv/lib/python3.11/site-packages/dlt/common/configuration/container.py:94: DeprecationWarning: getName() is deprecated, get the name attribute instead\n",
" if m := re.match(r\"dlt-pool-(\\d+)-\", threading.currentThread().getName()):\n"
]
},
{
"data": {
"text/plain": [
"[[LoadInfo(pipeline=<dlt.pipeline.pipeline.Pipeline object at 0x32927cad0>, metrics={'1714493358.732525': [{'started_at': DateTime(2024, 4, 30, 16, 9, 19, 653744, tzinfo=Timezone('UTC')), 'finished_at': DateTime(2024, 4, 30, 16, 9, 19, 957893, tzinfo=Timezone('UTC'))}]}, destination_type='dlt.destinations.duckdb', destination_displayable_credentials='duckdb:///:external:', destination_name='duckdb', environment=None, staging_type=None, staging_name=None, staging_displayable_credentials=None, destination_fingerprint='', dataset_name='code', loads_ids=['1714493358.732525'], load_packages=[LoadPackageInfo(load_id='1714493358.732525', package_path='/Users/vasa/.dlt/pipelines/file_load_from_filesystem/load/loaded/1714493358.732525', state='loaded', schema=Schema file_load_from_filesystem at 13554777360, schema_update={'_dlt_loads': {'name': '_dlt_loads', 'columns': {'load_id': {'name': 'load_id', 'data_type': 'text', 'nullable': False}, 'schema_name': {'name': 'schema_name', 'data_type': 'text', 'nullable': True}, 'status': {'name': 'status', 'data_type': 'bigint', 'nullable': False}, 'inserted_at': {'name': 'inserted_at', 'data_type': 'timestamp', 'nullable': False}, 'schema_version_hash': {'name': 'schema_version_hash', 'data_type': 'text', 'nullable': True}}, 'write_disposition': 'skip', 'resource': '_dlt_loads', 'description': 'Created by DLT. 
Tracks completed loads', 'table_format': None}, '_dlt_pipeline_state': {'columns': {'version': {'name': 'version', 'data_type': 'bigint', 'nullable': False}, 'engine_version': {'name': 'engine_version', 'data_type': 'bigint', 'nullable': False}, 'pipeline_name': {'name': 'pipeline_name', 'data_type': 'text', 'nullable': False}, 'state': {'name': 'state', 'data_type': 'text', 'nullable': False}, 'created_at': {'name': 'created_at', 'data_type': 'timestamp', 'nullable': False}, 'version_hash': {'name': 'version_hash', 'data_type': 'text', 'nullable': True}, '_dlt_load_id': {'name': '_dlt_load_id', 'data_type': 'text', 'nullable': False}, '_dlt_id': {'name': '_dlt_id', 'data_type': 'text', 'nullable': False, 'unique': True}}, 'write_disposition': 'append', 'name': '_dlt_pipeline_state', 'resource': '_dlt_pipeline_state', 'x-normalizer': {'seen-data': True}, 'table_format': None}, 'file_metadata': {'columns': {'id': {'name': 'id', 'nullable': False, 'merge_key': True, 'data_type': 'text'}, 'name': {'name': 'name', 'data_type': 'text', 'nullable': True}, 'file_path': {'name': 'file_path', 'data_type': 'text', 'nullable': True}, 'extension': {'name': 'extension', 'data_type': 'text', 'nullable': True}, 'mime_type': {'name': 'mime_type', 'data_type': 'text', 'nullable': True}, 'keywords': {'name': 'keywords', 'data_type': 'text', 'nullable': True}, '_dlt_load_id': {'name': '_dlt_load_id', 'data_type': 'text', 'nullable': False}, '_dlt_id': {'name': '_dlt_id', 'data_type': 'text', 'nullable': False, 'unique': True}}, 'write_disposition': 'merge', 'name': 'file_metadata', 'resource': 'data_resources', 'x-normalizer': {'seen-data': True}, 'table_format': None}, '_dlt_version': {'name': '_dlt_version', 'columns': {'version': {'name': 'version', 'data_type': 'bigint', 'nullable': False}, 'engine_version': {'name': 'engine_version', 'data_type': 'bigint', 'nullable': False}, 'inserted_at': {'name': 'inserted_at', 'data_type': 'timestamp', 'nullable': False}, 'schema_name': 
{'name': 'schema_name', 'data_type': 'text', 'nullable': False}, 'version_hash': {'name': 'version_hash', 'data_type': 'text', 'nullable': False}, 'schema': {'name': 'schema', 'data_type': 'text', 'nullable': False}}, 'write_disposition': 'skip', 'resource': '_dlt_version', 'description': 'Created by DLT. Tracks schema updates', 'table_format': None}}, completed_at=DateTime(2024, 4, 30, 16, 9, 19, 951047, tzinfo=Timezone('UTC')), jobs={'new_jobs': [], 'failed_jobs': [], 'started_jobs': [], 'completed_jobs': [LoadJobInfo(state='completed_jobs', file_path='/Users/vasa/.dlt/pipelines/file_load_from_filesystem/load/loaded/1714493358.732525/completed_jobs/_dlt_pipeline_state.5b1065da97.0.insert_values', file_size=526, created_at=DateTime(2024, 4, 30, 16, 9, 19, 309619, tzinfo=Timezone('UTC')), elapsed=0.6414282321929932, job_file_info=ParsedLoadJobFileName(table_name='_dlt_pipeline_state', file_id='5b1065da97', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='/Users/vasa/.dlt/pipelines/file_load_from_filesystem/load/loaded/1714493358.732525/completed_jobs/file_metadata.13d63c321b.0.insert_values', file_size=354, created_at=DateTime(2024, 4, 30, 16, 9, 19, 309748, tzinfo=Timezone('UTC')), elapsed=0.6412985324859619, job_file_info=ParsedLoadJobFileName(table_name='file_metadata', file_id='13d63c321b', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='/Users/vasa/.dlt/pipelines/file_load_from_filesystem/load/loaded/1714493358.732525/completed_jobs/file_metadata.c6d93b3a58.0.sql', file_size=401, created_at=DateTime(2024, 4, 30, 16, 9, 19, 721255, tzinfo=Timezone('UTC')), elapsed=0.22979211807250977, job_file_info=ParsedLoadJobFileName(table_name='file_metadata', file_id='c6d93b3a58', retry_count=0, file_format='sql'), failed_message=None)]})], first_run=True)]]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from os import path\n",
"import cognee\n",
"import dspy\n",
"from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
"\n",
"# Point cognee at project-local data and system directories (must happen\n",
"# before any add/prune/cognify call so storage lands in the repo tree).\n",
"data_directory_path = path.abspath(\"../.data\")\n",
"cognee.config.data_root_directory(data_directory_path)\n",
"\n",
"cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
"cognee.config.system_root_directory(cognee_directory_path)\n",
"\n",
"# Use the source-code ontology for graph extraction.\n",
"cognee.config.set_graph_model(SourceCodeGraph)\n",
"\n",
"# Start from a clean system state.\n",
"await cognee.prune.prune_system()\n",
"\n",
"# Retrieval model for dspy. NOTE(review): nothing later in this notebook\n",
"# uses the dspy retriever — confirm it is still needed before keeping it.\n",
"colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = \"http://20.102.90.50:2017/wiki17_abstracts\")\n",
"dspy.configure(rm = colbertv2_wiki17_abstracts)\n",
"\n",
"# Ingest every file under the data directory into the 'code' dataset.\n",
"dataset_name = \"code\"\n",
"await cognee.add(\"data://\" + data_directory_path, dataset_name)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "44603a2a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['code']\n"
]
},
{
"ename": "CatalogException",
"evalue": "Catalog Error: Table with name file_metadata does not exist!\nDid you mean \"code_staging.file_metadata\"?",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mCatalogException\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 12\u001b[0m\n\u001b[1;32m 8\u001b[0m cognee\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39msystem_root_directory(cognee_directory_path)\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(cognee\u001b[38;5;241m.\u001b[39mdatasets\u001b[38;5;241m.\u001b[39mlist_datasets())\n\u001b[0;32m---> 12\u001b[0m train_dataset \u001b[38;5;241m=\u001b[39m \u001b[43mcognee\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdatasets\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery_data\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshort_stories\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(train_dataset))\n",
"File \u001b[0;32m~/Projects/cognee/cognee/api/v1/datasets/datasets.py:17\u001b[0m, in \u001b[0;36mdatasets.query_data\u001b[0;34m(dataset_name)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mquery_data\u001b[39m(dataset_name: \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 16\u001b[0m db \u001b[38;5;241m=\u001b[39m infrastructure_config\u001b[38;5;241m.\u001b[39mget_config(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdatabase_engine\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_files_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_name\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Projects/cognee/cognee/infrastructure/databases/relational/duckdb/DuckDBAdapter.py:21\u001b[0m, in \u001b[0;36mDuckDBAdapter.get_files_metadata\u001b[0;34m(self, dataset_name)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_files_metadata\u001b[39m(\u001b[38;5;28mself\u001b[39m, dataset_name: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m---> 21\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_connection\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mas\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mconnection\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mreturn\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mconnection\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mSELECT id, name, file_path, extension, mime_type, keywords FROM \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mdataset_name\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m.file_metadata;\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_df\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrecords\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Projects/cognee/cognee/infrastructure/databases/relational/duckdb/DuckDBAdapter.py:22\u001b[0m, in \u001b[0;36mDuckDBAdapter.get_files_metadata\u001b[0;34m(self, dataset_name)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_files_metadata\u001b[39m(\u001b[38;5;28mself\u001b[39m, dataset_name: \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_connection() \u001b[38;5;28;01mas\u001b[39;00m connection:\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mconnection\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mSELECT id, name, file_path, extension, mime_type, keywords FROM \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mdataset_name\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m.file_metadata;\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mto_df()\u001b[38;5;241m.\u001b[39mto_dict(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrecords\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mCatalogException\u001b[0m: Catalog Error: Table with name file_metadata does not exist!\nDid you mean \"code_staging.file_metadata\"?"
]
}
],
"source": [
"# NOTE(review): dead cell — kept only as an example of querying file\n",
"# metadata for a dataset. The recorded error output is stale: it was\n",
"# produced when this code last ran live and the 'short_stories' table\n",
"# did not exist in DuckDB. Safe to delete once no longer needed.\n",
"# from os import path\n",
"# import cognee\n",
"\n",
"# data_directory_path = path.abspath(\"../.data\")\n",
"# cognee.config.data_root_directory(data_directory_path)\n",
"\n",
"# cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
"# cognee.config.system_root_directory(cognee_directory_path)\n",
"\n",
"# print(cognee.datasets.list_datasets())\n",
"\n",
"# train_dataset = cognee.datasets.query_data(\"short_stories\")\n",
"# print(len(train_dataset))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "65bfaf09",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:NetworkXAdapter:File /Users/vasa/Projects/cognee/.cognee_system/databases/cognee_graph.pkl not found. Initializing an empty graph.WARNING:NetworkXAdapter:File /Users/vasa/Projects/cognee/.cognee_system/databases/cognee_graph.pkl not found. Initializing an empty graph.ERROR:root:Collection still not found. Creating collection again."
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "050895021b1a44cab961b00c590714ce",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing chunk (afe50dab-7232-4c41-9223-5164dc42c1ae) from document (d87dfe97f0d55afb9b3bf6cb14f8bb0f).\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "490c0404a0a34e4ba314deb2f85a5f97",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chunk (afe50dab-7232-4c41-9223-5164dc42c1ae) classified.\n",
"Chunk (afe50dab-7232-4c41-9223-5164dc42c1ae) summarized.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/selector_events.py:864: ResourceWarning: unclosed transport <_SelectorSocketTransport fd=88 read=idle write=<idle, bufsize=0>>\n",
" _warn(f\"unclosed transport {self!r}\", ResourceWarning, source=self)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n",
"/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/selector_events.py:864: ResourceWarning: unclosed transport <_SelectorSocketTransport fd=92 read=idle write=<idle, bufsize=0>>\n",
" _warn(f\"unclosed transport {self!r}\", ResourceWarning, source=self)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n",
"/Users/vasa/Projects/cognee/.venv/lib/python3.11/site-packages/pydantic/main.py:1096: PydanticDeprecatedSince20: The `parse_obj` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node properties: [('constructor_parameters', None), ('from_class', None)]\n",
"Node properties: [('parameters', ['s']), ('return_type', 'int'), ('is_static', False)]\n",
"Node properties: [('is_static', False), ('default_value', '{}')]\n",
"Node properties: [('is_static', False), ('default_value', '0')]\n",
"Node properties: [('is_static', False), ('default_value', '0')]\n",
"Node properties: [('is_static', False), ('default_value', None)]\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f1e021c7325f4fe099571713aa4cefed",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node properties: [('constructor_parameters', None), ('from_class', None)]\n",
"Node properties: [('parameters', ['s']), ('return_type', 'int'), ('is_static', False)]\n",
"Node properties: [('is_static', False), ('default_value', '{}')]\n",
"Node properties: [('is_static', False), ('default_value', '0')]\n",
"Node properties: [('is_static', False), ('default_value', '0')]\n",
"Node properties: [('is_static', False), ('default_value', None)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/selector_events.py:864: ResourceWarning: unclosed transport <_SelectorSocketTransport fd=93 read=idle write=<idle, bufsize=0>>\n",
" _warn(f\"unclosed transport {self!r}\", ResourceWarning, source=self)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9ee03487aeb94cae97d1b67ea8d239af",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chunk (afe50dab-7232-4c41-9223-5164dc42c1ae) cognified.\n"
]
},
{
"data": {
"text/plain": [
"<networkx.classes.multidigraph.MultiDiGraph at 0x1038f8a90>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from os import path\n",
"import logging\n",
"import cognee\n",
"from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
"\n",
"logging.basicConfig(level = logging.INFO)\n",
"\n",
"# Configure storage roots BEFORE pruning, so prune_system() targets the\n",
"# project's .cognee_system directory instead of the default location when\n",
"# this cell is run standalone. (Previously prune ran first and only worked\n",
"# because cell 1 had already configured the paths in kernel state.)\n",
"data_directory_path = path.abspath(\"../.data\")\n",
"cognee.config.data_root_directory(data_directory_path)\n",
"\n",
"cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
"cognee.config.system_root_directory(cognee_directory_path)\n",
"\n",
"# Use the source-code ontology for graph extraction.\n",
"cognee.config.set_graph_model(SourceCodeGraph)\n",
"\n",
"# Wipe any previous graph/vector state, then build the knowledge graph\n",
"# from the 'code' dataset ingested earlier.\n",
"await cognee.prune.prune_system()\n",
"\n",
"await cognee.cognify('code')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a514cf38",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Graph is visualized at: https://hub.graphistry.com/graph/graph.html?dataset=bcdf88d11a934508b58e6c7850e73fd3&type=arrow&viztoken=2823d6ff-3dd6-464c-b864-2dcb8f399d79&usertag=1daaf574-pygraphistry-0.33.7&splashAfter=1714419870&info=true\n"
]
}
],
"source": [
"import graphistry\n",
"from cognee.config import Config\n",
"from cognee.utils import render_graph\n",
"from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client, GraphDBType\n",
"\n",
"config = Config()\n",
"config.load()\n",
"\n",
"# Credentials come from cognee config — never hardcode them in the cell.\n",
"graphistry.register(\n",
"    api = 3,\n",
"    username = config.graphistry_username,\n",
"    password = config.graphistry_password\n",
")\n",
"\n",
"# Load the persisted NetworkX graph produced by the cognify step.\n",
"graph_client = await get_graph_client(GraphDBType.NETWORKX, \"cognee_graph.pkl\")\n",
"graph = graph_client.graph\n",
"\n",
"# Upload the graph to Graphistry and print the visualization URL.\n",
"await render_graph(graph)\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e916c484",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"graph MultiDiGraph with 45 nodes and 62 edges\n",
"summaries_and_ids [{'document_id': 'DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f', 'Summary': 'Longest Substring Without Repeating Characters'}]\n",
"enriched_query Chose the summary that is the most relevant to the query`Who are French girls?`\n",
"Here are the summaries:`[{&#39;document_id&#39;: &#39;DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f&#39;, &#39;Summary&#39;: &#39;Longest Substring Without Repeating Characters&#39;}]`\n",
"check_relevant_summary {'document_id': 'DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f', 'response_summary': 'Longest Substring Without Repeating Characters'}\n",
"connected_nodes ['DATA_LABEL_STRING', 'DATA_LABEL_LENGTH', 'DATA_LABEL_SUBSTRING', 'DATA_LABEL_ANSWER', 'DATA_LABEL_BBBBB', 'DATA_LABEL_PWWKEW', 'DATA_LABEL_NOTE', 'DATA_LABEL_PWKE', 'DATA_LABEL_SUBSEQUENCE', 'DATA_LABEL_CLASS', 'DATA_LABEL_OBJECT', 'DATA_LABEL_LENGTHOFLONGESTSUBSTRING', 'DATA_LABEL_TYPE', 'DATA_LABEL_RTYPE', 'DATA_LABEL_MAPSET', 'DATA_SUMMARY__DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f', 'DATA_DESCRIPTION__DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f', 'COGNITIVE_LAYER__SYNTAX_LAYER', 'COGNITIVE_LAYER__SEMANTIC_LAYER', 'COGNITIVE_LAYER__FUNCTIONAL_LAYER', 'COGNITIVE_LAYER__MODULE_INTERACTION_LAYER', 'COGNITIVE_LAYER__DATA_FLOW_LAYER', 'COGNITIVE_LAYER__CONTROL_FLOW_LAYER', 'COGNITIVE_LAYER__PERFORMANCE_LAYER', 'COGNITIVE_LAYER__SECURITY_LAYER', 'COGNITIVE_LAYER__DOCUMENTATION_AND_COMMENTS_LAYER', 'COGNITIVE_LAYER__CONVENTIONS_AND_STYLE_LAYER', 'COGNITIVE_LAYER__DEPENDENCY_AND_INTEGRATION_LAYER', 'COGNITIVE_LAYER__VERSION_CONTROL_AND_HISTORY_LAYER', 'COGNITIVE_LAYER__TEST_AND_VERIFICATION_LAYER', 'COGNITIVE_LAYER__LICENSE_AND_COPYRIGHT_LAYER', 'COGNITIVE_LAYER__PLATFORM_AND_ENVIRONMENT_LAYER']\n",
"descs {'DATA_LABEL_STRING': 'No desc available', 'DATA_LABEL_LENGTH': 'No desc available', 'DATA_LABEL_SUBSTRING': 'No desc available', 'DATA_LABEL_ANSWER': 'No desc available', 'DATA_LABEL_BBBBB': 'No desc available', 'DATA_LABEL_PWWKEW': 'No desc available', 'DATA_LABEL_NOTE': 'No desc available', 'DATA_LABEL_PWKE': 'No desc available', 'DATA_LABEL_SUBSEQUENCE': 'No desc available', 'DATA_LABEL_CLASS': 'No desc available', 'DATA_LABEL_OBJECT': 'No desc available', 'DATA_LABEL_LENGTHOFLONGESTSUBSTRING': 'No desc available', 'DATA_LABEL_TYPE': 'No desc available', 'DATA_LABEL_RTYPE': 'No desc available', 'DATA_LABEL_MAPSET': 'No desc available', 'DATA_SUMMARY__DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f': 'No desc available', 'DATA_DESCRIPTION__DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f': \"The task is to find the length of the longest substring in a given string that does not contain any repeating characters. Examples include finding such substrings in 'abcabcbb' (resulting in 'abc' with length 3), 'bbbbb' (resulting in 'b' with length 1), and 'pwwkew' (resulting in 'wke' with length 3), emphasizing that substrings are different from subsequences. The provided Python class 'Solution' includes a method 'lengthOfLongestSubstring' that uses a hash map to track characters and their indices, updating start and result variables to calculate the maximum length of such substrings without repetitions.\", 'COGNITIVE_LAYER__SYNTAX_LAYER': 'This layer deals with the syntactic structure of the source code including tokens, keywords, operators, and control structures, which are essential for understanding its grammatical correctness.', 'COGNITIVE_LAYER__SEMANTIC_LAYER': 'This layer addresses the meanings of individual instructions and the functions they perform within the code. 
It covers variable declarations, method calls, and data manipulations that carry semantic value.', 'COGNITIVE_LAYER__FUNCTIONAL_LAYER': 'This layer focuses on the algorithmic and logical aspects of the code, assessing how different components interact to fulfill designated tasks and solve specific problems.', 'COGNITIVE_LAYER__MODULE_INTERACTION_LAYER': 'Here, the analysis is on the interaction between different modules, functions, or classes in the source code, illustrating the architectural design and interdependencies.', 'COGNITIVE_LAYER__DATA_FLOW_LAYER': \"This layer examines how data is passed through the system, including variable scopes, parameter passing, and state management, which is crucial for understanding the program's behavior.\", 'COGNITIVE_LAYER__CONTROL_FLOW_LAYER': 'Examining the flow of execution throughout the code (e.g., loops, conditionals, and function calls), this layer is important for understanding the logic and potential execution paths in the program.', 'COGNITIVE_LAYER__PERFORMANCE_LAYER': 'This layer analyzes aspects of the code that affect its performance, such as complexity, optimization, potential bottlenecks, and resource management.', 'COGNITIVE_LAYER__SECURITY_LAYER': 'Focuses on identifying security-related aspects of source code, such as vulnerabilities, security controls, and adherence to secure coding practices.', 'COGNITIVE_LAYER__DOCUMENTATION_AND_COMMENTS_LAYER': \"Includes inline comments, docstrings, and external documentation that provide insights into the developer's intentions, explain complex pieces of code and specify APIs.\", 'COGNITIVE_LAYER__CONVENTIONS_AND_STYLE_LAYER': 'Encompasses coding standards, naming conventions, and formatting that contribute to code readability, maintainability, and consistency across a codebase.', 'COGNITIVE_LAYER__DEPENDENCY_AND_INTEGRATION_LAYER': 'Analyzes external libraries, components, or services that the code interacts with, both at the source level and through build and 
deployment processes.', 'COGNITIVE_LAYER__VERSION_CONTROL_AND_HISTORY_LAYER': 'Captures changes, version history, collaboration, and branch management within version control systems to understand the development and evolution of the codebase.', 'COGNITIVE_LAYER__TEST_AND_VERIFICATION_LAYER': \"This layer includes test scripts, test cases, and the overall testing strategy implemented to verify the code's functionality and robustness.\", 'COGNITIVE_LAYER__LICENSE_AND_COPYRIGHT_LAYER': 'Deals with legal aspects such as copyright notices, licensing information, and intellectual property concerns related to the source code.', 'COGNITIVE_LAYER__PLATFORM_AND_ENVIRONMENT_LAYER': 'Examines compatibility issues, target runtime environments, and platform-specific considerations that are important for code deployment and execution.'}\n",
"{'DATA_LABEL_STRING': 'No desc available', 'DATA_LABEL_LENGTH': 'No desc available', 'DATA_LABEL_SUBSTRING': 'No desc available', 'DATA_LABEL_ANSWER': 'No desc available', 'DATA_LABEL_BBBBB': 'No desc available', 'DATA_LABEL_PWWKEW': 'No desc available', 'DATA_LABEL_NOTE': 'No desc available', 'DATA_LABEL_PWKE': 'No desc available', 'DATA_LABEL_SUBSEQUENCE': 'No desc available', 'DATA_LABEL_CLASS': 'No desc available', 'DATA_LABEL_OBJECT': 'No desc available', 'DATA_LABEL_LENGTHOFLONGESTSUBSTRING': 'No desc available', 'DATA_LABEL_TYPE': 'No desc available', 'DATA_LABEL_RTYPE': 'No desc available', 'DATA_LABEL_MAPSET': 'No desc available', 'DATA_SUMMARY__DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f': 'No desc available', 'DATA_DESCRIPTION__DOCUMENT__d87dfe97f0d55afb9b3bf6cb14f8bb0f': \"The task is to find the length of the longest substring in a given string that does not contain any repeating characters. Examples include finding such substrings in 'abcabcbb' (resulting in 'abc' with length 3), 'bbbbb' (resulting in 'b' with length 1), and 'pwwkew' (resulting in 'wke' with length 3), emphasizing that substrings are different from subsequences. The provided Python class 'Solution' includes a method 'lengthOfLongestSubstring' that uses a hash map to track characters and their indices, updating start and result variables to calculate the maximum length of such substrings without repetitions.\", 'COGNITIVE_LAYER__SYNTAX_LAYER': 'This layer deals with the syntactic structure of the source code including tokens, keywords, operators, and control structures, which are essential for understanding its grammatical correctness.', 'COGNITIVE_LAYER__SEMANTIC_LAYER': 'This layer addresses the meanings of individual instructions and the functions they perform within the code. 
It covers variable declarations, method calls, and data manipulations that carry semantic value.', 'COGNITIVE_LAYER__FUNCTIONAL_LAYER': 'This layer focuses on the algorithmic and logical aspects of the code, assessing how different components interact to fulfill designated tasks and solve specific problems.', 'COGNITIVE_LAYER__MODULE_INTERACTION_LAYER': 'Here, the analysis is on the interaction between different modules, functions, or classes in the source code, illustrating the architectural design and interdependencies.', 'COGNITIVE_LAYER__DATA_FLOW_LAYER': \"This layer examines how data is passed through the system, including variable scopes, parameter passing, and state management, which is crucial for understanding the program's behavior.\", 'COGNITIVE_LAYER__CONTROL_FLOW_LAYER': 'Examining the flow of execution throughout the code (e.g., loops, conditionals, and function calls), this layer is important for understanding the logic and potential execution paths in the program.', 'COGNITIVE_LAYER__PERFORMANCE_LAYER': 'This layer analyzes aspects of the code that affect its performance, such as complexity, optimization, potential bottlenecks, and resource management.', 'COGNITIVE_LAYER__SECURITY_LAYER': 'Focuses on identifying security-related aspects of source code, such as vulnerabilities, security controls, and adherence to secure coding practices.', 'COGNITIVE_LAYER__DOCUMENTATION_AND_COMMENTS_LAYER': \"Includes inline comments, docstrings, and external documentation that provide insights into the developer's intentions, explain complex pieces of code and specify APIs.\", 'COGNITIVE_LAYER__CONVENTIONS_AND_STYLE_LAYER': 'Encompasses coding standards, naming conventions, and formatting that contribute to code readability, maintainability, and consistency across a codebase.', 'COGNITIVE_LAYER__DEPENDENCY_AND_INTEGRATION_LAYER': 'Analyzes external libraries, components, or services that the code interacts with, both at the source level and through build and 
deployment processes.', 'COGNITIVE_LAYER__VERSION_CONTROL_AND_HISTORY_LAYER': 'Captures changes, version history, collaboration, and branch management within version control systems to understand the development and evolution of the codebase.', 'COGNITIVE_LAYER__TEST_AND_VERIFICATION_LAYER': \"This layer includes test scripts, test cases, and the overall testing strategy implemented to verify the code's functionality and robustness.\", 'COGNITIVE_LAYER__LICENSE_AND_COPYRIGHT_LAYER': 'Deals with legal aspects such as copyright notices, licensing information, and intellectual property concerns related to the source code.', 'COGNITIVE_LAYER__PLATFORM_AND_ENVIRONMENT_LAYER': 'Examines compatibility issues, target runtime environments, and platform-specific considerations that are important for code deployment and execution.'}\n"
]
}
],
"source": [
"from os import path\n",
"import cognee\n",
"from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client, GraphDBType\n",
"from cognee.modules.search.graph.search_summary import search_summary\n",
"\n",
"data_directory_path = path.abspath(\"../.data\")\n",
"cognee.config.data_root_directory(data_directory_path)\n",
"\n",
"cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
"cognee.config.system_root_directory(cognee_directory_path)\n",
"\n",
"# Load the persisted NetworkX graph built by the cognify step.\n",
"graph_client = await get_graph_client(GraphDBType.NETWORX if False else GraphDBType.NETWORKX)\n",
"graph = graph_client.graph\n",
"\n",
"# Deliberately off-topic query: exercises how summary search behaves when\n",
"# nothing in the graph is relevant to the question.\n",
"results = await search_summary(\"Who are French girls?\", graph)\n",
"\n",
"print(results)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b2ffa34a-bd42-4556-807d-c32ff82479f3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'Conventions and Style Layer',\n",
" 'description': 'Encompasses coding standards, naming conventions, and formatting that contribute to code readability, maintainability, and consistency across a codebase.'}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the attributes stored on a single graph node — the recorded\n",
"# output shows this node carries a 'name' and a 'description'.\n",
"graph.nodes['COGNITIVE_LAYER__CONVENTIONS_AND_STYLE_LAYER']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "59a56a97-051e-4f49-b1e5-985748e057ad",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'results' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Assuming connected_nodes is a list of node IDs you obtained from graph.neighbors()\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Safely fetch summaries, providing a default if 'summary' is not available\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m descriptions \u001b[38;5;241m=\u001b[39m {node: graph\u001b[38;5;241m.\u001b[39mnodes[node]\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msummary\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNo summary available\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m node \u001b[38;5;129;01min\u001b[39;00m \u001b[43mresults\u001b[49m}\n",
"\u001b[0;31mNameError\u001b[0m: name 'results' is not defined"
]
}
],
"source": [
"# Assuming connected_nodes is a list of node IDs you obtained from graph.neighbors()\n",
"\n",
"# NOTE(review): `connected_nodes` is not defined anywhere in this notebook —\n",
"# this cell only works with leftover kernel state (the saved traceback shows\n",
"# it failing on a fresh run, against a stale variant that used `results`).\n",
"# Define connected_nodes (e.g. list(graph.neighbors(<node_id>))) before\n",
"# running, or delete this cell.\n",
"\n",
"# Safely fetch summaries, providing a default if 'summary' is not available\n",
"descriptions = {node: graph.nodes[node].get('summary', 'No summary available') for node in connected_nodes}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "497f448c-afc1-4b7e-814f-1ebf55fe510c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}