fix: notebooks errors (#565)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **New Features**
- Automatically creates a blank graph when a file isn’t found, ensuring
smoother operations.
- Updated demonstration notebooks with dynamic configurations, including
refined search operations and input prompts.
- Introduced optional support for additional graph functionalities via
an integrated dependency.

- **Refactor**
- Streamlined processing by eliminating duplicate steps and simplifying
graph rendering workflows.

- **Chores**
- Updated environment configurations and upgraded the Python runtime for
improved performance and consistency.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Boris 2025-02-19 23:07:11 +01:00 committed by GitHub
parent 811e932cae
commit 45f7c63322
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 422 additions and 451 deletions

View file

@ -243,6 +243,15 @@ class NetworkXAdapter(GraphDBInterface):
await self.save_graph_to_file(self.filename)
async def create_empty_graph(self, file_path: str) -> None:
self.graph = nx.MultiDiGraph()
file_dir = os.path.dirname(file_path)
if not os.path.exists(file_dir):
os.makedirs(file_dir, exist_ok=True)
await self.save_graph_to_file(file_path)
async def save_graph_to_file(self, file_path: str = None) -> None:
"""Asynchronously save the graph to a file in JSON format."""
if not file_path:
@ -322,19 +331,12 @@ class NetworkXAdapter(GraphDBInterface):
else:
# Log that the file does not exist and an empty graph is initialized
logger.warning("File %s not found. Initializing an empty graph.", file_path)
self.graph = (
nx.MultiDiGraph()
) # Use MultiDiGraph to keep it consistent with __init__
await self.create_empty_graph(file_path)
file_dir = os.path.dirname(file_path)
if not os.path.exists(file_dir):
os.makedirs(file_dir, exist_ok=True)
await self.save_graph_to_file(file_path)
except Exception as e:
except Exception:
logger.error("Failed to load graph from file: %s", file_path)
raise e
await self.create_empty_graph(file_path)
async def delete_graph(self, file_path: str = None):
"""Asynchronously delete the graph file from the filesystem."""

View file

@ -9,7 +9,8 @@
"import os\n",
"\n",
"os.environ[\"GRAPHISTRY_USERNAME\"] = input(\"Please enter your graphistry username\")\n",
"os.environ[\"GRAPHISTRY_PASSWORD\"] = input(\"Please enter your graphistry password\")"
"os.environ[\"GRAPHISTRY_PASSWORD\"] = input(\"Please enter your graphistry password\")\n",
"os.environ[\"OPENAI_API_KEY\"] = input(\"Please enter your OpenAI API key\")"
]
},
{
@ -70,19 +71,16 @@
"outputs": [],
"source": [
"from cognee.tasks.repo_processor import (\n",
" enrich_dependency_graph,\n",
" expand_dependency_graph,\n",
" get_repo_file_dependencies,\n",
")\n",
"from cognee.tasks.storage import add_data_points\n",
"from cognee.modules.pipelines.tasks.Task import Task\n",
"\n",
"detailed_extraction = True\n",
"\n",
"tasks = [\n",
" Task(get_repo_file_dependencies),\n",
" Task(add_data_points, task_config={\"batch_size\": 50}),\n",
" Task(enrich_dependency_graph, task_config={\"batch_size\": 50}),\n",
" Task(expand_dependency_graph, task_config={\"batch_size\": 50}),\n",
" Task(add_data_points, task_config={\"batch_size\": 50}),\n",
" Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction),\n",
" Task(add_data_points, task_config={\"batch_size\": 100 if detailed_extraction else 500}),\n",
"]"
]
},
@ -95,9 +93,6 @@
"from cognee.modules.pipelines import run_tasks\n",
"from uuid import uuid5, NAMESPACE_OID\n",
"\n",
"notebook_path = os.path.abspath(\"\")\n",
"repo_clone_location = os.path.join(notebook_path, \".data/graphrag\")\n",
"\n",
"pipeline = run_tasks(tasks, uuid5(NAMESPACE_OID, repo_clone_location), repo_clone_location, \"code_graph_pipeline\")\n",
"\n",
"async for result in pipeline:\n",
@ -112,7 +107,7 @@
"source": [
"from cognee.shared.utils import render_graph\n",
"\n",
"await render_graph(None, include_nodes=True, include_labels=True)"
"await render_graph()"
]
},
{
@ -128,46 +123,11 @@
"metadata": {},
"outputs": [],
"source": [
"from evals.eval_on_hotpot import eval_on_hotpotQA\n",
"from evals.eval_on_hotpot import answer_with_cognee\n",
"from evals.eval_on_hotpot import answer_without_cognee\n",
"from evals.eval_on_hotpot import eval_answers\n",
"from cognee.base_config import get_base_config\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import wget\n",
"import json\n",
"import statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"answer_provider = answer_with_cognee # For native LLM answers use answer_without_cognee\n",
"num_samples = 10 # With cognee, it takes ~1m10s per sample\n",
"from cognee import search, SearchType\n",
"\n",
"base_config = get_base_config()\n",
"data_root_dir = base_config.data_root_directory\n",
"results = await search(query_type=SearchType.CODE, query_text=\"def create_graphrag_config\")\n",
"\n",
"if not Path(data_root_dir).exists():\n",
" Path(data_root_dir).mkdir()\n",
"\n",
"filepath = data_root_dir / Path(\"hotpot_dev_fullwiki_v1.json\")\n",
"if not filepath.exists():\n",
" url = \"http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json\"\n",
" wget.download(url, out=data_root_dir)\n",
"\n",
"with open(filepath, \"r\") as file:\n",
" dataset = json.load(file)\n",
"\n",
"instances = dataset if not num_samples else dataset[:num_samples]\n",
"answers = []\n",
"for instance in tqdm(instances, desc=\"Getting answers\"):\n",
" answer = answer_provider(instance)\n",
" answers.append(answer)"
"print(results)\n"
]
}
],

View file

@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-15T10:43:57.893763Z",
@ -73,22 +73,15 @@
"if \"OPENAI_API_KEY\" not in os.environ:\n",
" os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
"# Graphiti integration is only tested with neo4j + pgvector + postgres for now\n",
"GRAPH_DATABASE_PROVIDER = \"neo4j\"\n",
"GRAPH_DATABASE_URL = \"bolt://localhost:7687\"\n",
"GRAPH_DATABASE_USERNAME = \"neo4j\"\n",
"GRAPH_DATABASE_PASSWORD = \"pleaseletmein\"\n",
"GRAPH_DATABASE_URL = \"bolt://localhost:7687\"\n",
"\n",
"os.environ[\"VECTOR_DB_PROVIDER\"] = \"pgvector\"\n",
"\n",
"os.environ[\"DB_PROVIDER\"] = \"postgres\"\n",
"\n",
"os.environ[\"DB_NAME\"] = \"cognee_db\"\n",
"\n",
"os.environ[\"DB_HOST\"] = \"127.0.0.1\"\n",
"os.environ[\"DB_PORT\"] = \"5432\"\n",
"os.environ[\"DB_USERNAME\"] = \"cognee\"\n",
"os.environ[\"DB_PASSWORD\"] = \"cognee\""
"os.environ[\"GRAPH_DATABASE_PROVIDER\"] = GRAPH_DATABASE_PROVIDER\n",
"os.environ[\"GRAPH_DATABASE_USERNAME\"] = GRAPH_DATABASE_USERNAME\n",
"os.environ[\"GRAPH_DATABASE_PASSWORD\"] = GRAPH_DATABASE_PASSWORD\n",
"os.environ[\"GRAPH_DATABASE_URL\"] = GRAPH_DATABASE_URL\n"
]
},
{
@ -100,7 +93,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-15T10:43:57.928664Z",
@ -125,36 +118,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-15T10:44:25.008501Z",
"start_time": "2025-01-15T10:43:57.932240Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Database deleted successfully.\n",
"Database deleted successfully.\n",
"User d3b51a32-38e1-4fe5-8270-6dc1d6ebfdf0 has registered.\n",
"Pipeline file_load_from_filesystem load step completed in 0.10 seconds\n",
"1 load package(s) were loaded to destination sqlalchemy and into dataset public\n",
"The sqlalchemy destination used postgresql://cognee:***@127.0.0.1:5432/cognee_db location to store data\n",
"Load package 1736937839.7739599 is LOADED and contains no failed jobs\n",
"Pipeline file_load_from_filesystem load step completed in 0.06 seconds\n",
"1 load package(s) were loaded to destination sqlalchemy and into dataset public\n",
"The sqlalchemy destination used postgresql://cognee:***@127.0.0.1:5432/cognee_db location to store data\n",
"Load package 1736937841.8467042 is LOADED and contains no failed jobs\n",
"Graph database initialized.\n",
"Added text: Kamala Harris is the Attorney Gener...\n",
"Added text: As AG, Harris was in office from Ja...\n",
"✅ Result Processed: <graphiti_core.graphiti.Graphiti object at 0x326fe0ce0>\n"
]
}
],
"outputs": [],
"source": [
"# 🔧 Setting Up Logging to Suppress Errors\n",
"setup_logging(logging.ERROR) # Keeping logs clean and focused\n",
@ -202,15 +173,7 @@
"start_time": "2025-01-15T10:44:25.013325Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"💡 Answer: Kamala Harris was in office as Attorney General of California from January 3, 2011, to January 3, 2017.\n"
]
}
],
"outputs": [],
"source": [
"# Step 1: Formulating the Query 🔍\n",
"query = \"When was Kamala Harris in office?\"\n",
@ -260,7 +223,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.11.8"
}
},
"nbformat": 4,

728
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -86,6 +86,7 @@ plotly = {version = "^6.0.0", optional = true}
gdown = {version = "^5.2.0", optional = true}
pyside6 = {version = "^6.8.2.1", optional = true}
qasync = {version = "^0.27.1", optional = true}
graphiti-core = {version = "^0.7.0", optional = true}
[tool.poetry.extras]
@ -109,6 +110,7 @@ docs = ["unstructured"]
codegraph = ["fastembed", "tree-sitter", "tree-sitter-python"]
evals = ["plotly", "gdown"]
gui = ["pyside6", "qasync"]
graphiti = ["graphiti-core"]
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"