fix: Fix Jupyter Notebook (#142)

* fix: resolve issue with dlt sqlalchemy usage

Cognee's database configuration was not handled properly by dlt; a new dlt handler
module was added to handle database configuration propagation.

* fix: resolve issue with jupyter notebook

The notebook used the old cognee add API; it has been updated to match the
current cognee add function, which no longer returns output.
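For reference, a minimal sketch of the notebook change (job_1 … job_position are variables defined earlier in the notebook):

# Old notebook usage: cognee.add returned results to iterate over.
# results = await cognee.add([job_1, job_2, job_3, job_4, job_5, job_position], "example")
# for result in results:
#     print(result)

# Current usage: cognee.add returns nothing, so the call is simply awaited.
await cognee.add([job_1, job_2, job_3, job_4, job_5, job_position], "example")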

* fix: Remove empty DB_PATH argument from .env.template

An empty DB_PATH value in the .env file overrides the default path cognee is meant to use.
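A minimal sketch of why the empty value is a problem (the default path here is hypothetical; cognee's actual config loading may differ): once the loaded .env sets DB_PATH to an empty string, the variable is present but empty, so lookups with a fallback never see the default.

import os

# Simulate what a loaded .env containing "DB_PATH=" produces:
os.environ["DB_PATH"] = ""

# The fallback default only applies when the variable is unset, not when it is empty.
db_path = os.getenv("DB_PATH", "/hypothetical/default/path")
print(repr(db_path))  # '' — the empty override wins over the default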

---------
Igor Ilic 2024-10-07 12:58:54 +02:00, committed by GitHub
parent 2e70c23de8
commit fcd60861ba
6 changed files with 77 additions and 123 deletions

.env.template

@@ -18,8 +18,14 @@ VECTOR_ENGINE_PROVIDER="qdrant" # or "weaviate" or "lancedb"
 VECTOR_DB_URL=
 VECTOR_DB_KEY=
 # Database provider
 DB_PROVIDER="sqlite" # or "postgres"
 # Database name
 DB_NAME=cognee_db
+# Postgres specific parameters (only needed when Postgres is used)
+DB_HOST=127.0.0.1
+DB_PORT=5432
+DB_USERNAME=cognee
+DB_PASSWORD=cognee
+DB_DATABASE=cognee_db
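As a rough sketch of how these Postgres parameters combine into a connection URL (using SQLAlchemy's URL.create; note that dlt and SQLAlchemy expect the drivername "postgresql", not "postgres" — see the new handler module below):

from sqlalchemy.engine import URL

# The Postgres parameters from .env.template, assembled into a SQLAlchemy URL.
url = URL.create(
    drivername = "postgresql",  # "postgres" is not an accepted drivername
    username = "cognee",
    password = "cognee",
    host = "127.0.0.1",
    port = 5432,
    database = "cognee_db",
)
print(url)  # postgresql://cognee:cognee@127.0.0.1:5432/cognee_db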

cognee add module (add_files)

@@ -8,8 +8,9 @@ from cognee.infrastructure.files.storage import LocalStorage
 from cognee.modules.ingestion import get_matched_datasets, save_data_to_file
 from cognee.shared.utils import send_telemetry
 from cognee.base_config import get_base_config
-from cognee.infrastructure.databases.relational import get_relational_config, get_relational_engine, create_db_and_tables
+from cognee.infrastructure.databases.relational import get_relational_engine, create_db_and_tables
 from cognee.modules.users.methods import get_default_user
+from cognee.tasks.ingestion import get_dlt_destination
 from cognee.modules.users.permissions.methods import give_permission_on_document
 from cognee.modules.users.models import User
 from cognee.modules.data.methods import create_dataset
@@ -78,18 +79,7 @@ async def add_files(file_paths: List[str], dataset_name: str, user: User = None):
         else:
             processed_file_paths.append(file_path)

-    relational_config = get_relational_config()
-    destination = dlt.destinations.sqlalchemy(
-        credentials = {
-            "host": relational_config.db_host,
-            "port": relational_config.db_port,
-            "username": relational_config.db_username,
-            "password": relational_config.db_password,
-            "database": relational_config.db_name,
-            "drivername": relational_config.db_provider,
-        },
-    )
+    destination = get_dlt_destination()

     pipeline = dlt.pipeline(
         pipeline_name = "file_load_from_filesystem",

relational UUID type decorator

@@ -38,6 +38,8 @@ class UUID(TypeDecorator):
         if value is None:
             return value
         if dialect.name == 'postgresql':
+            if isinstance(value, uuid.UUID):
+                return value
             return uuid.UUID(value)
         else:
             return uuid.UUID(bytes = value)
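The added isinstance guard matters because uuid.UUID() expects a hex string; feeding it an already-constructed UUID raises. A minimal standalone sketch of the failure the check avoids:

import uuid

value = uuid.uuid4()

# Without the guard, re-wrapping an existing UUID fails, since the
# constructor tries string operations on its argument.
try:
    uuid.UUID(value)
except (AttributeError, TypeError) as error:
    print(f"re-wrapping failed: {error}")

# The guarded version passes UUID instances through and only parses strings.
result = value if isinstance(value, uuid.UUID) else uuid.UUID(value)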

cognee/tasks/ingestion/get_dlt_destination.py (new file)

@@ -0,0 +1,41 @@
import os
import dlt
from typing import Union

from cognee.infrastructure.databases.relational import get_relational_config

def get_dlt_destination() -> Union[type[dlt.destinations.sqlalchemy], None]:
    """
    Handles propagation of the cognee database configuration to the dlt library.

    Returns:
        sqlalchemy: the sqlalchemy destination used by the dlt library
    """
    relational_config = get_relational_config()

    if relational_config.db_provider == "sqlite":
        # When sqlite is the database provider, hostname, port, username and
        # password must not be forwarded; the database is located by joining
        # the path location with the database name.
        destination = dlt.destinations.sqlalchemy(
            credentials = {
                "database": os.path.join(relational_config.db_path, relational_config.db_name),
                "drivername": relational_config.db_provider,
            },
        )
    elif relational_config.db_provider == "postgres":
        # dlt doesn't accept "postgres" as the drivername; it only accepts "postgresql".
        destination = dlt.destinations.sqlalchemy(
            credentials = {
                "host": relational_config.db_host,
                "port": relational_config.db_port,
                "username": relational_config.db_username,
                "password": relational_config.db_password,
                "database": relational_config.db_name,
                "drivername": "postgresql",
            },
        )
    else:
        destination = None

    return destination
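A sketch of how the new helper plugs into the ingestion pipelines (mirroring the call sites changed below; dlt.pipeline accepts a destination directly):

import dlt
from cognee.tasks.ingestion import get_dlt_destination

# None is returned for unsupported providers, so callers may want to check it.
destination = get_dlt_destination()

pipeline = dlt.pipeline(
    pipeline_name = "file_load_from_filesystem",
    destination = destination,
)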

ingest_data task (cognee/tasks/ingestion)

@@ -6,20 +6,10 @@ from cognee.modules.users.models import User
 from cognee.infrastructure.databases.relational import get_relational_config, get_relational_engine
 from cognee.modules.data.methods import create_dataset
 from cognee.modules.users.permissions.methods import give_permission_on_document
+from .get_dlt_destination import get_dlt_destination

 async def ingest_data(file_paths: list[str], dataset_name: str, user: User):
-    relational_config = get_relational_config()
-    destination = dlt.destinations.sqlalchemy(
-        credentials = {
-            "host": relational_config.db_host,
-            "port": relational_config.db_port,
-            "username": relational_config.db_username,
-            "password": relational_config.db_password,
-            "database": relational_config.db_name,
-            "drivername": relational_config.db_provider,
-        },
-    )
+    destination = get_dlt_destination()

     pipeline = dlt.pipeline(
         pipeline_name = "file_load_from_filesystem",

example Jupyter notebook (.ipynb)

@@ -1,44 +1,5 @@
 {
 "cells": [
-{
-"cell_type": "code",
-"execution_count": 1,
-"id": "958375a6ffc0c2e4",
-"metadata": {
-"ExecuteTime": {
-"end_time": "2024-09-20T14:02:47.336283Z",
-"start_time": "2024-09-20T14:02:43.652444Z"
-}
-},
-"outputs": [],
-"source": [
-"import asyncio\n",
-"import logging\n",
-"from typing import Union\n",
-"\n",
-"from cognee.modules.cognify.config import get_cognify_config\n",
-"from cognee.shared.data_models import KnowledgeGraph\n",
-"from cognee.modules.data.models import Dataset, Data\n",
-"from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n",
-"from cognee.modules.data.methods import get_datasets, get_datasets_by_name\n",
-"from cognee.modules.pipelines.tasks.Task import Task\n",
-"from cognee.modules.pipelines import run_tasks, run_tasks_parallel\n",
-"from cognee.modules.users.models import User\n",
-"from cognee.modules.users.methods import get_default_user\n",
-"from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status\n",
-"from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status\n",
-"from cognee.tasks import chunk_extract_summary, \\\n",
-" chunk_naive_llm_classifier, \\\n",
-" chunk_remove_disconnected, \\\n",
-" infer_data_ontology, \\\n",
-" save_chunks_to_store, \\\n",
-" chunk_update_check, \\\n",
-" chunks_into_graph, \\\n",
-" source_documents_to_chunks, \\\n",
-" check_permissions_on_documents, \\\n",
-" classify_documents"
-]
-},
 {
 "cell_type": "code",
 "execution_count": 2,
@@ -308,64 +269,8 @@
 "outputs": [],
 "source": [
 "import cognee\n",
-"from os import listdir, path\n",
-"\n",
-"data_path = path.abspath(\".data\")\n",
-"\n",
-"results = await cognee.add([job_1, job_2,job_3,job_4,job_5,job_position], \"example\")\n",
-"\n",
-"for result in results:\n",
-" print(result)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 8,
-"id": "6f9b564de121713d",
-"metadata": {
-"ExecuteTime": {
-"end_time": "2024-09-20T14:02:55.564445Z",
-"start_time": "2024-09-20T14:02:55.562784Z"
-}
-},
-"outputs": [],
-"source": []
-},
-{
-"cell_type": "code",
-"execution_count": 9,
-"id": "8911f8bd4f8c440a",
-"metadata": {
-"ExecuteTime": {
-"end_time": "2024-09-20T14:02:56.714408Z",
-"start_time": "2024-09-20T14:02:56.711812Z"
-}
-},
-"outputs": [],
-"source": [
-"# from enum import Enum, auto\n",
-"# from typing import Optional, List, Union, Dict, Any\n",
-"# from pydantic import BaseModel, Field\n",
-"# \n",
-"# class Node(BaseModel):\n",
-"# \"\"\"Node in a knowledge graph.\"\"\"\n",
-"# id: str\n",
-"# name: str\n",
-"# type: str\n",
-"# description: str\n",
-"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the node.\")\n",
-"# \n",
-"# class Edge(BaseModel):\n",
-"# \"\"\"Edge in a knowledge graph.\"\"\"\n",
-"# source_node_id: str\n",
-"# target_node_id: str\n",
-"# relationship_name: str\n",
-"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the edge.\")\n",
-"# \n",
-"# class KnowledgeGraph(BaseModel):\n",
-"# \"\"\"Knowledge graph.\"\"\"\n",
-"# nodes: List[Node] = Field(..., default_factory=list)\n",
-"# edges: List[Edge] = Field(..., default_factory=list)"
+"await cognee.add([job_1, job_2, job_3, job_4, job_5, job_position], \"example\")\n"
 ]
 },
{
@@ -380,6 +285,21 @@
 },
 "outputs": [],
 "source": [
+"from cognee.shared.data_models import KnowledgeGraph\n",
+"from cognee.modules.data.models import Dataset, Data\n",
+"from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n",
+"from cognee.modules.pipelines.tasks.Task import Task\n",
+"from cognee.modules.pipelines import run_tasks\n",
+"from cognee.modules.users.models import User\n",
+"from cognee.tasks import chunk_remove_disconnected, \\\n",
+" infer_data_ontology, \\\n",
+" save_chunks_to_store, \\\n",
+" chunk_update_check, \\\n",
+" chunks_into_graph, \\\n",
+" source_documents_to_chunks, \\\n",
+" check_permissions_on_documents, \\\n",
+" classify_documents\n",
+"\n",
 "async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
 " data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n",
 "\n",
@@ -421,8 +341,13 @@
 },
 "outputs": [],
 "source": [
+"from cognee.modules.users.methods import get_default_user\n",
+"from cognee.modules.data.methods import get_datasets_by_name\n",
+"\n",
 "user = await get_default_user()\n",
+"\n",
 "datasets = await get_datasets_by_name([\"example\"], user.id)\n",
+"\n",
 "await run_cognify_pipeline(datasets[0], user)"
 ]
 },
@@ -490,7 +415,7 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 3 (ipykernel)",
+"display_name": ".venv",
 "language": "python",
 "name": "python3"
 },