feat: add relational db migration and dlt example (#843)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com> Co-authored-by: Igor Ilic <igorilic03@gmail.com> Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
This commit is contained in:
parent
e0aa7c9a61
commit
d720a4dbb2
3 changed files with 1200 additions and 732 deletions
290
examples/relational_db_with_dlt/fix_foreign_keys.sql
Normal file
290
examples/relational_db_with_dlt/fix_foreign_keys.sql
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
BEGIN TRANSACTION;
|
||||
|
||||
PRAGMA foreign_keys=OFF;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
1) pokemon_list
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_list (
|
||||
name TEXT,
|
||||
url TEXT,
|
||||
_dlt_load_id TEXT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY
|
||||
);
|
||||
INSERT INTO new_pokemon_list
|
||||
SELECT name, url, _dlt_load_id, _dlt_id
|
||||
FROM pokemon_list;
|
||||
DROP TABLE pokemon_list;
|
||||
ALTER TABLE new_pokemon_list RENAME TO pokemon_list;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
2) pokemon_details (Parent for most child tables)
|
||||
_dlt_id is a PRIMARY KEY so children can FK to it.
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details (
|
||||
base_experience BIGINT,
|
||||
height BIGINT,
|
||||
id BIGINT,
|
||||
is_default BOOLEAN,
|
||||
name TEXT,
|
||||
"order" BIGINT,
|
||||
species__name,
|
||||
weight BIGINT,
|
||||
_dlt_load_id TEXT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY
|
||||
);
|
||||
INSERT INTO new_pokemon_details
|
||||
SELECT base_experience,
|
||||
height,
|
||||
id,
|
||||
is_default,
|
||||
name,
|
||||
"order",
|
||||
species__name,
|
||||
weight,
|
||||
_dlt_load_id,
|
||||
_dlt_id
|
||||
FROM pokemon_details;
|
||||
DROP TABLE pokemon_details;
|
||||
ALTER TABLE new_pokemon_details RENAME TO pokemon_details;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
3) pokemon_details_abilities (Child)
|
||||
Foreign key from _dlt_parent_id → pokemon_details(_dlt_id)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details__abilities (
|
||||
ability__name TEXT,
|
||||
ability__url TEXT,
|
||||
is_hidden BOOLEAN,
|
||||
slot BIGINT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
CONSTRAINT fk_abilities
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details__abilities
|
||||
SELECT ability__name,
|
||||
ability__url,
|
||||
is_hidden,
|
||||
slot,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id
|
||||
FROM pokemon_details__abilities;
|
||||
DROP TABLE pokemon_details__abilities;
|
||||
ALTER TABLE new_pokemon_details__abilities RENAME TO pokemon_details__abilities;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
4) pokemon_details_forms (Child)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details_forms (
|
||||
name TEXT,
|
||||
url TEXT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details_forms
|
||||
SELECT name,
|
||||
url,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id
|
||||
FROM pokemon_details__forms;
|
||||
DROP TABLE pokemon_details__forms;
|
||||
ALTER TABLE new_pokemon_details_forms RENAME TO pokemon_details__forms;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
5) pokemon_details_game_indices (Child)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details__game_indices (
|
||||
game_index BIGINT,
|
||||
version__name TEXT,
|
||||
version__url TEXT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details__game_indices
|
||||
SELECT game_index,
|
||||
version__name,
|
||||
version__url,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id
|
||||
FROM pokemon_details__game_indices;
|
||||
DROP TABLE pokemon_details__game_indices;
|
||||
ALTER TABLE new_pokemon_details__game_indices RENAME TO pokemon_details__game_indices;
|
||||
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
6) pokemon_details_moves (Child of pokemon_details)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details__moves (
|
||||
move__name TEXT,
|
||||
move__url TEXT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details__moves
|
||||
SELECT move__name,
|
||||
move__url,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id
|
||||
FROM pokemon_details__moves;
|
||||
DROP TABLE pokemon_details__moves;
|
||||
ALTER TABLE new_pokemon_details__moves RENAME TO pokemon_details__moves;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
7) pokemon_details_moves_version_group_details (Child of pokemon_details_moves)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details__moves__version_group_details (
|
||||
level_learned_at BIGINT,
|
||||
version_group__name TEXT,
|
||||
version_group__url TEXT,
|
||||
move_learn_method__name TEXT,
|
||||
move_learn_method__url TEXT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
"order" BIGINT,
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details__moves(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details__moves__version_group_details
|
||||
SELECT level_learned_at,
|
||||
version_group__name,
|
||||
version_group__url,
|
||||
move_learn_method__name,
|
||||
move_learn_method__url,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id,
|
||||
"order"
|
||||
FROM pokemon_details__moves__version_group_details;
|
||||
DROP TABLE pokemon_details__moves__version_group_details;
|
||||
ALTER TABLE new_pokemon_details__moves__version_group_details
|
||||
RENAME TO pokemon_details__moves__version_group_details;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
8) pokemon_details_past_abilities (Child of pokemon_details)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details__past_abilities (
|
||||
generation__name TEXT,
|
||||
generation__url TEXT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details__past_abilities
|
||||
SELECT generation__name,
|
||||
generation__url,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id
|
||||
FROM pokemon_details__past_abilities;
|
||||
DROP TABLE pokemon_details__past_abilities;
|
||||
ALTER TABLE new_pokemon_details__past_abilities
|
||||
RENAME TO pokemon_details__past_abilities;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
9) pokemon_details_stats (Child of pokemon_details)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details_stats (
|
||||
base_stat BIGINT,
|
||||
effort BIGINT,
|
||||
stat__name TEXT,
|
||||
stat__url TEXT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details_stats
|
||||
SELECT base_stat,
|
||||
effort,
|
||||
stat__name,
|
||||
stat__url,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id
|
||||
FROM pokemon_details__stats;
|
||||
DROP TABLE pokemon_details__stats;
|
||||
ALTER TABLE new_pokemon_details_stats
|
||||
RENAME TO pokemon_details__stats;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
10) pokemon_details_types (Child of pokemon_details)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details__types (
|
||||
slot BIGINT,
|
||||
type__name TEXT,
|
||||
type__url TEXT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details__types
|
||||
SELECT slot,
|
||||
type__name,
|
||||
type__url,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id
|
||||
FROM pokemon_details__types;
|
||||
DROP TABLE pokemon_details__types;
|
||||
ALTER TABLE new_pokemon_details__types
|
||||
RENAME TO pokemon_details__types;
|
||||
|
||||
/*------------------------------------------------------------------------
|
||||
11) pokemon_details_past_abilities_abilities (Child of pokemon_details_past_abilities)
|
||||
------------------------------------------------------------------------*/
|
||||
CREATE TABLE IF NOT EXISTS new_pokemon_details__past_abilities__abilities (
|
||||
is_hidden BOOLEAN,
|
||||
slot BIGINT,
|
||||
_dlt_parent_id VARCHAR(128) NOT NULL,
|
||||
_dlt_list_idx BIGINT NOT NULL,
|
||||
_dlt_id VARCHAR(128) NOT NULL PRIMARY KEY,
|
||||
FOREIGN KEY (_dlt_parent_id)
|
||||
REFERENCES pokemon_details__past_abilities(_dlt_id)
|
||||
ON DELETE CASCADE
|
||||
);
|
||||
INSERT INTO new_pokemon_details__past_abilities__abilities
|
||||
SELECT is_hidden,
|
||||
slot,
|
||||
_dlt_parent_id,
|
||||
_dlt_list_idx,
|
||||
_dlt_id
|
||||
FROM pokemon_details__past_abilities__abilities;
|
||||
DROP TABLE pokemon_details__past_abilities__abilities;
|
||||
ALTER TABLE new_pokemon_details__past_abilities__abilities
|
||||
RENAME TO pokemon_details__past_abilities__abilities;
|
||||
|
||||
|
||||
/* Re-enable FK checks */
|
||||
PRAGMA foreign_keys=ON;
|
||||
|
||||
COMMIT;
|
||||
174
examples/relational_db_with_dlt/relational_db_and_dlt.py
Normal file
174
examples/relational_db_with_dlt/relational_db_and_dlt.py
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
import dlt
|
||||
import requests
|
||||
import asyncio
|
||||
import threading
|
||||
import sqlalchemy as sa
|
||||
import pathlib
|
||||
import os
|
||||
from dlt.destinations.impl.sqlalchemy.configuration import SqlalchemyCredentials
|
||||
|
||||
|
||||
import cognee
|
||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||
from cognee.api.v1.visualize.visualize import visualize_graph
|
||||
from cognee.infrastructure.databases.relational import (
|
||||
get_migration_relational_engine,
|
||||
create_db_and_tables as create_relational_db_and_tables,
|
||||
)
|
||||
from cognee.infrastructure.databases.vector.pgvector import (
|
||||
create_db_and_tables as create_pgvector_db_and_tables,
|
||||
)
|
||||
from cognee.tasks.ingestion.migrate_relational_database import migrate_relational_database
|
||||
from cognee.modules.search.types import SearchType
|
||||
|
||||
from cognee.root_dir import get_absolute_path
|
||||
|
||||
|
||||
class PatchedSqlalchemyCredentials(SqlalchemyCredentials):
|
||||
def __init__(self, connection_string=None):
|
||||
super().__init__(connection_string)
|
||||
if not hasattr(self, "_conn_lock"):
|
||||
self._conn_lock = threading.Lock()
|
||||
|
||||
|
||||
BASE_URL = "https://pokeapi.co/api/v2/"
|
||||
|
||||
|
||||
@dlt.resource(write_disposition="replace")
|
||||
def pokemon_list(limit: int = 5):
|
||||
"""Fetch Pokémon list (first 5 Pokémon)."""
|
||||
response = requests.get(f"{BASE_URL}pokemon", params={"limit": limit})
|
||||
response.raise_for_status()
|
||||
yield response.json()["results"]
|
||||
|
||||
|
||||
@dlt.transformer(data_from=pokemon_list)
|
||||
def pokemon_details(pokemons):
|
||||
"""Fetch full detail for each Pokémon."""
|
||||
for pokemon in pokemons:
|
||||
response = requests.get(pokemon["url"])
|
||||
response.raise_for_status()
|
||||
yield response.json()
|
||||
|
||||
|
||||
async def setup_and_process_data():
|
||||
"""
|
||||
Setup configuration and process Pokemon data into a SQLite database with dlt.
|
||||
"""
|
||||
engine = sa.create_engine("sqlite:///pokemon_data.db")
|
||||
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="pokemon_pipeline",
|
||||
destination=dlt.destinations.sqlalchemy(
|
||||
PatchedSqlalchemyCredentials("sqlite:///pokemon_data.db?timeout=15")
|
||||
),
|
||||
dataset_name="main",
|
||||
dev_mode=False,
|
||||
)
|
||||
|
||||
info = pipeline.run([pokemon_list, pokemon_details])
|
||||
print(f"[setup_and_process_data] Pipeline run complete. Pipeline info:\n{info}")
|
||||
|
||||
# (Optional) Inspect tables for debugging
|
||||
print("[setup_and_process_data] Verifying data was written to the database.")
|
||||
with engine.connect() as conn:
|
||||
tables = conn.execute(
|
||||
sa.text("SELECT name FROM sqlite_master WHERE type='table';")
|
||||
).fetchall()
|
||||
print(f"[setup_and_process_data] Tables in database: {tables}")
|
||||
# Example: if 'pokemon_details' is expected, we can see how many rows:
|
||||
for table_tuple in tables:
|
||||
table_name = table_tuple[0]
|
||||
row_count = conn.execute(sa.text(f"SELECT COUNT(*) FROM {table_name}")).fetchone()[0]
|
||||
print(f" -> Table '{table_name}' has {row_count} row(s).")
|
||||
|
||||
print("[setup_and_process_data] Data loading step finished.\n")
|
||||
return None
|
||||
|
||||
|
||||
async def apply_foreign_key_fixes():
|
||||
"""
|
||||
Apply foreign key fixes to the SQLite database after data processing.
|
||||
"""
|
||||
engine = sa.create_engine("sqlite:///pokemon_data.db")
|
||||
with engine.connect() as conn:
|
||||
raw_conn = conn.connection.connection
|
||||
with open(
|
||||
"examples/relational_db_with_dlt/fix_foreign_keys.sql", "r", encoding="utf-8"
|
||||
) as f:
|
||||
sql_script = f.read()
|
||||
raw_conn.executescript(sql_script) # runs multiple statements
|
||||
print("[apply_foreign_key_fixes] Applied foreign key fixes")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
async def migrate_to_cognee():
|
||||
"""
|
||||
Migrate the data from the SQLite database to cognee's knowledge graph.
|
||||
"""
|
||||
|
||||
# Use cognee's absolute path function instead of relative paths
|
||||
data_directory_path = get_absolute_path(".data_storage")
|
||||
cognee.config.data_root_directory(data_directory_path)
|
||||
|
||||
cognee_directory_path = get_absolute_path(".cognee_system")
|
||||
cognee.config.system_root_directory(cognee_directory_path)
|
||||
|
||||
await cognee.prune.prune_data()
|
||||
await cognee.prune.prune_system(metadata=True)
|
||||
engine = get_migration_relational_engine()
|
||||
|
||||
await create_relational_db_and_tables()
|
||||
await create_pgvector_db_and_tables()
|
||||
|
||||
schema = await engine.extract_schema()
|
||||
|
||||
graph_engine = await get_graph_engine()
|
||||
|
||||
await migrate_relational_database(graph_engine, schema=schema)
|
||||
|
||||
|
||||
async def visualize_knowledge_graph():
|
||||
"""
|
||||
Generate and save an HTML visualization of the knowledge graph.
|
||||
"""
|
||||
home_dir = os.path.expanduser("~")
|
||||
html_path = os.path.join(home_dir, "graph_visualization.html")
|
||||
|
||||
await visualize_graph(html_path)
|
||||
|
||||
|
||||
async def search_knowledge_graph():
|
||||
"""
|
||||
Perform a search query against the knowledge graph.
|
||||
"""
|
||||
search_results = await cognee.search(
|
||||
query_type=SearchType.GRAPH_COMPLETION, query_text="What kind of data do you contain?"
|
||||
)
|
||||
print(search_results)
|
||||
|
||||
|
||||
async def main():
|
||||
print("[main] Starting main function, running setup_and_process_data...")
|
||||
await setup_and_process_data()
|
||||
print("[main] Data loaded into SQLite.")
|
||||
await apply_foreign_key_fixes()
|
||||
print("[main] Foreign key fixes applied.")
|
||||
await migrate_to_cognee()
|
||||
print("[main] Migration to cognee finished.")
|
||||
await visualize_knowledge_graph()
|
||||
print("[main] Knowledge graph visualization created.")
|
||||
await search_knowledge_graph()
|
||||
print("[main] Knowledge graph search completed.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("[__main__] Creating and running event loop.")
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
try:
|
||||
loop.run_until_complete(main())
|
||||
finally:
|
||||
loop.run_until_complete(loop.shutdown_asyncgens())
|
||||
print("[__main__] Event loop closed. Exiting.")
|
||||
1468
poetry.lock
generated
1468
poetry.lock
generated
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue