feat: add relational db migration and dlt example (#843)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Igor Ilic <igorilic03@gmail.com>
Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
This commit is contained in:
Hande 2025-06-03 06:35:26 +02:00 committed by GitHub
parent e0aa7c9a61
commit d720a4dbb2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 1200 additions and 732 deletions

View file

@ -0,0 +1,290 @@
/* SQLite ignores PRAGMA foreign_keys while a transaction is open, so
   enforcement has to be switched off BEFORE the transaction begins. */
PRAGMA foreign_keys=OFF;
BEGIN TRANSACTION;
/*------------------------------------------------------------------------
  1) pokemon_list
     Copies the rows into a replacement table that declares _dlt_id as
     PRIMARY KEY, then swaps it in under the original name.
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_list (
    name         TEXT,
    url          TEXT,
    _dlt_load_id TEXT NOT NULL,
    _dlt_id      VARCHAR(128) NOT NULL PRIMARY KEY
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_list (
    name,
    url,
    _dlt_load_id,
    _dlt_id
)
SELECT
    name,
    url,
    _dlt_load_id,
    _dlt_id
FROM pokemon_list;
DROP TABLE pokemon_list;
ALTER TABLE new_pokemon_list RENAME TO pokemon_list;
/*------------------------------------------------------------------------
  2) pokemon_details (parent for most child tables)
     _dlt_id is a PRIMARY KEY so children can FK to it.
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_details (
    base_experience BIGINT,
    height          BIGINT,
    id              BIGINT,
    is_default      BOOLEAN,
    name            TEXT,
    "order"         BIGINT,          -- quoted: ORDER is a reserved word
    species__name   TEXT,            -- was declared without a type
    weight          BIGINT,
    _dlt_load_id    TEXT NOT NULL,
    _dlt_id         VARCHAR(128) NOT NULL PRIMARY KEY
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details (
    base_experience,
    height,
    id,
    is_default,
    name,
    "order",
    species__name,
    weight,
    _dlt_load_id,
    _dlt_id
)
SELECT
    base_experience,
    height,
    id,
    is_default,
    name,
    "order",
    species__name,
    weight,
    _dlt_load_id,
    _dlt_id
FROM pokemon_details;
DROP TABLE pokemon_details;
ALTER TABLE new_pokemon_details RENAME TO pokemon_details;
/*------------------------------------------------------------------------
  3) pokemon_details__abilities (child of pokemon_details)
     Foreign key: _dlt_parent_id -> pokemon_details(_dlt_id)
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_details__abilities (
    ability__name  TEXT,
    ability__url   TEXT,
    is_hidden      BOOLEAN,
    slot           BIGINT,
    _dlt_parent_id VARCHAR(128) NOT NULL,
    _dlt_list_idx  BIGINT NOT NULL,
    _dlt_id        VARCHAR(128) NOT NULL PRIMARY KEY,
    CONSTRAINT fk_abilities
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__abilities (
    ability__name,
    ability__url,
    is_hidden,
    slot,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
)
SELECT
    ability__name,
    ability__url,
    is_hidden,
    slot,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
FROM pokemon_details__abilities;
DROP TABLE pokemon_details__abilities;
ALTER TABLE new_pokemon_details__abilities RENAME TO pokemon_details__abilities;
/*------------------------------------------------------------------------
  4) pokemon_details__forms (child of pokemon_details)
     Foreign key: _dlt_parent_id -> pokemon_details(_dlt_id)
------------------------------------------------------------------------*/
-- Staging table uses the double-underscore form, like the other sections.
CREATE TABLE IF NOT EXISTS new_pokemon_details__forms (
    name           TEXT,
    url            TEXT,
    _dlt_parent_id VARCHAR(128) NOT NULL,
    _dlt_list_idx  BIGINT NOT NULL,
    _dlt_id        VARCHAR(128) NOT NULL PRIMARY KEY,
    CONSTRAINT fk_forms
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__forms (
    name,
    url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
)
SELECT
    name,
    url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
FROM pokemon_details__forms;
DROP TABLE pokemon_details__forms;
ALTER TABLE new_pokemon_details__forms RENAME TO pokemon_details__forms;
/*------------------------------------------------------------------------
  5) pokemon_details__game_indices (child of pokemon_details)
     Foreign key: _dlt_parent_id -> pokemon_details(_dlt_id)
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_details__game_indices (
    game_index     BIGINT,
    version__name  TEXT,
    version__url   TEXT,
    _dlt_parent_id VARCHAR(128) NOT NULL,
    _dlt_list_idx  BIGINT NOT NULL,
    _dlt_id        VARCHAR(128) NOT NULL PRIMARY KEY,
    CONSTRAINT fk_game_indices
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__game_indices (
    game_index,
    version__name,
    version__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
)
SELECT
    game_index,
    version__name,
    version__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
FROM pokemon_details__game_indices;
DROP TABLE pokemon_details__game_indices;
ALTER TABLE new_pokemon_details__game_indices RENAME TO pokemon_details__game_indices;
/*------------------------------------------------------------------------
  6) pokemon_details__moves (child of pokemon_details)
     Foreign key: _dlt_parent_id -> pokemon_details(_dlt_id)
     _dlt_id is a PRIMARY KEY; section 7 references it.
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_details__moves (
    move__name     TEXT,
    move__url      TEXT,
    _dlt_parent_id VARCHAR(128) NOT NULL,
    _dlt_list_idx  BIGINT NOT NULL,
    _dlt_id        VARCHAR(128) NOT NULL PRIMARY KEY,
    CONSTRAINT fk_moves
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__moves (
    move__name,
    move__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
)
SELECT
    move__name,
    move__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
FROM pokemon_details__moves;
DROP TABLE pokemon_details__moves;
ALTER TABLE new_pokemon_details__moves RENAME TO pokemon_details__moves;
/*------------------------------------------------------------------------
  7) pokemon_details__moves__version_group_details
     (child of pokemon_details__moves)
     Foreign key: _dlt_parent_id -> pokemon_details__moves(_dlt_id)
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_details__moves__version_group_details (
    level_learned_at        BIGINT,
    version_group__name     TEXT,
    version_group__url      TEXT,
    move_learn_method__name TEXT,
    move_learn_method__url  TEXT,
    _dlt_parent_id          VARCHAR(128) NOT NULL,
    _dlt_list_idx           BIGINT NOT NULL,
    _dlt_id                 VARCHAR(128) NOT NULL PRIMARY KEY,
    "order"                 BIGINT,  -- quoted: ORDER is a reserved word
    CONSTRAINT fk_moves_version_group_details
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details__moves(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__moves__version_group_details (
    level_learned_at,
    version_group__name,
    version_group__url,
    move_learn_method__name,
    move_learn_method__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id,
    "order"
)
SELECT
    level_learned_at,
    version_group__name,
    version_group__url,
    move_learn_method__name,
    move_learn_method__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id,
    "order"
FROM pokemon_details__moves__version_group_details;
DROP TABLE pokemon_details__moves__version_group_details;
ALTER TABLE new_pokemon_details__moves__version_group_details
    RENAME TO pokemon_details__moves__version_group_details;
/*------------------------------------------------------------------------
  8) pokemon_details__past_abilities (child of pokemon_details)
     Foreign key: _dlt_parent_id -> pokemon_details(_dlt_id)
     _dlt_id is a PRIMARY KEY; section 11 references it.
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_details__past_abilities (
    generation__name TEXT,
    generation__url  TEXT,
    _dlt_parent_id   VARCHAR(128) NOT NULL,
    _dlt_list_idx    BIGINT NOT NULL,
    _dlt_id          VARCHAR(128) NOT NULL PRIMARY KEY,
    CONSTRAINT fk_past_abilities
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__past_abilities (
    generation__name,
    generation__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
)
SELECT
    generation__name,
    generation__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
FROM pokemon_details__past_abilities;
DROP TABLE pokemon_details__past_abilities;
ALTER TABLE new_pokemon_details__past_abilities
    RENAME TO pokemon_details__past_abilities;
/*------------------------------------------------------------------------
  9) pokemon_details__stats (child of pokemon_details)
     Foreign key: _dlt_parent_id -> pokemon_details(_dlt_id)
------------------------------------------------------------------------*/
-- Staging table uses the double-underscore form, like the other sections.
CREATE TABLE IF NOT EXISTS new_pokemon_details__stats (
    base_stat      BIGINT,
    effort         BIGINT,
    stat__name     TEXT,
    stat__url      TEXT,
    _dlt_parent_id VARCHAR(128) NOT NULL,
    _dlt_list_idx  BIGINT NOT NULL,
    _dlt_id        VARCHAR(128) NOT NULL PRIMARY KEY,
    CONSTRAINT fk_stats
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__stats (
    base_stat,
    effort,
    stat__name,
    stat__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
)
SELECT
    base_stat,
    effort,
    stat__name,
    stat__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
FROM pokemon_details__stats;
DROP TABLE pokemon_details__stats;
ALTER TABLE new_pokemon_details__stats
    RENAME TO pokemon_details__stats;
/*------------------------------------------------------------------------
  10) pokemon_details__types (child of pokemon_details)
      Foreign key: _dlt_parent_id -> pokemon_details(_dlt_id)
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_details__types (
    slot           BIGINT,
    type__name     TEXT,
    type__url      TEXT,
    _dlt_parent_id VARCHAR(128) NOT NULL,
    _dlt_list_idx  BIGINT NOT NULL,
    _dlt_id        VARCHAR(128) NOT NULL PRIMARY KEY,
    CONSTRAINT fk_types
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__types (
    slot,
    type__name,
    type__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
)
SELECT
    slot,
    type__name,
    type__url,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
FROM pokemon_details__types;
DROP TABLE pokemon_details__types;
ALTER TABLE new_pokemon_details__types
    RENAME TO pokemon_details__types;
/*------------------------------------------------------------------------
  11) pokemon_details__past_abilities__abilities
      (child of pokemon_details__past_abilities)
      Foreign key: _dlt_parent_id -> pokemon_details__past_abilities(_dlt_id)
------------------------------------------------------------------------*/
CREATE TABLE IF NOT EXISTS new_pokemon_details__past_abilities__abilities (
    is_hidden      BOOLEAN,
    slot           BIGINT,
    _dlt_parent_id VARCHAR(128) NOT NULL,
    _dlt_list_idx  BIGINT NOT NULL,
    _dlt_id        VARCHAR(128) NOT NULL PRIMARY KEY,
    CONSTRAINT fk_past_abilities_abilities
        FOREIGN KEY (_dlt_parent_id)
        REFERENCES pokemon_details__past_abilities(_dlt_id)
        ON DELETE CASCADE
);
-- Explicit column list: the copy no longer depends on column positions.
INSERT INTO new_pokemon_details__past_abilities__abilities (
    is_hidden,
    slot,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
)
SELECT
    is_hidden,
    slot,
    _dlt_parent_id,
    _dlt_list_idx,
    _dlt_id
FROM pokemon_details__past_abilities__abilities;
DROP TABLE pokemon_details__past_abilities__abilities;
ALTER TABLE new_pokemon_details__past_abilities__abilities
    RENAME TO pokemon_details__past_abilities__abilities;
COMMIT;
/* Re-enable FK checks. This has to run after COMMIT: SQLite ignores
   PRAGMA foreign_keys while a transaction is still open. */
PRAGMA foreign_keys=ON;

View file

@ -0,0 +1,174 @@
import dlt
import requests
import asyncio
import threading
import sqlalchemy as sa
import pathlib
import os
from dlt.destinations.impl.sqlalchemy.configuration import SqlalchemyCredentials
import cognee
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.api.v1.visualize.visualize import visualize_graph
from cognee.infrastructure.databases.relational import (
get_migration_relational_engine,
create_db_and_tables as create_relational_db_and_tables,
)
from cognee.infrastructure.databases.vector.pgvector import (
create_db_and_tables as create_pgvector_db_and_tables,
)
from cognee.tasks.ingestion.migrate_relational_database import migrate_relational_database
from cognee.modules.search.types import SearchType
from cognee.root_dir import get_absolute_path
class PatchedSqlalchemyCredentials(SqlalchemyCredentials):
    """SqlalchemyCredentials variant that always carries a ``_conn_lock`` attribute.

    After the base initializer has run, a fresh ``threading.Lock`` is stored as
    ``_conn_lock`` unless the instance already has an attribute of that name.
    """

    def __init__(self, connection_string=None):
        super().__init__(connection_string)
        try:
            # Attribute already present: leave it untouched.
            self._conn_lock
        except AttributeError:
            self._conn_lock = threading.Lock()
# Root of the PokeAPI REST endpoints used by the resources below.
BASE_URL = "https://pokeapi.co/api/v2/"


@dlt.resource(write_disposition="replace")
def pokemon_list(limit: int = 5):
    """Fetch one page of the Pokémon list.

    Args:
        limit: Value sent as the ``limit`` query parameter (defaults to 5).

    Yields:
        The ``results`` entry of the JSON response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # requests applies no timeout by default; without one a stalled connection
    # would block the pipeline forever.
    response = requests.get(f"{BASE_URL}pokemon", params={"limit": limit}, timeout=30)
    response.raise_for_status()
    yield response.json()["results"]
@dlt.transformer(data_from=pokemon_list)
def pokemon_details(pokemons):
    """Fetch full detail for each Pokémon.

    Args:
        pokemons: Iterable of items that each provide a ``"url"`` key.

    Yields:
        The decoded JSON document fetched from each item's URL.

    Raises:
        requests.HTTPError: If a detail request responds with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    for pokemon in pokemons:
        # requests applies no timeout by default; bound each call so one stalled
        # request cannot hang the whole pipeline.
        response = requests.get(pokemon["url"], timeout=30)
        response.raise_for_status()
        yield response.json()
async def setup_and_process_data():
    """
    Setup configuration and process Pokemon data into a SQLite database with dlt.

    Runs the ``pokemon_list`` and ``pokemon_details`` resources through a dlt
    pipeline that writes to ``pokemon_data.db``, then prints every table listed
    in ``sqlite_master`` together with its row count.
    """
    pipeline = dlt.pipeline(
        pipeline_name="pokemon_pipeline",
        destination=dlt.destinations.sqlalchemy(
            PatchedSqlalchemyCredentials("sqlite:///pokemon_data.db?timeout=15")
        ),
        dataset_name="main",
        dev_mode=False,
    )
    info = pipeline.run([pokemon_list, pokemon_details])
    print(f"[setup_and_process_data] Pipeline run complete. Pipeline info:\n{info}")

    # (Optional) Inspect tables for debugging
    print("[setup_and_process_data] Verifying data was written to the database.")
    engine = sa.create_engine("sqlite:///pokemon_data.db")
    try:
        with engine.connect() as conn:
            tables = conn.execute(
                sa.text("SELECT name FROM sqlite_master WHERE type='table';")
            ).fetchall()
            print(f"[setup_and_process_data] Tables in database: {tables}")

            for table_tuple in tables:
                table_name = table_tuple[0]
                # Identifiers cannot be bound as parameters, so quote the name
                # (doubling embedded quotes) to keep the statement valid for
                # any table name.
                quoted_name = '"' + table_name.replace('"', '""') + '"'
                row_count = conn.execute(
                    sa.text(f"SELECT COUNT(*) FROM {quoted_name}")
                ).fetchone()[0]
                print(f"   -> Table '{table_name}' has {row_count} row(s).")
    finally:
        # Release the pooled connection owned by this short-lived engine.
        engine.dispose()

    print("[setup_and_process_data] Data loading step finished.\n")
    return None
async def apply_foreign_key_fixes():
    """
    Apply foreign key fixes to the SQLite database after data processing.

    Reads ``fix_foreign_keys.sql`` and runs it as one multi-statement script on
    the raw DB-API connection behind a SQLAlchemy connection to
    ``pokemon_data.db``.

    Raises:
        FileNotFoundError: If the SQL script is found in neither location.
    """
    # Original location, relative to the current working directory.
    script_path = pathlib.Path("examples/relational_db_with_dlt/fix_foreign_keys.sql")
    if not script_path.is_file():
        # Fall back to a copy next to this file so the example also works when
        # it is started from a different working directory.
        script_path = pathlib.Path(__file__).resolve().parent / "fix_foreign_keys.sql"
    sql_script = script_path.read_text(encoding="utf-8")

    engine = sa.create_engine("sqlite:///pokemon_data.db")
    try:
        with engine.connect() as conn:
            # Unwrap to the driver-level connection, which provides executescript().
            raw_conn = conn.connection.connection
            raw_conn.executescript(sql_script)  # runs multiple statements
    finally:
        # Release the pooled connection owned by this short-lived engine.
        engine.dispose()

    print("[apply_foreign_key_fixes] Applied foreign key fixes")
    return None
async def migrate_to_cognee():
    """
    Migrate the data from the SQLite database to cognee's knowledge graph.

    Points cognee at its data and system directories, prunes existing data and
    system state, creates the relational and pgvector tables, extracts the
    schema from the migration engine and hands it to
    ``migrate_relational_database`` together with the graph engine.
    """
    # Directories are resolved through cognee's absolute-path helper rather than
    # relative paths.
    cognee.config.data_root_directory(get_absolute_path(".data_storage"))
    cognee.config.system_root_directory(get_absolute_path(".cognee_system"))

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    migration_engine = get_migration_relational_engine()

    await create_relational_db_and_tables()
    await create_pgvector_db_and_tables()

    extracted_schema = await migration_engine.extract_schema()
    graph = await get_graph_engine()
    await migrate_relational_database(graph, schema=extracted_schema)
async def visualize_knowledge_graph():
    """
    Render the knowledge graph to ``graph_visualization.html`` in the user's home directory.
    """
    output_file = os.path.join(os.path.expanduser("~"), "graph_visualization.html")
    await visualize_graph(output_file)
async def search_knowledge_graph():
    """
    Run one GRAPH_COMPLETION search against the knowledge graph and print the result.
    """
    question = "What kind of data do you contain?"
    answer = await cognee.search(
        query_text=question,
        query_type=SearchType.GRAPH_COMPLETION,
    )
    print(answer)
async def main():
    """Run every stage of the example in order, printing a message after each one."""
    print("[main] Starting main function, running setup_and_process_data...")

    # (stage coroutine function, message printed once that stage has finished)
    stages = (
        (setup_and_process_data, "[main] Data loaded into SQLite."),
        (apply_foreign_key_fixes, "[main] Foreign key fixes applied."),
        (migrate_to_cognee, "[main] Migration to cognee finished."),
        (visualize_knowledge_graph, "[main] Knowledge graph visualization created."),
        (search_knowledge_graph, "[main] Knowledge graph search completed."),
    )
    for run_stage, finished_message in stages:
        await run_stage()
        print(finished_message)
# Script entry point: run main() on a dedicated event loop.
if __name__ == "__main__":
    print("[__main__] Creating and running event loop.")
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    finally:
        try:
            loop.run_until_complete(loop.shutdown_asyncgens())
        finally:
            # Actually close the loop so its resources are released and the
            # message below is accurate.
            loop.close()
        print("[__main__] Event loop closed. Exiting.")

1468
poetry.lock generated

File diff suppressed because it is too large Load diff