improved code readability by splitting code blocks under conditional statements into separate functions
This commit is contained in:
parent
656894370e
commit
2921021ca3
1 changed files with 238 additions and 224 deletions
|
|
@ -9,7 +9,6 @@ from cognee.infrastructure.databases.relational.config import get_migration_conf
|
|||
from cognee.tasks.storage.index_data_points import index_data_points
|
||||
from cognee.tasks.storage.index_graph_edges import index_graph_edges
|
||||
from cognee.tasks.schema.ingest_database_schema import ingest_database_schema
|
||||
from cognee.tasks.schema.models import SchemaTable
|
||||
|
||||
from cognee.modules.engine.models import TableRow, TableType, ColumnValue
|
||||
|
||||
|
|
@ -31,12 +30,56 @@ async def migrate_relational_database(
|
|||
|
||||
Both TableType and TableRow inherit from DataPoint to maintain consistency with Cognee data model.
|
||||
"""
|
||||
engine = get_migration_relational_engine()
|
||||
# Create a mapping of node_id to node objects for referencing in edge creation
|
||||
node_mapping = {}
|
||||
edge_mapping = []
|
||||
|
||||
if schema_only:
|
||||
node_mapping, edge_mapping = await schema_only_ingestion()
|
||||
|
||||
else:
|
||||
node_mapping, edge_mapping = await complete_database_ingestion(schema, migrate_column_data)
|
||||
|
||||
def _remove_duplicate_edges(edge_mapping):
|
||||
seen = set()
|
||||
unique_original_shape = []
|
||||
|
||||
for tup in edge_mapping:
|
||||
# We go through all the tuples in the edge_mapping and we only add unique tuples to the list
|
||||
# To eliminate duplicate edges.
|
||||
source_id, target_id, rel_name, rel_dict = tup
|
||||
# We need to convert the dictionary to a frozenset to be able to compare values for it
|
||||
rel_dict_hashable = frozenset(sorted(rel_dict.items()))
|
||||
hashable_tup = (source_id, target_id, rel_name, rel_dict_hashable)
|
||||
|
||||
# We use the seen set to keep track of unique edges
|
||||
if hashable_tup not in seen:
|
||||
# A list that has frozensets elements instead of dictionaries is needed to be able to compare values
|
||||
seen.add(hashable_tup)
|
||||
# append the original tuple shape (with the dictionary) if it's the first time we see it
|
||||
unique_original_shape.append(tup)
|
||||
|
||||
return unique_original_shape
|
||||
|
||||
# Add all nodes and edges to the graph
|
||||
# NOTE: Nodes and edges have to be added in batch for speed optimization, Especially for NetworkX.
|
||||
# If we'd create nodes and add them to graph in real time the process would take too long.
|
||||
# Every node and edge added to NetworkX is saved to file which is very slow when not done in batches.
|
||||
await graph_db.add_nodes(list(node_mapping.values()))
|
||||
await graph_db.add_edges(_remove_duplicate_edges(edge_mapping))
|
||||
|
||||
# In these steps we calculate the vector embeddings of our nodes and edges and save them to vector database
|
||||
# Cognee uses this information to perform searches on the knowledge graph.
|
||||
await index_data_points(list(node_mapping.values()))
|
||||
await index_graph_edges()
|
||||
|
||||
logger.info("Data successfully migrated from relational database to desired graph database.")
|
||||
return await graph_db.get_graph_data()
|
||||
|
||||
|
||||
async def schema_only_ingestion():
|
||||
node_mapping = {}
|
||||
edge_mapping = []
|
||||
database_config = get_migration_config().to_dict()
|
||||
# Calling the ingest_database_schema function to return DataPoint subclasses
|
||||
result = await ingest_database_schema(
|
||||
|
|
@ -110,8 +153,14 @@ async def migrate_relational_database(
|
|||
),
|
||||
)
|
||||
)
|
||||
return node_mapping, edge_mapping
|
||||
|
||||
else:
|
||||
|
||||
async def complete_database_ingestion(schema, migrate_column_data):
|
||||
engine = get_migration_relational_engine()
|
||||
# Create a mapping of node_id to node objects for referencing in edge creation
|
||||
node_mapping = {}
|
||||
edge_mapping = []
|
||||
async with engine.engine.begin() as cursor:
|
||||
# First, create table type nodes for all tables
|
||||
for table_name, details in schema.items():
|
||||
|
|
@ -261,39 +310,4 @@ async def migrate_relational_database(
|
|||
),
|
||||
)
|
||||
)
|
||||
|
||||
def _remove_duplicate_edges(edge_mapping):
|
||||
seen = set()
|
||||
unique_original_shape = []
|
||||
|
||||
for tup in edge_mapping:
|
||||
# We go through all the tuples in the edge_mapping and we only add unique tuples to the list
|
||||
# To eliminate duplicate edges.
|
||||
source_id, target_id, rel_name, rel_dict = tup
|
||||
# We need to convert the dictionary to a frozenset to be able to compare values for it
|
||||
rel_dict_hashable = frozenset(sorted(rel_dict.items()))
|
||||
hashable_tup = (source_id, target_id, rel_name, rel_dict_hashable)
|
||||
|
||||
# We use the seen set to keep track of unique edges
|
||||
if hashable_tup not in seen:
|
||||
# A list that has frozensets elements instead of dictionaries is needed to be able to compare values
|
||||
seen.add(hashable_tup)
|
||||
# append the original tuple shape (with the dictionary) if it's the first time we see it
|
||||
unique_original_shape.append(tup)
|
||||
|
||||
return unique_original_shape
|
||||
|
||||
# Add all nodes and edges to the graph
|
||||
# NOTE: Nodes and edges have to be added in batch for speed optimization, Especially for NetworkX.
|
||||
# If we'd create nodes and add them to graph in real time the process would take too long.
|
||||
# Every node and edge added to NetworkX is saved to file which is very slow when not done in batches.
|
||||
await graph_db.add_nodes(list(node_mapping.values()))
|
||||
await graph_db.add_edges(_remove_duplicate_edges(edge_mapping))
|
||||
|
||||
# In these steps we calculate the vector embeddings of our nodes and edges and save them to vector database
|
||||
# Cognee uses this information to perform searches on the knowledge graph.
|
||||
await index_data_points(list(node_mapping.values()))
|
||||
await index_graph_edges()
|
||||
|
||||
logger.info("Data successfully migrated from relational database to desired graph database.")
|
||||
return await graph_db.get_graph_data()
|
||||
return node_mapping, edge_mapping
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue