Fix inefficient implementation of batch addition
This commit is contained in:
parent
c021e832e7
commit
e6f2aa1bfb
2 changed files with 151 additions and 46 deletions
|
|
@ -272,48 +272,55 @@ class ApacheAGEAdapter:
|
||||||
if not nodes:
|
if not nodes:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Process in batches of 100
|
# Group nodes by label for efficient batch processing
|
||||||
|
nodes_by_label = {}
|
||||||
|
for node_id, labels, properties in nodes:
|
||||||
|
label_key = ':'.join(labels) if labels else "Node"
|
||||||
|
if label_key not in nodes_by_label:
|
||||||
|
nodes_by_label[label_key] = []
|
||||||
|
nodes_by_label[label_key].append((node_id, labels, properties))
|
||||||
|
|
||||||
|
# Process each label group in batches
|
||||||
BATCH_SIZE = 100
|
BATCH_SIZE = 100
|
||||||
|
|
||||||
for i in range(0, len(nodes), BATCH_SIZE):
|
for label_key, label_nodes in nodes_by_label.items():
|
||||||
batch = nodes[i:i + BATCH_SIZE]
|
for i in range(0, len(label_nodes), BATCH_SIZE):
|
||||||
|
batch = label_nodes[i:i + BATCH_SIZE]
|
||||||
node_data_list = []
|
|
||||||
for node_id, labels, properties in batch:
|
node_data_list = []
|
||||||
props = {"id": node_id, **properties}
|
for node_id, labels, properties in batch:
|
||||||
props_parts = []
|
props = {"id": node_id, **properties}
|
||||||
for k, v in props.items():
|
props_parts = []
|
||||||
if isinstance(v, str):
|
for k, v in props.items():
|
||||||
props_parts.append(f'{k}: "{v}"')
|
if isinstance(v, str):
|
||||||
elif isinstance(v, bool):
|
props_parts.append(f'{k}: "{v}"')
|
||||||
props_parts.append(f'{k}: {str(v).lower()}')
|
elif isinstance(v, bool):
|
||||||
elif isinstance(v, (int, float)):
|
props_parts.append(f'{k}: {str(v).lower()}')
|
||||||
props_parts.append(f'{k}: {v}')
|
elif isinstance(v, (int, float)):
|
||||||
elif v is None:
|
props_parts.append(f'{k}: {v}')
|
||||||
props_parts.append(f'{k}: null')
|
elif v is None:
|
||||||
else:
|
props_parts.append(f'{k}: null')
|
||||||
props_parts.append(f'{k}: "{json.dumps(v)}"')
|
else:
|
||||||
props_str = '{' + ', '.join(props_parts) + '}'
|
props_parts.append(f'{k}: "{json.dumps(v)}"')
|
||||||
label_str = ':'.join(labels) if labels else "Node"
|
props_str = '{' + ', '.join(props_parts) + '}'
|
||||||
node_data_list.append(f'{{id: "{node_id}", props: {props_str}, label: "{label_str}"}}')
|
node_data_list.append(f'{{id: "{node_id}", props: {props_str}}}')
|
||||||
|
|
||||||
unwind_data = '[' + ', '.join(node_data_list) + ']'
|
unwind_data = '[' + ', '.join(node_data_list) + ']'
|
||||||
|
|
||||||
all_prop_keys = set()
|
all_prop_keys = set()
|
||||||
for node_id, labels, properties in batch:
|
for node_id, labels, properties in batch:
|
||||||
all_prop_keys.update(properties.keys())
|
all_prop_keys.update(properties.keys())
|
||||||
all_prop_keys.add('id')
|
all_prop_keys.add('id')
|
||||||
|
|
||||||
set_clauses = [f"n.{key} = node_data.props.{key}" for key in sorted(all_prop_keys)]
|
set_clauses = [f"n.{key} = node_data.props.{key}" for key in sorted(all_prop_keys)]
|
||||||
set_clause = "SET " + ", ".join(set_clauses)
|
set_clause = "SET " + ", ".join(set_clauses) if set_clauses else ""
|
||||||
|
|
||||||
common_label = batch[0][1][0] if batch[0][1] else "Node"
|
query = f"""
|
||||||
query = f"""
|
UNWIND {unwind_data} AS node_data
|
||||||
UNWIND {unwind_data} AS node_data
|
MERGE (n:{label_key} {{id: node_data.id}})
|
||||||
MERGE (n {{id: node_data.id}})
|
{set_clause}
|
||||||
{set_clause}
|
"""
|
||||||
"""
|
await self.execute_cypher(query)
|
||||||
await self.execute_cypher(query)
|
|
||||||
|
|
||||||
async def get_node(self, node_id: str) -> Optional[NodeData]:
|
async def get_node(self, node_id: str) -> Optional[NodeData]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -398,16 +405,14 @@ class ApacheAGEAdapter:
|
||||||
|
|
||||||
# Use MERGE to avoid duplicate edges
|
# Use MERGE to avoid duplicate edges
|
||||||
query = f"""
|
query = f"""
|
||||||
MATCH (a {{id: '{source_id}'}}), (b {{id: '{target_id}'}})
|
MERGE (a {{id: '{source_id}'}})-[r:{relationship_type}]->(b {{id: '{target_id}'}})
|
||||||
MERGE (a)-[r:{relationship_type}]->(b)
|
|
||||||
SET r = {{{props_str}}}
|
SET r = {{{props_str}}}
|
||||||
RETURN r
|
RETURN r
|
||||||
"""
|
"""
|
||||||
else:
|
else:
|
||||||
# Use MERGE without properties
|
# Use MERGE without properties
|
||||||
query = f"""
|
query = f"""
|
||||||
MATCH (a {{id: '{source_id}'}}), (b {{id: '{target_id}'}})
|
MERGE (a {{id: '{source_id}'}})-[r:{relationship_type}]->(b {{id: '{target_id}'}})
|
||||||
MERGE (a)-[r:{relationship_type}]->(b)
|
|
||||||
RETURN r
|
RETURN r
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -497,8 +502,7 @@ class ApacheAGEAdapter:
|
||||||
|
|
||||||
query = f"""
|
query = f"""
|
||||||
UNWIND {values_list} AS edge
|
UNWIND {values_list} AS edge
|
||||||
MATCH (a {{id: edge.src}}), (b {{id: edge.tgt}})
|
MERGE (a {{id: edge.src}})-[r:{rel_type}]->(b {{id: edge.tgt}})
|
||||||
MERGE (a)-[r:{rel_type}]->(b)
|
|
||||||
{set_clause}
|
{set_clause}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -89,6 +89,55 @@ async def main():
|
||||||
|
|
||||||
print(f"Node Ingestion Single (Merge - {half} existing, {len(new_nodes)} new): AGE={age_time_single_merge:.4f}s, Neo4j={neo4j_time_single_merge:.4f}s")
|
print(f"Node Ingestion Single (Merge - {half} existing, {len(new_nodes)} new): AGE={age_time_single_merge:.4f}s, Neo4j={neo4j_time_single_merge:.4f}s")
|
||||||
|
|
||||||
|
edges = [(f"node_{i}", f"node_{(i+1) % batch_size}", "CONNECTS", {"weight": 1.0})
|
||||||
|
for i in range(batch_size)]
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
for source_id, target_id, rel_type, props in edges:
|
||||||
|
await age_adapter.add_edge(source_id, target_id, rel_type, props)
|
||||||
|
age_time_edge_single_new = time.perf_counter() - start
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
for source_id, target_id, rel_type, props in edges:
|
||||||
|
try:
|
||||||
|
src_uuid = UUID(source_id) if '-' in source_id else UUID(int=hash(source_id) & ((1 << 128) - 1))
|
||||||
|
except:
|
||||||
|
src_uuid = UUID(int=hash(source_id) & ((1 << 128) - 1))
|
||||||
|
try:
|
||||||
|
tgt_uuid = UUID(target_id) if '-' in target_id else UUID(int=hash(target_id) & ((1 << 128) - 1))
|
||||||
|
except:
|
||||||
|
tgt_uuid = UUID(int=hash(target_id) & ((1 << 128) - 1))
|
||||||
|
await neo4j_adapter.add_edge(src_uuid, tgt_uuid, rel_type, props)
|
||||||
|
neo4j_time_edge_single_new = time.perf_counter() - start
|
||||||
|
|
||||||
|
print(f"Edge Ingestion Single (New): AGE={age_time_edge_single_new:.4f}s, Neo4j={neo4j_time_edge_single_new:.4f}s")
|
||||||
|
|
||||||
|
half_edges = batch_size // 2
|
||||||
|
existing_edges = edges[:half_edges]
|
||||||
|
new_edge_ids = [(f"node_{i}", f"node_{(i+1) % batch_size}", "CONNECTS", {"weight": 1.0})
|
||||||
|
for i in range(batch_size, batch_size + half_edges)]
|
||||||
|
merge_edges = existing_edges + new_edge_ids
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
for source_id, target_id, rel_type, props in merge_edges:
|
||||||
|
await age_adapter.add_edge(source_id, target_id, rel_type, props)
|
||||||
|
age_time_edge_single_merge = time.perf_counter() - start
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
for source_id, target_id, rel_type, props in merge_edges:
|
||||||
|
try:
|
||||||
|
src_uuid = UUID(source_id) if '-' in source_id else UUID(int=hash(source_id) & ((1 << 128) - 1))
|
||||||
|
except:
|
||||||
|
src_uuid = UUID(int=hash(source_id) & ((1 << 128) - 1))
|
||||||
|
try:
|
||||||
|
tgt_uuid = UUID(target_id) if '-' in target_id else UUID(int=hash(target_id) & ((1 << 128) - 1))
|
||||||
|
except:
|
||||||
|
tgt_uuid = UUID(int=hash(target_id) & ((1 << 128) - 1))
|
||||||
|
await neo4j_adapter.add_edge(src_uuid, tgt_uuid, rel_type, props)
|
||||||
|
neo4j_time_edge_single_merge = time.perf_counter() - start
|
||||||
|
|
||||||
|
print(f"Edge Ingestion Single (Merge - {half_edges} existing, {len(new_edge_ids)} new): AGE={age_time_edge_single_merge:.4f}s, Neo4j={neo4j_time_edge_single_merge:.4f}s")
|
||||||
|
|
||||||
await age_adapter.drop_graph(recreate=True)
|
await age_adapter.drop_graph(recreate=True)
|
||||||
await neo4j_adapter.delete_graph()
|
await neo4j_adapter.delete_graph()
|
||||||
|
|
||||||
|
|
@ -126,6 +175,58 @@ async def main():
|
||||||
|
|
||||||
print(f"Node Ingestion Batch (Merge - {half} existing, {len(new_nodes)} new): AGE={age_time_batch_merge:.4f}s, Neo4j={neo4j_time_batch_merge:.4f}s")
|
print(f"Node Ingestion Batch (Merge - {half} existing, {len(new_nodes)} new): AGE={age_time_batch_merge:.4f}s, Neo4j={neo4j_time_batch_merge:.4f}s")
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
for i in range(0, len(edges), 100):
|
||||||
|
await age_adapter.add_edges(edges[i:i+100])
|
||||||
|
age_time_edge_batch_new = time.perf_counter() - start
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
for i in range(0, len(edges), 100):
|
||||||
|
batch = edges[i:i+100]
|
||||||
|
def to_uuid(s):
|
||||||
|
try:
|
||||||
|
return UUID(s) if '-' in s else UUID(int=hash(s) & ((1 << 128) - 1))
|
||||||
|
except:
|
||||||
|
return UUID(int=hash(s) & ((1 << 128) - 1))
|
||||||
|
edge_tuples = [(to_uuid(src), to_uuid(tgt), rel_type, props)
|
||||||
|
for src, tgt, rel_type, props in batch]
|
||||||
|
await neo4j_adapter.add_edges(edge_tuples)
|
||||||
|
neo4j_time_edge_batch_new = time.perf_counter() - start
|
||||||
|
|
||||||
|
print(f"Edge Ingestion Batch (New): AGE={age_time_edge_batch_new:.4f}s, Neo4j={neo4j_time_edge_batch_new:.4f}s")
|
||||||
|
|
||||||
|
for i in range(0, len(existing_edges), 100):
|
||||||
|
await age_adapter.add_edges(existing_edges[i:i+100])
|
||||||
|
batch = existing_edges[i:i+100]
|
||||||
|
def to_uuid(s):
|
||||||
|
try:
|
||||||
|
return UUID(s) if '-' in s else UUID(int=hash(s) & ((1 << 128) - 1))
|
||||||
|
except:
|
||||||
|
return UUID(int=hash(s) & ((1 << 128) - 1))
|
||||||
|
edge_tuples = [(to_uuid(src), to_uuid(tgt), rel_type, props)
|
||||||
|
for src, tgt, rel_type, props in batch]
|
||||||
|
await neo4j_adapter.add_edges(edge_tuples)
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
for i in range(0, len(merge_edges), 100):
|
||||||
|
await age_adapter.add_edges(merge_edges[i:i+100])
|
||||||
|
age_time_edge_batch_merge = time.perf_counter() - start
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
for i in range(0, len(merge_edges), 100):
|
||||||
|
batch = merge_edges[i:i+100]
|
||||||
|
def to_uuid(s):
|
||||||
|
try:
|
||||||
|
return UUID(s) if '-' in s else UUID(int=hash(s) & ((1 << 128) - 1))
|
||||||
|
except:
|
||||||
|
return UUID(int=hash(s) & ((1 << 128) - 1))
|
||||||
|
edge_tuples = [(to_uuid(src), to_uuid(tgt), rel_type, props)
|
||||||
|
for src, tgt, rel_type, props in batch]
|
||||||
|
await neo4j_adapter.add_edges(edge_tuples)
|
||||||
|
neo4j_time_edge_batch_merge = time.perf_counter() - start
|
||||||
|
|
||||||
|
print(f"Edge Ingestion Batch (Merge - {half_edges} existing, {len(new_edge_ids)} new): AGE={age_time_edge_batch_merge:.4f}s, Neo4j={neo4j_time_edge_batch_merge:.4f}s")
|
||||||
|
|
||||||
await age_adapter.close()
|
await age_adapter.close()
|
||||||
await neo4j_adapter.driver.close()
|
await neo4j_adapter.driver.close()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue