Fixing inefficient implementation for batch addition

hajdul88 2026-01-12 18:35:08 +01:00
parent c021e832e7
commit e6f2aa1bfb
2 changed files with 151 additions and 46 deletions


@@ -272,48 +272,55 @@ class ApacheAGEAdapter:
         if not nodes:
             return
-        # Process in batches of 100
+        # Group nodes by label for efficient batch processing
+        nodes_by_label = {}
+        for node_id, labels, properties in nodes:
+            label_key = ':'.join(labels) if labels else "Node"
+            if label_key not in nodes_by_label:
+                nodes_by_label[label_key] = []
+            nodes_by_label[label_key].append((node_id, labels, properties))
+
+        # Process each label group in batches
         BATCH_SIZE = 100
-        for i in range(0, len(nodes), BATCH_SIZE):
-            batch = nodes[i:i + BATCH_SIZE]
-            node_data_list = []
-            for node_id, labels, properties in batch:
-                props = {"id": node_id, **properties}
-                props_parts = []
-                for k, v in props.items():
-                    if isinstance(v, str):
-                        props_parts.append(f'{k}: "{v}"')
-                    elif isinstance(v, bool):
-                        props_parts.append(f'{k}: {str(v).lower()}')
-                    elif isinstance(v, (int, float)):
-                        props_parts.append(f'{k}: {v}')
-                    elif v is None:
-                        props_parts.append(f'{k}: null')
-                    else:
-                        props_parts.append(f'{k}: "{json.dumps(v)}"')
-                props_str = '{' + ', '.join(props_parts) + '}'
-                label_str = ':'.join(labels) if labels else "Node"
-                node_data_list.append(f'{{id: "{node_id}", props: {props_str}, label: "{label_str}"}}')
-            unwind_data = '[' + ', '.join(node_data_list) + ']'
-            all_prop_keys = set()
-            for node_id, labels, properties in batch:
-                all_prop_keys.update(properties.keys())
-            all_prop_keys.add('id')
-            set_clauses = [f"n.{key} = node_data.props.{key}" for key in sorted(all_prop_keys)]
-            set_clause = "SET " + ", ".join(set_clauses)
-            common_label = batch[0][1][0] if batch[0][1] else "Node"
-            query = f"""
-            UNWIND {unwind_data} AS node_data
-            MERGE (n {{id: node_data.id}})
-            {set_clause}
-            """
-            await self.execute_cypher(query)
+        for label_key, label_nodes in nodes_by_label.items():
+            for i in range(0, len(label_nodes), BATCH_SIZE):
+                batch = label_nodes[i:i + BATCH_SIZE]
+                node_data_list = []
+                for node_id, labels, properties in batch:
+                    props = {"id": node_id, **properties}
+                    props_parts = []
+                    for k, v in props.items():
+                        if isinstance(v, str):
+                            props_parts.append(f'{k}: "{v}"')
+                        elif isinstance(v, bool):
+                            props_parts.append(f'{k}: {str(v).lower()}')
+                        elif isinstance(v, (int, float)):
+                            props_parts.append(f'{k}: {v}')
+                        elif v is None:
+                            props_parts.append(f'{k}: null')
+                        else:
+                            props_parts.append(f'{k}: "{json.dumps(v)}"')
+                    props_str = '{' + ', '.join(props_parts) + '}'
+                    node_data_list.append(f'{{id: "{node_id}", props: {props_str}}}')
+                unwind_data = '[' + ', '.join(node_data_list) + ']'
+                all_prop_keys = set()
+                for node_id, labels, properties in batch:
+                    all_prop_keys.update(properties.keys())
+                all_prop_keys.add('id')
+                set_clauses = [f"n.{key} = node_data.props.{key}" for key in sorted(all_prop_keys)]
+                set_clause = "SET " + ", ".join(set_clauses) if set_clauses else ""
+                query = f"""
+                UNWIND {unwind_data} AS node_data
+                MERGE (n:{label_key} {{id: node_data.id}})
+                {set_clause}
+                """
+                await self.execute_cypher(query)

     async def get_node(self, node_id: str) -> Optional[NodeData]:
         """
@@ -398,16 +405,14 @@ class ApacheAGEAdapter:
             # Use MERGE to avoid duplicate edges
             query = f"""
-            MATCH (a {{id: '{source_id}'}}), (b {{id: '{target_id}'}})
-            MERGE (a)-[r:{relationship_type}]->(b)
+            MERGE (a {{id: '{source_id}'}})-[r:{relationship_type}]->(b {{id: '{target_id}'}})
             SET r = {{{props_str}}}
             RETURN r
             """
         else:
             # Use MERGE without properties
             query = f"""
-            MATCH (a {{id: '{source_id}'}}), (b {{id: '{target_id}'}})
-            MERGE (a)-[r:{relationship_type}]->(b)
+            MERGE (a {{id: '{source_id}'}})-[r:{relationship_type}]->(b {{id: '{target_id}'}})
             RETURN r
             """
@@ -497,8 +502,7 @@ class ApacheAGEAdapter:
         query = f"""
         UNWIND {values_list} AS edge
-        MATCH (a {{id: edge.src}}), (b {{id: edge.tgt}})
-        MERGE (a)-[r:{rel_type}]->(b)
+        MERGE (a {{id: edge.src}})-[r:{rel_type}]->(b {{id: edge.tgt}})
         {set_clause}
         """


@@ -89,6 +89,55 @@ async def main():
     print(f"Node Ingestion Single (Merge - {half} existing, {len(new_nodes)} new): AGE={age_time_single_merge:.4f}s, Neo4j={neo4j_time_single_merge:.4f}s")
+
+    edges = [(f"node_{i}", f"node_{(i+1) % batch_size}", "CONNECTS", {"weight": 1.0})
+             for i in range(batch_size)]
+
+    start = time.perf_counter()
+    for source_id, target_id, rel_type, props in edges:
+        await age_adapter.add_edge(source_id, target_id, rel_type, props)
+    age_time_edge_single_new = time.perf_counter() - start
+
+    start = time.perf_counter()
+    for source_id, target_id, rel_type, props in edges:
+        try:
+            src_uuid = UUID(source_id) if '-' in source_id else UUID(int=hash(source_id) & ((1 << 128) - 1))
+        except:
+            src_uuid = UUID(int=hash(source_id) & ((1 << 128) - 1))
+        try:
+            tgt_uuid = UUID(target_id) if '-' in target_id else UUID(int=hash(target_id) & ((1 << 128) - 1))
+        except:
+            tgt_uuid = UUID(int=hash(target_id) & ((1 << 128) - 1))
+        await neo4j_adapter.add_edge(src_uuid, tgt_uuid, rel_type, props)
+    neo4j_time_edge_single_new = time.perf_counter() - start
+
+    print(f"Edge Ingestion Single (New): AGE={age_time_edge_single_new:.4f}s, Neo4j={neo4j_time_edge_single_new:.4f}s")
+
+    half_edges = batch_size // 2
+    existing_edges = edges[:half_edges]
+    new_edge_ids = [(f"node_{i}", f"node_{(i+1) % batch_size}", "CONNECTS", {"weight": 1.0})
+                    for i in range(batch_size, batch_size + half_edges)]
+    merge_edges = existing_edges + new_edge_ids
+
+    start = time.perf_counter()
+    for source_id, target_id, rel_type, props in merge_edges:
+        await age_adapter.add_edge(source_id, target_id, rel_type, props)
+    age_time_edge_single_merge = time.perf_counter() - start
+
+    start = time.perf_counter()
+    for source_id, target_id, rel_type, props in merge_edges:
+        try:
+            src_uuid = UUID(source_id) if '-' in source_id else UUID(int=hash(source_id) & ((1 << 128) - 1))
+        except:
+            src_uuid = UUID(int=hash(source_id) & ((1 << 128) - 1))
+        try:
+            tgt_uuid = UUID(target_id) if '-' in target_id else UUID(int=hash(target_id) & ((1 << 128) - 1))
+        except:
+            tgt_uuid = UUID(int=hash(target_id) & ((1 << 128) - 1))
+        await neo4j_adapter.add_edge(src_uuid, tgt_uuid, rel_type, props)
+    neo4j_time_edge_single_merge = time.perf_counter() - start
+
+    print(f"Edge Ingestion Single (Merge - {half_edges} existing, {len(new_edge_ids)} new): AGE={age_time_edge_single_merge:.4f}s, Neo4j={neo4j_time_edge_single_merge:.4f}s")

     await age_adapter.drop_graph(recreate=True)
     await neo4j_adapter.delete_graph()
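
One caveat in the fallback id mapping this script uses throughout: Python salts str hashes per interpreter process (PYTHONHASHSEED), so UUID(int=hash(s) & ((1 << 128) - 1)) yields different UUIDs on every run. That is harmless while the graph is dropped between runs, but a deterministic mapping is safer if ids ever need to be reproducible. A sketch, not part of this commit:

    import uuid

    def to_uuid(s: str) -> uuid.UUID:
        """Deterministically map an arbitrary string id to a UUID."""
        try:
            return uuid.UUID(s)  # the id is already a valid UUID string
        except ValueError:
            # uuid5 is stable across interpreter runs, unlike hash(),
            # whose result for str inputs is salted per process.
            return uuid.uuid5(uuid.NAMESPACE_DNS, s)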
@@ -126,6 +175,58 @@ async def main():
     print(f"Node Ingestion Batch (Merge - {half} existing, {len(new_nodes)} new): AGE={age_time_batch_merge:.4f}s, Neo4j={neo4j_time_batch_merge:.4f}s")
+
+    start = time.perf_counter()
+    for i in range(0, len(edges), 100):
+        await age_adapter.add_edges(edges[i:i+100])
+    age_time_edge_batch_new = time.perf_counter() - start
+
+    start = time.perf_counter()
+    for i in range(0, len(edges), 100):
+        batch = edges[i:i+100]
+        def to_uuid(s):
+            try:
+                return UUID(s) if '-' in s else UUID(int=hash(s) & ((1 << 128) - 1))
+            except:
+                return UUID(int=hash(s) & ((1 << 128) - 1))
+        edge_tuples = [(to_uuid(src), to_uuid(tgt), rel_type, props)
+                       for src, tgt, rel_type, props in batch]
+        await neo4j_adapter.add_edges(edge_tuples)
+    neo4j_time_edge_batch_new = time.perf_counter() - start
+
+    print(f"Edge Ingestion Batch (New): AGE={age_time_edge_batch_new:.4f}s, Neo4j={neo4j_time_edge_batch_new:.4f}s")
+
+    for i in range(0, len(existing_edges), 100):
+        await age_adapter.add_edges(existing_edges[i:i+100])
+        batch = existing_edges[i:i+100]
+        def to_uuid(s):
+            try:
+                return UUID(s) if '-' in s else UUID(int=hash(s) & ((1 << 128) - 1))
+            except:
+                return UUID(int=hash(s) & ((1 << 128) - 1))
+        edge_tuples = [(to_uuid(src), to_uuid(tgt), rel_type, props)
+                       for src, tgt, rel_type, props in batch]
+        await neo4j_adapter.add_edges(edge_tuples)
+
+    start = time.perf_counter()
+    for i in range(0, len(merge_edges), 100):
+        await age_adapter.add_edges(merge_edges[i:i+100])
+    age_time_edge_batch_merge = time.perf_counter() - start
+
+    start = time.perf_counter()
+    for i in range(0, len(merge_edges), 100):
+        batch = merge_edges[i:i+100]
+        def to_uuid(s):
+            try:
+                return UUID(s) if '-' in s else UUID(int=hash(s) & ((1 << 128) - 1))
+            except:
+                return UUID(int=hash(s) & ((1 << 128) - 1))
+        edge_tuples = [(to_uuid(src), to_uuid(tgt), rel_type, props)
+                       for src, tgt, rel_type, props in batch]
+        await neo4j_adapter.add_edges(edge_tuples)
+    neo4j_time_edge_batch_merge = time.perf_counter() - start
+
+    print(f"Edge Ingestion Batch (Merge - {half_edges} existing, {len(new_edge_ids)} new): AGE={age_time_edge_batch_merge:.4f}s, Neo4j={neo4j_time_edge_batch_merge:.4f}s")

     await age_adapter.close()
     await neo4j_adapter.driver.close()
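
The script repeats the start = time.perf_counter() / elapsed pattern for every scenario. A small helper could collapse that; the sketch below uses invented names and references edges and age_adapter from the script's scope, and is not part of this commit:

    import time
    from typing import Awaitable, Callable

    async def timed(label: str, fn: Callable[[], Awaitable[None]]) -> float:
        """Run an async callable once, print and return its wall-clock time."""
        start = time.perf_counter()
        await fn()
        elapsed = time.perf_counter() - start
        print(f"{label}: {elapsed:.4f}s")
        return elapsed

    # Example: time the AGE batched edge ingestion in chunks of 100.
    async def ingest_age_edges():
        for i in range(0, len(edges), 100):
            await age_adapter.add_edges(edges[i:i+100])

    age_time_edge_batch_new = await timed("Edge Ingestion Batch (New): AGE", ingest_age_edges)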