#!/usr/bin/env python3
"""
Split docs/diff_hku/cherry_pick_ordered.csv into per-wave CSVs under docs/diff_hku/waves/

Wave definitions (category groups):
  Wave 0: security, postgres, storage, ci (DB safety & infra)
  Wave 1: tests, workspace, chunking, ingestion (correctness & pipeline)
  Wave 2: embedding, llm_cloud, rerank (providers)
  Wave 3: json, pdf, docx, katex, xlsx (data formats)
  Wave 4: dependabot, webui, misc, docs, other (low-risk churn)

For every wave, a ``wave_<n>.csv`` file and an executable ``apply_wave_<n>.sh``
script (one ``git cherry-pick -x`` per commit) are written.

Usage:
  python scripts/split_cherrypick_waves.py
"""

import csv
import shlex
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "docs" / "diff_hku" / "cherry_pick_ordered.csv"
OUT_DIR = ROOT / "docs" / "diff_hku" / "waves"

WAVE_CATEGORIES = {
    0: {"security", "postgres", "storage", "ci"},
    1: {"tests", "workspace", "chunking", "ingestion"},
    2: {"embedding", "llm_cloud", "rerank"},
    3: {"json", "pdf", "docx", "katex", "xlsx"},
    4: {"dependabot", "webui", "misc", "docs", "other"},
}

# Wave that receives rows whose category is blank or not listed above.
DEFAULT_WAVE = 4


def _field(row, key):
    """Return ``row[key]`` as a string, mapping a missing or ``None`` value to ``""``.

    ``csv.DictReader`` stores ``None`` for fields absent from a short row, so
    ``row.get(key, "")`` alone is not enough to guarantee a string.
    """
    return row.get(key) or ""


def _bucket_rows(rows):
    """Group CSV rows by wave id; blank or unknown categories go to ``DEFAULT_WAVE``."""
    cat_to_wave = {
        cat: wave_id for wave_id, cats in WAVE_CATEGORIES.items() for cat in cats
    }
    wave_rows = {wave_id: [] for wave_id in WAVE_CATEGORIES}
    for row in rows:
        cat = _field(row, "category").strip() or "other"
        wave_rows[cat_to_wave.get(cat, DEFAULT_WAVE)].append(row)
    return wave_rows


def _render_apply_script(wave_id, rows):
    """Return the text of the bash script that cherry-picks *rows* in order.

    Commit hashes and subjects come from the CSV (ultimately from upstream
    commit messages), so both are passed through ``shlex.quote`` to keep
    ``$(...)``, backticks and similar from being interpreted by the shell.
    Rows without a commit hash are skipped, since there is nothing to pick.
    """
    parts = [
        "#!/usr/bin/env bash\n",
        f"# Auto-generated script to apply Wave {wave_id} commits\n",
        "set -e\n\n",
    ]
    for row in rows:
        commit = _field(row, "commit").strip()
        if not commit:
            continue
        message = f"Cherry-picking {commit}: {_field(row, 'subject')}"
        parts.append(f"echo {shlex.quote(message)}\n")
        parts.append(f"git cherry-pick -x {shlex.quote(commit)}\n\n")
    return "".join(parts)


def main():
    """Split the ordered cherry-pick CSV into per-wave CSVs and apply scripts.

    Returns:
        0 on success, 1 when the source CSV is missing or has no header row.
    """
    if not SRC.exists():
        print("Source CSV not found:", SRC)
        return 1

    # Read all rows
    with SRC.open("r", newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        fieldnames = reader.fieldnames
        rows = list(reader)

    # An empty file yields no header; DictWriter cannot work without one.
    if not fieldnames:
        print("Source CSV has no header row:", SRC)
        return 1

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    wave_rows = _bucket_rows(rows)
    wave_ids = sorted(WAVE_CATEGORIES)

    # Write per-wave CSVs
    for wave_id in wave_ids:
        out_path = OUT_DIR / f"wave_{wave_id}.csv"
        with out_path.open("w", newline="", encoding="utf-8") as fh:
            writer = csv.DictWriter(fh, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(wave_rows[wave_id])
        print(f"Wave {wave_id}: {len(wave_rows[wave_id])} commits → {out_path}")

    # Also write a shell script per wave for convenience
    for wave_id in wave_ids:
        script_path = OUT_DIR / f"apply_wave_{wave_id}.sh"
        script_path.write_text(
            _render_apply_script(wave_id, wave_rows[wave_id]), encoding="utf-8"
        )
        script_path.chmod(0o755)
        print(f" → shell script: {script_path}")

        skipped = sum(
            1 for row in wave_rows[wave_id] if not _field(row, "commit").strip()
        )
        if skipped:
            print(
                f"Warning: wave {wave_id}: skipped {skipped} row(s) with no commit hash",
                file=sys.stderr,
            )

    print("\nDone.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())