LightRAG/scripts/generate_cherrypick_order.py
2025-12-04 19:13:48 +08:00

127 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""
Generate an ordered cherry-pick CSV from docs/diff_hku/unmerged_upstream_mapping.csv
Ordering rule: primary = category priority (safety-first), secondary = chronological by auth_date (oldest first), tertiary = commit hash (tie-breaker).
Output: docs/diff_hku/cherry_pick_ordered.csv with columns:
commit,auth_date,author,subject,category,priority_idx,git_cherry_pick_cmd
Usage:
python scripts/generate_cherrypick_order.py
"""
import csv
from pathlib import Path
from datetime import datetime
# Repository root: parents[0] is the directory holding this script, parents[1] is the one above it.
ROOT = Path(__file__).resolve().parents[1]
# Input mapping CSV that main() reads.
SRC = ROOT.joinpath("docs", "diff_hku", "unmerged_upstream_mapping.csv")
# Ordered CSV that main() writes.
OUT = ROOT.joinpath("docs", "diff_hku", "cherry_pick_ordered.csv")
# Category names in cherry-pick order: a category's position in this list is
# its priority index, so earlier entries are picked first.
DEFAULT_PRIORITY_ORDER = [
    # Wave 0 - security and DB safety
    "security", "postgres", "storage", "ci", "tests",
    # workspace and data safety
    "workspace", "chunking", "ingestion",
    # embeddings / llm providers
    "embedding", "llm_cloud", "rerank",
    # docs and misc
    "json", "pdf", "docx", "katex", "dependabot", "webui", "misc", "docs", "other",
]
def build_priority_map(order_list):
    """Map each category name to the index of its first occurrence in *order_list*.

    Args:
        order_list: Iterable of category names, highest priority first.

    Returns:
        dict: ``{name: index}``. A name that appears more than once keeps the
        index of its first appearance. Names absent from *order_list* are not
        in the mapping; the caller decides where those sort.
    """
    positions = {}
    for position, category in enumerate(order_list):
        # setdefault only stores when the key is new, so the earliest index wins.
        positions.setdefault(category, position)
    return positions
def parse_date(s: str):
    """Parse a commit date string into a naive UTC ``datetime`` usable as a sort key.

    Layouts are tried in this order:

    1. anything ``datetime.fromisoformat`` accepts, e.g. ``2025-12-04T19:13:48+08:00``;
    2. ``YYYY-MM-DD HH:MM:SS +ZZZZ`` (date, time, then a space-separated UTC
       offset, as produced by git's ``%ai`` placeholder);
    3. a bare ``YYYY-MM-DD`` date.

    Every return value is timezone-naive. Values that carry a UTC offset are
    shifted to UTC and then stripped of their tzinfo; values without an offset
    are returned as parsed. Returning one kind of datetime matters because
    Python refuses to order naive against aware datetimes, so a mix would make
    a sort over these values raise ``TypeError``.

    Args:
        s: Raw date text. Surrounding whitespace is ignored.

    Returns:
        datetime: The parsed timestamp (naive, UTC when an offset was given),
        or ``datetime.min`` when *s* is not a string or matches no layout, so
        unparseable dates sort first.
    """
    if not isinstance(s, str):
        return datetime.min
    text = s.strip()

    parsed = None
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        # Fall back to explicit layouts that fromisoformat may reject.
        for fmt in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d"):
            try:
                parsed = datetime.strptime(text, fmt)
            except ValueError:
                continue
            break

    if parsed is None:
        return datetime.min

    offset = parsed.utcoffset()
    if offset is None:
        # Already naive: nothing to normalise.
        return parsed
    try:
        # Subtracting the offset moves the wall-clock time to UTC.
        return (parsed - offset).replace(tzinfo=None)
    except OverflowError:
        # Only reachable at the very edge of the representable range; keep the
        # wall-clock value rather than failing.
        return parsed.replace(tzinfo=None)
def main():
    """Read the mapping CSV at ``SRC`` and write it to ``OUT`` in cherry-pick order.

    Rows are sorted by category priority (position in ``DEFAULT_PRIORITY_ORDER``;
    categories not listed there sort after all listed ones), then by parsed
    ``auth_date`` (oldest first), then by commit hash as a tie-breaker. A blank
    or missing category is treated as ``"other"``.

    Rows without a commit hash are skipped and counted in a printed notice:
    they cannot produce a usable ``git cherry-pick`` command, and a missing
    value in the sort key could make the sort fail.

    Returns:
        int: ``0`` after the output file is written, ``1`` when ``SRC`` does
        not exist (nothing is written in that case).
    """
    if not SRC.exists():
        print("Source mapping CSV not found at", SRC)
        return 1

    priority_map = build_priority_map(DEFAULT_PRIORITY_ORDER)
    # Index for categories missing from the priority list. It does not depend
    # on the row, so compute it once; default=-1 keeps an empty list from raising.
    unknown_idx = max(priority_map.values(), default=-1) + 1

    rows = []
    skipped = 0
    # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise become part of the first header name.
    with SRC.open("r", newline="", encoding="utf-8-sig") as fh:
        for r in csv.DictReader(fh):
            commit = (r.get("commit") or "").strip()
            if not commit:
                skipped += 1
                continue
            cat = (r.get("category") or "").strip() or "other"
            rows.append({
                "commit": commit,
                "auth_date": r.get("auth_date"),
                "author": r.get("author"),
                "subject": r.get("subject"),
                "category": cat,
                "priority_idx": priority_map.get(cat, unknown_idx),
                # Parsed copy used only for ordering; the raw text is what gets written.
                "date_val": parse_date((r.get("auth_date") or "").strip()),
            })

    # Sort by priority_idx, then date_val, then commit.
    rows.sort(key=lambda x: (x["priority_idx"], x["date_val"], x["commit"]))

    OUT.parent.mkdir(parents=True, exist_ok=True)
    with OUT.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow([
            "commit",
            "auth_date",
            "author",
            "subject",
            "category",
            "priority_idx",
            "git_cherry_pick_cmd",
        ])
        for r in rows:
            writer.writerow([
                r["commit"],
                r["auth_date"],
                r["author"],
                r["subject"],
                r["category"],
                r["priority_idx"],
                f"git cherry-pick {r['commit']}",
            ])

    if skipped:
        print(f"Skipped {skipped} row(s) without a commit hash")
    print("Wrote ordered cherry-pick CSV to:", OUT)
    return 0
# Script entry point: when executed directly, run main() and use its return
# value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())