#!/usr/bin/env python3
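"""Categorize unmerged upstream commits by subsystem.

Reads a one-commit-per-line listing (hash, author date, author, subject),
assigns each commit the first matching category from an ordered list of
regexes, and writes a CSV mapping plus a markdown summary of the counts.
"""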

import re
from pathlib import Path

infile = Path("docs/diff_hku/unmerged_upstream_commits.txt")
out_csv = Path("docs/diff_hku/unmerged_upstream_mapping.csv")
summary_md = Path("docs/diff_hku/unmerged_upstream_mapping_summary.md")

if not infile.exists():
    raise SystemExit(f"ERROR: input file not found: {infile}")

text = infile.read_text()
lines = [line.strip() for line in text.splitlines() if line.strip()]
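
# Ordered (category, regex) pairs: a commit line gets the first category whose
# regex matches anywhere in it, so more specific buckets come before broad ones.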
patterns = [
    (
        "chunking",
        re.compile(
            r"chunk|chunking|chunk_token|chunk_size|chunk_top_k|ChunkTokenLimit|recursive split|top_n",
            re.I,
        ),
    ),
    (
        "ingestion",
        re.compile(
            r"duplicate|dedup|deduplication|insert|insertion|track_id|content dedup|ingest|ingestion",
            re.I,
        ),
    ),
    (
        "embedding",
        re.compile(
            r"embedding|embed|jina_embed|embedding provider|embedding_model|embedding_token",
            re.I,
        ),
    ),
    (
        "llm_cloud",
        re.compile(
            r"openai|ollama|azure|bedrock|gemini|llm|structured output|parsed", re.I
        ),
    ),
    (
        "postgres",
        re.compile(
            r"postgres|vchordrq|AGE|age|postgres_impl|set_config|row level security|rls",
            re.I,
        ),
    ),
    ("pdf", re.compile(r"pdf|pypdf|PyPDF2|pdf-decrypt|decryption", re.I)),
    ("docx", re.compile(r"docx|DOCX", re.I)),
    ("xlsx", re.compile(r"xlsx|excel|sheet\.max_column", re.I)),
    (
        "webui",
        re.compile(r"webui|lightrag_webui|vite|react|i18next|plugin-react-swc", re.I),
    ),
    (
        "tests",
        re.compile(
            r"\btest\b|e2e|pytest|workspace_isolation|integration|coverage|CI|github actions|actions|ruff",
            re.I,
        ),
    ),
    (
        "ci",
        re.compile(
            r"workflow|tests\.yml|docker-build|docker-compose|docker-build-push", re.I
        ),
    ),
    ("dependabot", re.compile(r"dependabot|bump", re.I)),
    ("json", re.compile(r"json|sanitiz|sanitize|sanitizer|UTF-8", re.I)),
    (
        "tools",
        re.compile(
            r"clean_llm_query_cache|migrate_llm|download_cache|clean_llm|lightrag-clean",
            re.I,
        ),
    ),
    (
        "workspace",
        re.compile(
            r"workspace|namespace|NamespaceLock|pipeline status|pipeline|isolation|auto-initialize",
            re.I,
        ),
    ),
    ("katex", re.compile(r"katex|KaTeX|mhchem|copy-tex", re.I)),
    (
        "security",
        re.compile(
            r"auth|token|secret|passlib|bcrypt|security|super_admin|has_tenant_access|membership",
            re.I,
        ),
    ),
    (
        "storage",
        re.compile(
            r"faiss|milvus|qdrant|redis|neo4j|memgraph|nano_vector|mongo|vector_db|vchordrq",
            re.I,
        ),
    ),
    ("rerank", re.compile(r"rerank|cohere|reranker|rerank_chunking", re.I)),
    (
        "docs",
        re.compile(
            r"Readme|README|Documentation|docs/|FrontendBuildGuide|OfflineDeployment|UV_LOCK_GUIDE|evaluation",
            re.I,
        ),
    ),
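    # Catch-all entry: must remain last. The classification loop skips it via
    # patterns[:-1] and applies "misc" as an explicit fallback instead.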
("misc", re.compile(r".*", re.I)),
|
|
]
|
|

rows = []
counts = {}
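
# Each input line is expected to be "<hash> <date> <author> <subject>";
# the author field is assumed to be a single token (no spaces).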
for line in lines:
    # Pad so that short or malformed lines still unpack cleanly.
    parts = line.split(" ", 3)
    h, date, author, subject = (parts + [""] * 4)[:4]

    # First matching category wins; the trailing catch-all is skipped here.
    assigned = None
    for name, regex in patterns[:-1]:
        if regex.search(line):
            assigned = name
            break
    if not assigned:
        assigned = "misc"

    rows.append((h, date, author, subject, assigned))
    counts[assigned] = counts.get(assigned, 0) + 1

# Write the per-commit mapping as CSV. Only the subject is quoted (with
# internal quotes doubled); hash, date, and author are assumed comma-free.
with out_csv.open("w") as fh:
    fh.write("commit,auth_date,author,subject,category\n")
    for h, date, author, subject, cat in rows:
        subject = subject.replace('"', '""')
        fh.write(f'{h},{date},{author},"{subject}",{cat}\n')

# Write a markdown summary with per-category counts, largest first.
with summary_md.open("w") as fh:
    fh.write("# Upstream commit mapping summary\n\n")
    fh.write(f"- total upstream commits mapped: {len(lines)}\n\n")
    for k, v in sorted(counts.items(), key=lambda x: -x[1]):
        fh.write(f"- {k}: {v} commits\n")

print(f"Done: wrote {out_csv} and {summary_md}")
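
# A plausible way to produce the input listing (assumed; adjust the revision
# range to your remotes). "%ae" keeps the author field a single token:
#   git log --date=short --format='%h %ad %ae %s' main..upstream/main \
#       > docs/diff_hku/unmerged_upstream_commits.txt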