Fix MinerU API output for sanitized filenames

This commit is contained in:
少卿 2025-12-03 15:32:35 +08:00
parent b44e65a12e
commit 15279fc8d9

View file

@ -62,6 +62,28 @@ class MinerUParser(RAGFlowPdfParser):
self.outlines = [] self.outlines = []
self.logger = logging.getLogger(self.__class__.__name__) self.logger = logging.getLogger(self.__class__.__name__)
@staticmethod
def _sanitize_output_name(name: str) -> str:
    """Build a filesystem-friendly variant of *name*, preserving CJK text.

    Intended to approximate the sanitizing done by the MinerU API. The
    surrounding whitespace is trimmed, every character that is not a word
    character, ``.``, ``-`` or a CJK ideograph (U+4E00..U+9FFF) becomes
    ``_``, runs of ``_`` are collapsed to one, and leading/trailing ``_``
    are removed.

    Args:
        name: The raw name, typically a file stem.

    Returns:
        The sanitized name, or the original *name* untouched (not even
        trimmed) when sanitizing would leave nothing.
    """
    trimmed = name.strip()
    replaced = re.sub(r"[^\w.\-\u4e00-\u9fff]", "_", trimmed)
    collapsed = re.sub(r"_+", "_", replaced)
    result = collapsed.strip("_")
    if not result:
        # Nothing usable survived; hand back the caller's input as-is.
        return name
    return result
@staticmethod
def _detect_zip_root_dir(zip_path: Path) -> Optional[str]:
    """Return the single top-level directory that wraps every archive entry.

    A name is reported only when all entries of the archive live under one
    and the same top-level directory. Root-level files are never mistaken
    for a directory: an archive containing just ``result.json`` has no
    root directory.

    Args:
        zip_path: Location of the zip archive to inspect.

    Returns:
        The directory name without a trailing slash, or ``None`` when the
        archive is empty, has files at its top level, has several
        top-level directories, or cannot be opened as a zip file.
    """
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            entries = zip_ref.namelist()
    except (OSError, zipfile.BadZipFile):
        # Best-effort probe: a missing or unreadable archive simply means
        # no root directory could be detected.
        return None

    root: Optional[str] = None
    for entry in entries:
        trimmed = entry.rstrip("/")
        if not trimmed:
            continue
        top, sep, _rest = trimmed.partition("/")
        # An entry sits inside a directory when it has a path separator,
        # or is itself an explicit directory entry (trailing slash).
        inside_directory = bool(sep) or entry.endswith("/")
        if not top or not inside_directory:
            # Root-level file, or a name starting with "/": no wrapper dir.
            return None
        if root is None:
            root = top
        elif top != root:
            # More than one top-level directory: no unique root.
            return None
    return root
def _extract_zip_no_root(self, zip_path, extract_to, root_dir): def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
with zipfile.ZipFile(zip_path, "r") as zip_ref: with zipfile.ZipFile(zip_path, "r") as zip_ref:
if not root_dir: if not root_dir:
@ -72,7 +94,7 @@ class MinerUParser(RAGFlowPdfParser):
root_dir = None root_dir = None
if not root_dir or not root_dir.endswith("/"): if not root_dir or not root_dir.endswith("/"):
self.logger.info(f"[MinerU] No root directory found, extracting all...fff{root_dir}") self.logger.info(f"[MinerU] No root directory found, extracting all...")
zip_ref.extractall(extract_to) zip_ref.extractall(extract_to)
return return
@ -182,15 +204,21 @@ class MinerUParser(RAGFlowPdfParser):
return False, reason return False, reason
def _run_mineru(
    self,
    input_path: Path,
    output_dir: Path,
    method: str = "auto",
    backend: str = "pipeline",
    lang: Optional[str] = None,
    server_url: Optional[str] = None,
    callback: Optional[Callable] = None,
) -> str:
    """Run MinerU through the API or the local executable.

    The route is selected by ``self.using_api``. The result of the chosen
    helper is passed back to the caller, which uses it as the name of the
    output root when reading the results.

    Args:
        input_path: PDF to process.
        output_dir: Directory that receives MinerU's output.
        method: Parse method forwarded to the helper.
        backend: Backend name forwarded to the helper.
        lang: Optional language hint forwarded to the helper.
        server_url: Forwarded to the executable route only; the API route
            does not take it.
        callback: Optional progress callback forwarded to the helper.

    Returns:
        Whatever the selected helper returns (the output root name).
    """
    if self.using_api:
        return self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
    return self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None): def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None) -> str:
output_zip_path = os.path.join(str(output_dir), "output.zip") output_zip_path = Path(output_dir) / "output.zip"
pdf_file_path = str(input_path) pdf_file_path = str(input_path)
@ -198,8 +226,8 @@ class MinerUParser(RAGFlowPdfParser):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}") raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip() pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, method) sanitized_file_name = self._sanitize_output_name(pdf_file_name)
os.makedirs(output_path, exist_ok=True) chosen_root = sanitized_file_name or pdf_file_name
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")} files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
@ -238,8 +266,14 @@ class MinerUParser(RAGFlowPdfParser):
with open(output_zip_path, "wb") as f: with open(output_zip_path, "wb") as f:
f.write(response.content) f.write(response.content)
self.logger.info(f"[MinerU] Unzip to {output_path}...") zip_root = self._detect_zip_root_dir(output_zip_path)
self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/") chosen_root = zip_root or sanitized_file_name or pdf_file_name
root_dir_prefix = (zip_root + "/") if zip_root and not zip_root.endswith("/") else (zip_root if zip_root else f"{chosen_root}/")
output_path = Path(output_dir) / chosen_root
output_path.mkdir(parents=True, exist_ok=True)
self.logger.info(f"[MinerU] Unzip to {output_path} (root {root_dir_prefix})...")
self._extract_zip_no_root(output_zip_path, output_path, root_dir_prefix)
if callback: if callback:
callback(0.40, f"[MinerU] Unzip to {output_path}...") callback(0.40, f"[MinerU] Unzip to {output_path}...")
@ -248,10 +282,11 @@ class MinerUParser(RAGFlowPdfParser):
except Exception as e: except Exception as e:
raise RuntimeError(f"[MinerU] api failed with exception {e}") raise RuntimeError(f"[MinerU] api failed with exception {e}")
self.logger.info("[MinerU] Api completed successfully.") self.logger.info("[MinerU] Api completed successfully.")
return chosen_root
def _run_mineru_executable( def _run_mineru_executable(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
): ) -> str:
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method] cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
if backend: if backend:
cmd.extend(["-b", backend]) cmd.extend(["-b", backend])
@ -303,6 +338,7 @@ class MinerUParser(RAGFlowPdfParser):
if return_code != 0: if return_code != 0:
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}") raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
self.logger.info("[MinerU] Command completed successfully.") self.logger.info("[MinerU] Command completed successfully.")
return input_path.stem
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None): def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
self.page_from = page_from self.page_from = page_from
@ -458,37 +494,81 @@ class MinerUParser(RAGFlowPdfParser):
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss return poss
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline", actual_root: Optional[str] = None) -> list[dict[str, Any]]:
candidates = [] candidates: list[Path] = []
seen = set() seen: set[Path] = set()
attempted: list[Path] = []
def add_candidate_path(p: Path): def add_candidate_path(p: Path):
if p not in seen: if p not in seen:
seen.add(p) seen.add(p)
candidates.append(p) candidates.append(p)
if backend.startswith("vlm-"): candidate_roots: list[str] = []
add_candidate_path(output_dir / file_stem / "vlm")
if method: def add_candidate_root(name: Optional[str]):
add_candidate_path(output_dir / file_stem / method) if not name:
add_candidate_path(output_dir / file_stem / "auto") return
else: root_name = Path(str(name)).name
if method: if root_name not in candidate_roots:
add_candidate_path(output_dir / file_stem / method) candidate_roots.append(root_name)
add_candidate_path(output_dir / file_stem / "vlm")
add_candidate_path(output_dir / file_stem / "auto") add_candidate_root(file_stem)
sanitized_stem = self._sanitize_output_name(file_stem)
add_candidate_root(sanitized_stem)
add_candidate_root(actual_root)
if not candidate_roots:
candidate_roots.append(file_stem)
for root in candidate_roots:
base = output_dir / root
if backend.startswith("vlm-"):
add_candidate_path(base / "vlm")
if method:
add_candidate_path(base / method)
add_candidate_path(base / "auto")
else:
if method:
add_candidate_path(base / method)
add_candidate_path(base / "vlm")
add_candidate_path(base / "auto")
add_candidate_path(base)
candidate_file_stems: list[str] = []
def add_file_stem(name: Optional[str]):
if not name:
return
stem_name = Path(str(name)).stem
if stem_name not in candidate_file_stems:
candidate_file_stems.append(stem_name)
add_file_stem(file_stem)
add_file_stem(sanitized_stem)
add_file_stem(actual_root)
json_file = None json_file = None
subdir = None subdir = None
for sub in candidates: for sub in candidates:
jf = sub / f"{file_stem}_content_list.json" for stem in candidate_file_stems:
if jf.exists(): jf = sub / f"{stem}_content_list.json"
subdir = sub attempted.append(jf)
json_file = jf if jf.exists():
subdir = sub
json_file = jf
break
if json_file:
break break
if not json_file: if not json_file:
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(c / (file_stem + '_content_list.json')) for c in candidates)}") fallback_matches = sorted(output_dir.glob("**/*_content_list.json"))
if fallback_matches:
json_file = fallback_matches[0]
subdir = json_file.parent
self.logger.info(f"[MinerU] Fallback located content list at {json_file}")
else:
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
with open(json_file, "r", encoding="utf-8") as f: with open(json_file, "r", encoding="utf-8") as f:
data = json.load(f) data = json.load(f)
@ -588,8 +668,8 @@ class MinerUParser(RAGFlowPdfParser):
self.__images__(pdf, zoomin=1) self.__images__(pdf, zoomin=1)
try: try:
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback) output_root = self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend) outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend, actual_root=output_root)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback: if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")