Update mineru_parser.py
This commit is contained in:
parent
19b5ec3437
commit
680d91276a
1 changed files with 83 additions and 128 deletions
|
|
@ -62,29 +62,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
self.outlines = []
|
self.outlines = []
|
||||||
self.logger = logging.getLogger(self.__class__.__name__)
|
self.logger = logging.getLogger(self.__class__.__name__)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _sanitize_output_name(name: str) -> str:
|
|
||||||
"""Approximate MinerU API sanitize logic while keeping CJK characters."""
|
|
||||||
cleaned = re.sub(r"[^\w.\-\u4e00-\u9fff]", "_", name.strip())
|
|
||||||
cleaned = re.sub(r"_+", "_", cleaned).strip("_")
|
|
||||||
return cleaned or name
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _detect_zip_root_dir(zip_path: Path) -> Optional[str]:
|
|
||||||
try:
|
|
||||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
|
||||||
for entry in zip_ref.namelist():
|
|
||||||
if not entry:
|
|
||||||
continue
|
|
||||||
entry = entry.rstrip("/")
|
|
||||||
if not entry:
|
|
||||||
continue
|
|
||||||
return entry.split("/", 1)[0]
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
||||||
|
self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
|
||||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||||
if not root_dir:
|
if not root_dir:
|
||||||
files = zip_ref.namelist()
|
files = zip_ref.namelist()
|
||||||
|
|
@ -94,7 +73,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
root_dir = None
|
root_dir = None
|
||||||
|
|
||||||
if not root_dir or not root_dir.endswith("/"):
|
if not root_dir or not root_dir.endswith("/"):
|
||||||
self.logger.info(f"[MinerU] No root directory found, extracting all...")
|
self.logger.info(f"[MinerU] No root directory found, extracting all (root_hint={root_dir})")
|
||||||
zip_ref.extractall(extract_to)
|
zip_ref.extractall(extract_to)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
@ -130,7 +109,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
|
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
|
||||||
if backend not in valid_backends:
|
if backend not in valid_backends:
|
||||||
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
||||||
logging.warning(reason)
|
self.logger.warning(reason)
|
||||||
return False, reason
|
return False, reason
|
||||||
|
|
||||||
subprocess_kwargs = {
|
subprocess_kwargs = {
|
||||||
|
|
@ -150,40 +129,40 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
if backend == "vlm-http-client" and server_url:
|
if backend == "vlm-http-client" and server_url:
|
||||||
try:
|
try:
|
||||||
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
|
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
|
||||||
logging.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
|
self.logger.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
|
||||||
if server_accessible:
|
if server_accessible:
|
||||||
self.using_api = False # We are using http client, not API
|
self.using_api = False # We are using http client, not API
|
||||||
return True, reason
|
return True, reason
|
||||||
else:
|
else:
|
||||||
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
|
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
|
||||||
logging.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
|
self.logger.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
|
||||||
return False, reason
|
return False, reason
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"[MinerU] vlm-http-client server check failed: {e}")
|
self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}")
|
||||||
try:
|
try:
|
||||||
response = requests.get(server_url, timeout=5)
|
response = requests.get(server_url, timeout=5)
|
||||||
logging.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
|
self.logger.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
|
||||||
self.using_api = False
|
self.using_api = False
|
||||||
return True, reason
|
return True, reason
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
|
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
|
||||||
logging.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
|
self.logger.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
|
||||||
return False, reason
|
return False, reason
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
|
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
|
||||||
version_info = result.stdout.strip()
|
version_info = result.stdout.strip()
|
||||||
if version_info:
|
if version_info:
|
||||||
logging.info(f"[MinerU] Detected version: {version_info}")
|
self.logger.info(f"[MinerU] Detected version: {version_info}")
|
||||||
else:
|
else:
|
||||||
logging.info("[MinerU] Detected MinerU, but version info is empty.")
|
self.logger.info("[MinerU] Detected MinerU, but version info is empty.")
|
||||||
return True, reason
|
return True, reason
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
logging.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
|
self.logger.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
logging.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
|
self.logger.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"[MinerU] Unexpected error during installation check: {e}")
|
self.logger.error(f"[MinerU] Unexpected error during installation check: {e}")
|
||||||
|
|
||||||
# If executable check fails, try API check
|
# If executable check fails, try API check
|
||||||
try:
|
try:
|
||||||
|
|
@ -193,32 +172,26 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
if not openapi_exists:
|
if not openapi_exists:
|
||||||
reason = "[MinerU] Failed to detect vaild MinerU API server"
|
reason = "[MinerU] Failed to detect vaild MinerU API server"
|
||||||
return openapi_exists, reason
|
return openapi_exists, reason
|
||||||
logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
|
self.logger.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
|
||||||
self.using_api = openapi_exists
|
self.using_api = openapi_exists
|
||||||
return openapi_exists, reason
|
return openapi_exists, reason
|
||||||
else:
|
else:
|
||||||
logging.info("[MinerU] api not exists.")
|
self.logger.info("[MinerU] api not exists.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
reason = f"[MinerU] Unexpected error during api check: {e}"
|
reason = f"[MinerU] Unexpected error during api check: {e}"
|
||||||
logging.error(f"[MinerU] Unexpected error during api check: {e}")
|
self.logger.error(f"[MinerU] Unexpected error during api check: {e}")
|
||||||
return False, reason
|
return False, reason
|
||||||
|
|
||||||
def _run_mineru(
|
def _run_mineru(
|
||||||
self,
|
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
||||||
input_path: Path,
|
):
|
||||||
output_dir: Path,
|
|
||||||
method: str = "auto",
|
|
||||||
backend: str = "pipeline",
|
|
||||||
lang: Optional[str] = None,
|
|
||||||
server_url: Optional[str] = None,
|
|
||||||
callback: Optional[Callable] = None,
|
|
||||||
) -> str:
|
|
||||||
if self.using_api:
|
if self.using_api:
|
||||||
return self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
|
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
|
||||||
return self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
|
else:
|
||||||
|
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
|
||||||
|
|
||||||
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None) -> str:
|
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
||||||
output_zip_path = Path(output_dir) / "output.zip"
|
output_zip_path = os.path.join(str(output_dir), "output.zip")
|
||||||
|
|
||||||
pdf_file_path = str(input_path)
|
pdf_file_path = str(input_path)
|
||||||
|
|
||||||
|
|
@ -226,8 +199,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
|
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
|
||||||
|
|
||||||
pdf_file_name = Path(pdf_file_path).stem.strip()
|
pdf_file_name = Path(pdf_file_path).stem.strip()
|
||||||
sanitized_file_name = self._sanitize_output_name(pdf_file_name)
|
output_path = os.path.join(str(output_dir), pdf_file_name, method)
|
||||||
chosen_root = sanitized_file_name or pdf_file_name
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
|
||||||
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
||||||
|
|
||||||
|
|
@ -266,14 +239,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
with open(output_zip_path, "wb") as f:
|
with open(output_zip_path, "wb") as f:
|
||||||
f.write(response.content)
|
f.write(response.content)
|
||||||
|
|
||||||
zip_root = self._detect_zip_root_dir(output_zip_path)
|
self.logger.info(f"[MinerU] Unzip to {output_path}...")
|
||||||
chosen_root = zip_root or sanitized_file_name or pdf_file_name
|
self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
|
||||||
root_dir_prefix = (zip_root + "/") if zip_root and not zip_root.endswith("/") else (zip_root if zip_root else f"{chosen_root}/")
|
|
||||||
output_path = Path(output_dir) / chosen_root
|
|
||||||
output_path.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
self.logger.info(f"[MinerU] Unzip to {output_path} (root {root_dir_prefix})...")
|
|
||||||
self._extract_zip_no_root(output_zip_path, output_path, root_dir_prefix)
|
|
||||||
|
|
||||||
if callback:
|
if callback:
|
||||||
callback(0.40, f"[MinerU] Unzip to {output_path}...")
|
callback(0.40, f"[MinerU] Unzip to {output_path}...")
|
||||||
|
|
@ -282,11 +249,10 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
||||||
self.logger.info("[MinerU] Api completed successfully.")
|
self.logger.info("[MinerU] Api completed successfully.")
|
||||||
return chosen_root
|
|
||||||
|
|
||||||
def _run_mineru_executable(
|
def _run_mineru_executable(
|
||||||
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
||||||
) -> str:
|
):
|
||||||
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
|
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
|
||||||
if backend:
|
if backend:
|
||||||
cmd.extend(["-b", backend])
|
cmd.extend(["-b", backend])
|
||||||
|
|
@ -338,7 +304,6 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
if return_code != 0:
|
if return_code != 0:
|
||||||
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
|
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
|
||||||
self.logger.info("[MinerU] Command completed successfully.")
|
self.logger.info("[MinerU] Command completed successfully.")
|
||||||
return input_path.stem
|
|
||||||
|
|
||||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||||||
self.page_from = page_from
|
self.page_from = page_from
|
||||||
|
|
@ -350,7 +315,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.page_images = None
|
self.page_images = None
|
||||||
self.total_page = 0
|
self.total_page = 0
|
||||||
logging.exception(e)
|
self.logger.exception(e)
|
||||||
|
|
||||||
def _line_tag(self, bx):
|
def _line_tag(self, bx):
|
||||||
pn = [bx["page_idx"] + 1]
|
pn = [bx["page_idx"] + 1]
|
||||||
|
|
@ -494,81 +459,71 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||||||
return poss
|
return poss
|
||||||
|
|
||||||
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline", actual_root: Optional[str] = None) -> list[dict[str, Any]]:
|
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
|
||||||
candidates: list[Path] = []
|
candidates = []
|
||||||
seen: set[Path] = set()
|
seen = set()
|
||||||
attempted: list[Path] = []
|
|
||||||
|
|
||||||
def add_candidate_path(p: Path):
|
def add_candidate_path(p: Path):
|
||||||
if p not in seen:
|
if p not in seen:
|
||||||
seen.add(p)
|
seen.add(p)
|
||||||
candidates.append(p)
|
candidates.append(p)
|
||||||
|
|
||||||
candidate_roots: list[str] = []
|
if backend.startswith("vlm-"):
|
||||||
|
add_candidate_path(output_dir / file_stem / "vlm")
|
||||||
def add_candidate_root(name: Optional[str]):
|
if method:
|
||||||
if not name:
|
add_candidate_path(output_dir / file_stem / method)
|
||||||
return
|
add_candidate_path(output_dir / file_stem / "auto")
|
||||||
root_name = Path(str(name)).name
|
else:
|
||||||
if root_name not in candidate_roots:
|
if method:
|
||||||
candidate_roots.append(root_name)
|
add_candidate_path(output_dir / file_stem / method)
|
||||||
|
add_candidate_path(output_dir / file_stem / "vlm")
|
||||||
add_candidate_root(file_stem)
|
add_candidate_path(output_dir / file_stem / "auto")
|
||||||
sanitized_stem = self._sanitize_output_name(file_stem)
|
|
||||||
add_candidate_root(sanitized_stem)
|
|
||||||
add_candidate_root(actual_root)
|
|
||||||
|
|
||||||
if not candidate_roots:
|
|
||||||
candidate_roots.append(file_stem)
|
|
||||||
|
|
||||||
for root in candidate_roots:
|
|
||||||
base = output_dir / root
|
|
||||||
if backend.startswith("vlm-"):
|
|
||||||
add_candidate_path(base / "vlm")
|
|
||||||
if method:
|
|
||||||
add_candidate_path(base / method)
|
|
||||||
add_candidate_path(base / "auto")
|
|
||||||
else:
|
|
||||||
if method:
|
|
||||||
add_candidate_path(base / method)
|
|
||||||
add_candidate_path(base / "vlm")
|
|
||||||
add_candidate_path(base / "auto")
|
|
||||||
add_candidate_path(base)
|
|
||||||
|
|
||||||
candidate_file_stems: list[str] = []
|
|
||||||
|
|
||||||
def add_file_stem(name: Optional[str]):
|
|
||||||
if not name:
|
|
||||||
return
|
|
||||||
stem_name = Path(str(name)).stem
|
|
||||||
if stem_name not in candidate_file_stems:
|
|
||||||
candidate_file_stems.append(stem_name)
|
|
||||||
|
|
||||||
add_file_stem(file_stem)
|
|
||||||
add_file_stem(sanitized_stem)
|
|
||||||
add_file_stem(actual_root)
|
|
||||||
|
|
||||||
json_file = None
|
json_file = None
|
||||||
subdir = None
|
subdir = None
|
||||||
|
attempted = []
|
||||||
|
|
||||||
|
# mirror MinerU's sanitize_filename to align ZIP naming
|
||||||
|
def _sanitize_filename(name: str) -> str:
|
||||||
|
sanitized = re.sub(r"[/\\\.]{2,}|[/\\]", "", name)
|
||||||
|
sanitized = re.sub(r"[^\w.-]", "_", sanitized, flags=re.UNICODE)
|
||||||
|
if sanitized.startswith("."):
|
||||||
|
sanitized = "_" + sanitized[1:]
|
||||||
|
return sanitized or "unnamed"
|
||||||
|
|
||||||
|
safe_stem = _sanitize_filename(file_stem)
|
||||||
|
allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"}
|
||||||
|
self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
|
||||||
|
self.logger.info(f"[MinerU] Searching output candidates: {', '.join(str(c) for c in candidates)}")
|
||||||
|
|
||||||
for sub in candidates:
|
for sub in candidates:
|
||||||
for stem in candidate_file_stems:
|
jf = sub / f"{file_stem}_content_list.json"
|
||||||
jf = sub / f"{stem}_content_list.json"
|
self.logger.info(f"[MinerU] Trying original path: {jf}")
|
||||||
attempted.append(jf)
|
attempted.append(jf)
|
||||||
if jf.exists():
|
if jf.exists():
|
||||||
subdir = sub
|
subdir = sub
|
||||||
json_file = jf
|
json_file = jf
|
||||||
break
|
break
|
||||||
if json_file:
|
|
||||||
|
# MinerU API sanitizes non-ASCII filenames inside the ZIP root and file names.
|
||||||
|
alt = sub / f"{safe_stem}_content_list.json"
|
||||||
|
self.logger.info(f"[MinerU] Trying sanitized filename: {alt}")
|
||||||
|
attempted.append(alt)
|
||||||
|
if alt.exists():
|
||||||
|
subdir = sub
|
||||||
|
json_file = alt
|
||||||
|
break
|
||||||
|
|
||||||
|
nested_alt = sub / safe_stem / f"{safe_stem}_content_list.json"
|
||||||
|
self.logger.info(f"[MinerU] Trying sanitized nested path: {nested_alt}")
|
||||||
|
attempted.append(nested_alt)
|
||||||
|
if nested_alt.exists():
|
||||||
|
subdir = nested_alt.parent
|
||||||
|
json_file = nested_alt
|
||||||
break
|
break
|
||||||
|
|
||||||
if not json_file:
|
if not json_file:
|
||||||
fallback_matches = sorted(output_dir.glob("**/*_content_list.json"))
|
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
|
||||||
if fallback_matches:
|
|
||||||
json_file = fallback_matches[0]
|
|
||||||
subdir = json_file.parent
|
|
||||||
self.logger.info(f"[MinerU] Fallback located content list at {json_file}")
|
|
||||||
else:
|
|
||||||
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
|
|
||||||
|
|
||||||
with open(json_file, "r", encoding="utf-8") as f:
|
with open(json_file, "r", encoding="utf-8") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
|
|
@ -668,8 +623,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
self.__images__(pdf, zoomin=1)
|
self.__images__(pdf, zoomin=1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
output_root = self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
|
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
|
||||||
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend, actual_root=output_root)
|
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
|
||||||
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||||
if callback:
|
if callback:
|
||||||
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue