Feat: Introduce MinerU parsing options with backend, language, and method configurations

This commit is contained in:
concertdictate 2025-12-12 23:51:17 +02:00
parent 03d22b9e71
commit 4353de319d
2 changed files with 126 additions and 41 deletions

View file

@ -24,6 +24,7 @@ import tempfile
import threading import threading
import time import time
import zipfile import zipfile
from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from os import PathLike from os import PathLike
from pathlib import Path from pathlib import Path
@ -77,6 +78,63 @@ LANGUAGE_TO_MINERU_MAP = {
'Hindi': 'devanagari', 'Hindi': 'devanagari',
} }
class MinerUBackend(StrEnum):
"""MinerU processing backend options."""
PIPELINE = "pipeline" # Traditional multimodel pipeline (default)
VLM_TRANSFORMERS = "vlm-transformers" # Vision-language model using HuggingFace Transformers
VLM_MLX_ENGINE = "vlm-mlx-engine" # Faster, requires Apple Silicon and macOS 13.5+
VLM_VLLM_ENGINE = "vlm-vllm-engine" # Local vLLM engine, requires local GPU
VLM_VLLM_ASYNC_ENGINE = "vlm-vllm-async-engine" # Asynchronous vLLM engine, new in MinerU API
VLM_LMDEPLOY_ENGINE = "vlm-lmdeploy-engine" # LMDeploy engine
VLM_HTTP_CLIENT = "vlm-http-client" # HTTP client for remote vLLM server (CPU only)
class MinerULanguage(StrEnum):
"""MinerU supported languages for OCR (pipeline backend only)."""
CH = "ch" # Chinese
CH_SERVER = "ch_server" # Chinese (server)
CH_LITE = "ch_lite" # Chinese (lite)
EN = "en" # English
KOREAN = "korean" # Korean
JAPAN = "japan" # Japanese
CHINESE_CHT = "chinese_cht" # Chinese Traditional
TA = "ta" # Tamil
TE = "te" # Telugu
KA = "ka" # Kannada
TH = "th" # Thai
EL = "el" # Greek
LATIN = "latin" # Latin
ARABIC = "arabic" # Arabic
EAST_SLAVIC = "east_slavic" # East Slavic
CYRILLIC = "cyrillic" # Cyrillic
DEVANAGARI = "devanagari" # Devanagari
class MinerUParseMethod(StrEnum):
"""MinerU PDF parsing methods (pipeline backend only)."""
AUTO = "auto" # Automatically determine the method based on the file type
TXT = "txt" # Use text extraction method
OCR = "ocr" # Use OCR method for image-based PDFs
@dataclass
class MinerUParseOptions:
"""Options for MinerU PDF parsing."""
backend: MinerUBackend = MinerUBackend.PIPELINE
lang: Optional[MinerULanguage] = None # language for OCR (pipeline backend only)
method: MinerUParseMethod = MinerUParseMethod.AUTO
server_url: Optional[str] = None
delete_output: bool = True
parse_method: str = "raw"
formula_enable: bool = True
table_enable: bool = True
class MinerUParser(RAGFlowPdfParser): class MinerUParser(RAGFlowPdfParser):
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""): def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
self.mineru_path = Path(mineru_path) self.mineru_path = Path(mineru_path)
@ -120,7 +178,8 @@ class MinerUParser(RAGFlowPdfParser):
with open(full_path, "wb") as f: with open(full_path, "wb") as f:
f.write(zip_ref.read(filename)) f.write(zip_ref.read(filename))
def _is_http_endpoint_valid(self, url, timeout=5): @staticmethod
def _is_http_endpoint_valid(url, timeout=5):
try: try:
response = requests.head(url, timeout=timeout, allow_redirects=True) response = requests.head(url, timeout=timeout, allow_redirects=True)
return response.status_code in [200, 301, 302, 307, 308] return response.status_code in [200, 301, 302, 307, 308]
@ -165,7 +224,8 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}") self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}")
try: try:
response = requests.get(server_url, timeout=5) response = requests.get(server_url, timeout=5)
self.logger.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}") self.logger.info(
f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
self.using_api = False self.using_api = False
return True, reason return True, reason
except Exception as e: except Exception as e:
@ -209,14 +269,15 @@ class MinerUParser(RAGFlowPdfParser):
return False, reason return False, reason
def _run_mineru( def _run_mineru(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
): ):
if self.using_api: if self.using_api:
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback) self._run_mineru_api(input_path, output_dir, options, callback)
else: else:
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback) self._run_mineru_executable(input_path, output_dir, options, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None): def _run_mineru_api(self, input_path: Path, output_dir: Path, options: MinerUParseOptions,
callback: Optional[Callable] = None):
output_zip_path = os.path.join(str(output_dir), "output.zip") output_zip_path = os.path.join(str(output_dir), "output.zip")
pdf_file_path = str(input_path) pdf_file_path = str(input_path)
@ -225,16 +286,16 @@ class MinerUParser(RAGFlowPdfParser):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}") raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip() pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, method) output_path = os.path.join(str(output_dir), pdf_file_name, options.method)
os.makedirs(output_path, exist_ok=True) os.makedirs(output_path, exist_ok=True)
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")} files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
data = { data = {
"output_dir": "./output", "output_dir": "./output",
"lang_list": lang, "lang_list": options.lang,
"backend": backend, "backend": options.backend,
"parse_method": method, "parse_method": options.method,
"formula_enable": True, "formula_enable": True,
"table_enable": True, "table_enable": True,
"server_url": None, "server_url": None,
@ -253,7 +314,8 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse") self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
if callback: if callback:
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse") callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800) response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers,
timeout=1800)
response.raise_for_status() response.raise_for_status()
if response.headers.get("Content-Type") == "application/zip": if response.headers.get("Content-Type") == "application/zip":
@ -277,15 +339,15 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info("[MinerU] Api completed successfully.") self.logger.info("[MinerU] Api completed successfully.")
def _run_mineru_executable( def _run_mineru_executable(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
): ):
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method] cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", options.method]
if backend: if options.backend:
cmd.extend(["-b", backend]) cmd.extend(["-b", options.backend])
if lang: if options.lang:
cmd.extend(["-l", lang]) cmd.extend(["-l", options.lang])
if server_url and backend == "vlm-http-client": if options.server_url and options.backend == "vlm-http-client":
cmd.extend(["-u", server_url]) cmd.extend(["-u", options.server_url])
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}") self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
@ -337,7 +399,8 @@ class MinerUParser(RAGFlowPdfParser):
try: try:
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf: with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
self.pdf = pdf self.pdf = pdf
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])] self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in
enumerate(self.pdf.pages[page_from:page_to])]
except Exception as e: except Exception as e:
self.page_images = None self.page_images = None
self.total_page = 0 self.total_page = 0
@ -399,7 +462,8 @@ class MinerUParser(RAGFlowPdfParser):
pos = poss[-1] pos = poss[-1]
last_page_idx = pos[0][-1] last_page_idx = pos[0][-1]
if not (0 <= last_page_idx < page_count): if not (0 <= last_page_idx < page_count):
self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.") self.logger.warning(
f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
if need_position: if need_position:
return None, None return None, None
return return
@ -425,10 +489,12 @@ class MinerUParser(RAGFlowPdfParser):
if 0 <= pn - 1 < page_count: if 0 <= pn - 1 < page_count:
bottom += self.page_images[pn - 1].size[1] bottom += self.page_images[pn - 1].size[1]
else: else:
self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.") self.logger.warning(
f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
if not (0 <= pns[0] < page_count): if not (0 <= pns[0] < page_count):
self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.") self.logger.warning(
f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
continue continue
img0 = self.page_images[pns[0]] img0 = self.page_images[pns[0]]
@ -441,7 +507,8 @@ class MinerUParser(RAGFlowPdfParser):
bottom -= img0.size[1] bottom -= img0.size[1]
for pn in pns[1:]: for pn in pns[1:]:
if not (0 <= pn < page_count): if not (0 <= pn < page_count):
self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.") self.logger.warning(
f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
continue continue
page = self.page_images[pn] page = self.page_images[pn]
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1])) x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
@ -485,7 +552,8 @@ class MinerUParser(RAGFlowPdfParser):
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss return poss
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[
dict[str, Any]]:
candidates = [] candidates = []
seen = set() seen = set()
@ -567,11 +635,13 @@ class MinerUParser(RAGFlowPdfParser):
case MinerUContentType.TEXT: case MinerUContentType.TEXT:
section = output["text"] section = output["text"]
case MinerUContentType.TABLE: case MinerUContentType.TABLE:
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", [])) section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(
output.get("table_footnote", []))
if not section.strip(): if not section.strip():
section = "FAILED TO PARSE TABLE" section = "FAILED TO PARSE TABLE"
case MinerUContentType.IMAGE: case MinerUContentType.IMAGE:
section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", [])) section = "".join(output.get("image_caption", [])) + "\n" + "".join(
output.get("image_footnote", []))
case MinerUContentType.EQUATION: case MinerUContentType.EQUATION:
section = output["text"] section = output["text"]
case MinerUContentType.CODE: case MinerUContentType.CODE:
@ -593,17 +663,17 @@ class MinerUParser(RAGFlowPdfParser):
return [] return []
def parse_pdf( def parse_pdf(
self, self,
filepath: str | PathLike[str], filepath: str | PathLike[str],
binary: BytesIO | bytes, binary: BytesIO | bytes,
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
*, *,
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
backend: str = "pipeline", backend: str = "pipeline",
server_url: Optional[str] = None, server_url: Optional[str] = None,
delete_output: bool = True, delete_output: bool = True,
parse_method: str = "raw", parse_method: str = "raw",
**kwargs, **kwargs,
) -> tuple: ) -> tuple:
import shutil import shutil
@ -613,7 +683,9 @@ class MinerUParser(RAGFlowPdfParser):
# Assuming the dict is defined as shown # Assuming the dict is defined as shown
lang = kwargs.get('lang', 'English') lang = kwargs.get('lang', 'English')
mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Returns 'ch' if lang not found mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Returns 'ch' if lang not found
# Use .get() to avoid KeyError if the key doesn't exist mineru_method_raw_str = kwargs.get('parser_config', {}).get('mineru_parse_method', 'auto')
enable_formula = kwargs.get('parser_config', {}).get('mineru_formula_enable', True)
enable_table = kwargs.get('parser_config', {}).get('mineru_enable', True)
# remove spaces, or mineru crash, and _read_output fail too # remove spaces, or mineru crash, and _read_output fail too
file_path = Path(filepath) file_path = Path(filepath)
@ -653,8 +725,18 @@ class MinerUParser(RAGFlowPdfParser):
self.__images__(pdf, zoomin=1) self.__images__(pdf, zoomin=1)
try: try:
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=mineru_lang_code, server_url=server_url, callback=callback) options = MinerUParseOptions(
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend) backend=MinerUBackend(backend),
lang=MinerULanguage(mineru_lang_code),
method=MinerUParseMethod(mineru_method_raw_str),
server_url=server_url,
delete_output=delete_output,
parse_method=parse_method,
formula_enable=enable_formula,
table_enable=enable_table,
)
self._run_mineru(pdf, out_dir, options, callback=callback)
outputs = self._read_output(out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback: if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

View file

@ -96,6 +96,9 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
callback(-1, "MinerU not found.") callback(-1, "MinerU not found.")
return None, None, None return None, None, None
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
pdf_parser = DoclingParser() pdf_parser = DoclingParser()
parse_method = kwargs.get("parse_method", "raw") parse_method = kwargs.get("parse_method", "raw")