- Add ProgressTracker class for real-time progress parsing from mineru stdout
- Implement intelligent stall-based timeout (5 minutes without progress triggers a warning)
- Add vlm-direct backend mode to connect directly to the vLLM server
- Improve API mode with streaming response, HTTP session reuse, and auto-retry
- Add heartbeat updates during long-running operations
- Support dynamic timeout based on page count

This addresses the deadlock issue where tasks hang indefinitely when using the MinerU API backend, even after the API returns 200 OK.
1019 lines
43 KiB
Python
1019 lines
43 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import json
|
|
import logging
|
|
import os
|
|
import platform
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import threading
|
|
import time
|
|
import zipfile
|
|
from io import BytesIO
|
|
from os import PathLike
|
|
from pathlib import Path
|
|
from queue import Empty, Queue
|
|
from typing import Any, Callable, Optional
|
|
|
|
import numpy as np
|
|
import pdfplumber
|
|
import requests
|
|
from requests.adapters import HTTPAdapter
|
|
from urllib3.util.retry import Retry
|
|
from PIL import Image
|
|
from strenum import StrEnum
|
|
|
|
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
|
|
|
# Lock object registered in sys.modules under a fixed key; it is created only
# when no object is registered under that key yet.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
|
|
|
|
|
|
class ProgressTracker:
    """Parse MinerU console output for progress and detect stalls.

    Features:
    1. Parse tqdm progress bar output (e.g., "100%|████| 659/659")
    2. Stall detection: the stall timer restarts whenever the parsed progress changes
    3. Report progress to the RagFlow callback (reported value never decreases)

    Design principles:
    - Standalone helper class, does not modify MinerUParser core logic
    - Parsing failure does not affect functionality, just no progress display
    """

    # Match tqdm progress bar format: "100%|████████████| 659/659" or "Page 45/659"
    TQDM_PATTERN = re.compile(r'(\d+)%\|[^\|]*\|\s*(\d+)/(\d+)')
    PAGE_PATTERN = re.compile(r'[Pp]age\s*(\d+)\s*/\s*(\d+)')
    STEP_PATTERN = re.compile(r'(\d+)/(\d+)\s*\[')  # "45/659 [00:30<01:00"

    def __init__(
        self,
        callback: Optional[Callable] = None,
        stall_timeout: int = 300,  # 5 minutes without progress counts as a stall
        progress_base: float = 0.25,  # Base progress value
        progress_range: float = 0.45,  # Progress range (0.25 ~ 0.70)
    ):
        """Create a tracker.

        Args:
            callback: Optional ``callback(progress, message)`` invoked on every new maximum.
            stall_timeout: Seconds without a progress change before ``is_stalled`` is True.
            progress_base: Value reported to the callback when parsed progress is 0.
            progress_range: Span added on top of ``progress_base`` when parsed progress is 1.
        """
        self.callback = callback
        self.stall_timeout = stall_timeout
        self.progress_base = progress_base
        self.progress_range = progress_range

        self.last_progress_time = time.time()
        self.last_progress_value = 0.0  # highest progress seen so far (0.0 ~ 1.0)
        self.total_pages = 0
        self.current_page = 0
        # Most recently parsed value; used only for stall detection so that a new
        # progress bar restarting at 0% still counts as activity.
        self._last_seen_value: Optional[float] = None
        self.logger = logging.getLogger(self.__class__.__name__)

    def parse_line(self, line: str) -> Optional[float]:
        """Parse a line of output and extract progress information.

        Also updates ``current_page`` / ``total_pages`` when a pattern matches.

        Returns:
            Progress value clamped to 0.0 ~ 1.0, or None if the line cannot be parsed.
        """
        # tqdm redraws its bar with carriage returns, so one chunk may hold several
        # refreshes; only the last non-empty one reflects the current state.
        segments = [segment.strip() for segment in line.split("\r")]
        line = next((segment for segment in reversed(segments) if segment), "")
        if not line:
            return None

        # Try to match tqdm format: "100%|████| 659/659"
        if match := self.TQDM_PATTERN.search(line):
            percent = int(match.group(1))
            self.current_page = int(match.group(2))
            self.total_pages = int(match.group(3))
            return min(percent / 100.0, 1.0)

        # Try "Page 45/659", then "45/659 [" — both carry (current, total).
        for pattern in (self.PAGE_PATTERN, self.STEP_PATTERN):
            if match := pattern.search(line):
                current = int(match.group(1))
                total = int(match.group(2))
                self.current_page = current
                self.total_pages = total
                if total <= 0:
                    return None
                # Clamp so a counter overshooting its total cannot exceed 100%.
                return min(current / total, 1.0)

        return None

    def on_output(self, line: str, prefix: str = "STDOUT") -> None:
        """Process a line of output: update stall timer, progress state and callback.

        Args:
            line: One line of process output.
            prefix: Stream label of the line; not used for parsing.
        """
        progress = self.parse_line(line)
        if progress is None:
            return

        if progress != self._last_seen_value:
            # Any change is a sign of life, including a later bar starting again at 0%.
            self._last_seen_value = progress
            self.last_progress_time = time.time()

        if progress <= self.last_progress_value:
            # Never report a lower value than one already reported.
            return

        self.last_progress_value = progress

        # Map parsed progress to progress_base ~ progress_base + progress_range
        actual_progress = self.progress_base + (progress * self.progress_range)

        if self.callback:
            msg = f"[MinerU] Processing page {self.current_page}/{self.total_pages}" if self.total_pages else f"[MinerU] Progress: {progress:.0%}"
            try:
                self.callback(actual_progress, msg)
            except Exception as e:
                # A broken callback must not interrupt output processing.
                self.logger.warning(f"[MinerU] Progress callback failed: {e}")

    def is_stalled(self) -> bool:
        """Check if stalled (no progress change for longer than ``stall_timeout``)."""
        return (time.time() - self.last_progress_time) > self.stall_timeout

    def get_stall_info(self) -> str:
        """Get stall information for error reporting."""
        stall_duration = int(time.time() - self.last_progress_time)
        return (
            f"No progress for {stall_duration}s "
            f"(stalled at {self.last_progress_value:.0%}, "
            f"page {self.current_page}/{self.total_pages or '?'})"
        )

    def reset(self) -> None:
        """Reset all progress state (for retry)."""
        self.last_progress_time = time.time()
        self.last_progress_value = 0.0
        self.current_page = 0
        self.total_pages = 0
        self._last_seen_value = None
|
|
|
|
|
|
class MinerUContentType(StrEnum):
    """String constants compared against the ``type`` field of MinerU output items."""

    IMAGE = "image"
    TABLE = "table"
    TEXT = "text"
    EQUATION = "equation"
    CODE = "code"
    LIST = "list"
    DISCARDED = "discarded"
|
|
|
|
|
|
class MinerUParser(RAGFlowPdfParser):
    """MinerU PDF Parser with multiple backend support.

    Backends:
    - pipeline: Local MinerU with traditional pipeline (requires GPU)
    - vlm-http-client: MinerU CLI calling external vLLM server
    - vlm-direct: Direct vLLM API call from RagFlow (bypasses mineru-api)
    - api: External mineru-api service (legacy, prone to timeout issues)
    """

    # Timeout settings for different operations
    API_CONNECT_TIMEOUT = 30  # seconds to establish connection
    API_READ_TIMEOUT_PER_PAGE = 60  # seconds per page for processing
    API_MAX_TIMEOUT = 1800  # maximum total timeout
    HEARTBEAT_INTERVAL = 30  # seconds between progress updates
|
|
|
|
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
|
|
self.mineru_path = Path(mineru_path)
|
|
self.mineru_api = mineru_api.rstrip("/")
|
|
self.mineru_server_url = mineru_server_url.rstrip("/")
|
|
self.using_api = False
|
|
self.using_vlm_direct = False # New flag for direct vLLM mode
|
|
self.outlines = []
|
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
self._session = None # Reusable HTTP session
|
|
|
|
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
|
self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
|
|
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
|
if not root_dir:
|
|
files = zip_ref.namelist()
|
|
if files and files[0].endswith("/"):
|
|
root_dir = files[0]
|
|
else:
|
|
root_dir = None
|
|
|
|
if not root_dir or not root_dir.endswith("/"):
|
|
self.logger.info(f"[MinerU] No root directory found, extracting all (root_hint={root_dir})")
|
|
zip_ref.extractall(extract_to)
|
|
return
|
|
|
|
root_len = len(root_dir)
|
|
for member in zip_ref.infolist():
|
|
filename = member.filename
|
|
if filename == root_dir:
|
|
self.logger.info("[MinerU] Ignore root folder...")
|
|
continue
|
|
|
|
path = filename
|
|
if path.startswith(root_dir):
|
|
path = path[root_len:]
|
|
|
|
full_path = os.path.join(extract_to, path)
|
|
if member.is_dir():
|
|
os.makedirs(full_path, exist_ok=True)
|
|
else:
|
|
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
|
with open(full_path, "wb") as f:
|
|
f.write(zip_ref.read(filename))
|
|
|
|
def _get_http_session(self) -> requests.Session:
    """Return the cached HTTP session, building it with retry support on first use."""
    if self._session is not None:
        return self._session

    # The session is stored before the adapters are configured.
    self._session = http_session = requests.Session()
    retry_adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "POST"],
        )
    )
    # One adapter instance serves both URL schemes.
    for scheme in ("http://", "https://"):
        http_session.mount(scheme, retry_adapter)
    return self._session
|
|
|
|
def _is_http_endpoint_valid(self, url, timeout=5):
|
|
try:
|
|
session = self._get_http_session()
|
|
response = session.head(url, timeout=timeout, allow_redirects=True)
|
|
return response.status_code in [200, 301, 302, 307, 308]
|
|
except Exception:
|
|
return False
|
|
|
|
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
|
|
reason = ""
|
|
|
|
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-direct"]
|
|
if backend not in valid_backends:
|
|
reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
|
self.logger.warning(reason)
|
|
return False, reason
|
|
|
|
subprocess_kwargs = {
|
|
"capture_output": True,
|
|
"text": True,
|
|
"check": True,
|
|
"encoding": "utf-8",
|
|
"errors": "ignore",
|
|
}
|
|
|
|
if platform.system() == "Windows":
|
|
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
|
|
|
|
if server_url is None:
|
|
server_url = self.mineru_server_url
|
|
|
|
# NEW: vlm-direct mode - direct connection to vLLM server, bypassing mineru-api
|
|
if backend == "vlm-direct" and server_url:
|
|
try:
|
|
# Check if vLLM server is accessible (OpenAI-compatible API)
|
|
health_endpoints = ["/health", "/v1/models", "/openapi.json"]
|
|
server_accessible = False
|
|
for endpoint in health_endpoints:
|
|
if self._is_http_endpoint_valid(server_url.rstrip("/") + endpoint):
|
|
server_accessible = True
|
|
break
|
|
|
|
if server_accessible:
|
|
self.logger.info(f"[MinerU] vlm-direct server accessible: {server_url}")
|
|
self.using_api = False
|
|
self.using_vlm_direct = True
|
|
return True, reason
|
|
else:
|
|
reason = f"[MinerU] vlm-direct server not accessible: {server_url}"
|
|
self.logger.warning(reason)
|
|
return False, reason
|
|
except Exception as e:
|
|
reason = f"[MinerU] vlm-direct server check failed: {server_url}: {e}"
|
|
self.logger.warning(reason)
|
|
return False, reason
|
|
|
|
if backend == "vlm-http-client" and server_url:
|
|
try:
|
|
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
|
|
self.logger.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
|
|
if server_accessible:
|
|
self.using_api = False # We are using http client, not API
|
|
return True, reason
|
|
else:
|
|
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
|
|
self.logger.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
|
|
return False, reason
|
|
except Exception as e:
|
|
self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}")
|
|
try:
|
|
response = requests.get(server_url, timeout=5)
|
|
self.logger.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
|
|
self.using_api = False
|
|
return True, reason
|
|
except Exception as e:
|
|
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
|
|
self.logger.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
|
|
return False, reason
|
|
|
|
try:
|
|
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
|
|
version_info = result.stdout.strip()
|
|
if version_info:
|
|
self.logger.info(f"[MinerU] Detected version: {version_info}")
|
|
else:
|
|
self.logger.info("[MinerU] Detected MinerU, but version info is empty.")
|
|
return True, reason
|
|
except subprocess.CalledProcessError as e:
|
|
self.logger.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
|
|
except FileNotFoundError:
|
|
self.logger.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
|
|
except Exception as e:
|
|
self.logger.error(f"[MinerU] Unexpected error during installation check: {e}")
|
|
|
|
# If executable check fails, try API check
|
|
try:
|
|
if self.mineru_api:
|
|
# check openapi.json
|
|
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
|
|
if not openapi_exists:
|
|
reason = "[MinerU] Failed to detect vaild MinerU API server"
|
|
return openapi_exists, reason
|
|
self.logger.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
|
|
self.using_api = openapi_exists
|
|
return openapi_exists, reason
|
|
else:
|
|
self.logger.info("[MinerU] api not exists.")
|
|
except Exception as e:
|
|
reason = f"[MinerU] Unexpected error during api check: {e}"
|
|
self.logger.error(f"[MinerU] Unexpected error during api check: {e}")
|
|
return False, reason
|
|
|
|
def _run_mineru(
|
|
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
|
):
|
|
if self.using_vlm_direct:
|
|
self._run_mineru_vlm_direct(input_path, output_dir, server_url, callback)
|
|
elif self.using_api:
|
|
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
|
|
else:
|
|
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
|
|
|
|
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
|
output_zip_path = os.path.join(str(output_dir), "output.zip")
|
|
|
|
pdf_file_path = str(input_path)
|
|
|
|
if not os.path.exists(pdf_file_path):
|
|
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
|
|
|
|
pdf_file_name = Path(pdf_file_path).stem.strip()
|
|
output_path = os.path.join(str(output_dir), pdf_file_name, method)
|
|
os.makedirs(output_path, exist_ok=True)
|
|
|
|
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
|
|
|
data = {
|
|
"output_dir": "./output",
|
|
"lang_list": lang,
|
|
"backend": backend,
|
|
"parse_method": method,
|
|
"formula_enable": True,
|
|
"table_enable": True,
|
|
"server_url": None,
|
|
"return_md": True,
|
|
"return_middle_json": True,
|
|
"return_model_output": True,
|
|
"return_content_list": True,
|
|
"return_images": True,
|
|
"response_format_zip": True,
|
|
"start_page_id": 0,
|
|
"end_page_id": 99999,
|
|
}
|
|
|
|
headers = {"Accept": "application/json"}
|
|
|
|
# Calculate dynamic timeout based on expected page count
|
|
# Estimate ~60s per page, with minimum 5 minutes
|
|
estimated_pages = data.get("end_page_id", 100) - data.get("start_page_id", 0)
|
|
dynamic_timeout = max(300, min(estimated_pages * self.API_READ_TIMEOUT_PER_PAGE, self.API_MAX_TIMEOUT))
|
|
|
|
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse (timeout={dynamic_timeout}s)")
|
|
if callback:
|
|
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
|
|
|
# Use session with retry logic and streaming response
|
|
session = self._get_http_session()
|
|
last_heartbeat = time.time()
|
|
|
|
try:
|
|
# Stream response to allow progress monitoring
|
|
with session.post(
|
|
url=f"{self.mineru_api}/file_parse",
|
|
files=files,
|
|
data=data,
|
|
headers=headers,
|
|
timeout=(self.API_CONNECT_TIMEOUT, dynamic_timeout),
|
|
stream=True
|
|
) as response:
|
|
response.raise_for_status()
|
|
|
|
if response.headers.get("Content-Type") == "application/zip":
|
|
self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
|
|
if callback:
|
|
callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
|
|
|
|
# Stream write with progress updates
|
|
total_size = int(response.headers.get('content-length', 0))
|
|
downloaded = 0
|
|
with open(output_zip_path, "wb") as f:
|
|
for chunk in response.iter_content(chunk_size=1024 * 1024): # 1MB chunks
|
|
if chunk:
|
|
f.write(chunk)
|
|
downloaded += len(chunk)
|
|
|
|
# Send heartbeat every 30 seconds during download
|
|
now = time.time()
|
|
if callback and now - last_heartbeat >= self.HEARTBEAT_INTERVAL:
|
|
progress = 0.30 + (0.10 * downloaded / total_size) if total_size else 0.35
|
|
callback(progress, f"[MinerU] Downloading... {downloaded // (1024*1024)}MB")
|
|
last_heartbeat = now
|
|
|
|
self.logger.info(f"[MinerU] Unzip to {output_path}...")
|
|
self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
|
|
|
|
if callback:
|
|
callback(0.40, f"[MinerU] Unzip to {output_path}...")
|
|
else:
|
|
self.logger.warning("[MinerU] not zip returned from api: %s" % response.headers.get("Content-Type"))
|
|
|
|
except requests.exceptions.Timeout as e:
|
|
raise RuntimeError(f"[MinerU] API timeout after {dynamic_timeout}s. Consider using vlm-direct mode. Error: {e}")
|
|
except requests.exceptions.ConnectionError as e:
|
|
raise RuntimeError(f"[MinerU] API connection error. Is mineru-api running? Error: {e}")
|
|
except Exception as e:
|
|
raise RuntimeError(f"[MinerU] API failed with exception: {e}")
|
|
|
|
self.logger.info("[MinerU] API completed successfully.")
|
|
|
|
def _run_mineru_vlm_direct(
|
|
self, input_path: Path, output_dir: Path, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
|
):
|
|
"""
|
|
Direct vLLM integration mode - bypasses mineru-api and calls vLLM server directly.
|
|
|
|
This method uses mineru's internal vlm-http-client backend to connect directly to
|
|
a vLLM server, eliminating the mineru-api middleware that can cause deadlocks.
|
|
|
|
Requires: mineru[core] package installed
|
|
"""
|
|
if not server_url:
|
|
server_url = self.mineru_server_url
|
|
|
|
if not server_url:
|
|
raise RuntimeError("[MinerU] vlm-direct mode requires MINERU_SERVER_URL to be set")
|
|
|
|
self.logger.info(f"[MinerU] Starting vlm-direct mode with server: {server_url}")
|
|
if callback:
|
|
callback(0.15, f"[MinerU] Connecting to vLLM server: {server_url}")
|
|
|
|
try:
|
|
# Try to import mineru's vlm_analyze module
|
|
from mineru.backend.vlm.vlm_analyze import doc_analyze
|
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
except ImportError as e:
|
|
self.logger.warning(f"[MinerU] mineru package not available, falling back to executable mode: {e}")
|
|
# Fallback to executable mode with vlm-http-client backend
|
|
self._run_mineru_executable(
|
|
input_path, output_dir,
|
|
method="auto",
|
|
backend="vlm-http-client",
|
|
server_url=server_url,
|
|
callback=callback
|
|
)
|
|
return
|
|
|
|
pdf_file_name = input_path.stem.replace(" ", "")
|
|
local_md_dir = output_dir / pdf_file_name / "vlm"
|
|
local_image_dir = local_md_dir / "images"
|
|
local_md_dir.mkdir(parents=True, exist_ok=True)
|
|
local_image_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
self.logger.info(f"[MinerU] Output directory: {local_md_dir}")
|
|
if callback:
|
|
callback(0.20, f"[MinerU] Output directory: {local_md_dir}")
|
|
|
|
# Read PDF bytes
|
|
with open(input_path, "rb") as f:
|
|
pdf_bytes = f.read()
|
|
|
|
# Create image writer for mineru
|
|
image_writer = FileBasedDataWriter(str(local_image_dir))
|
|
|
|
try:
|
|
if callback:
|
|
callback(0.25, "[MinerU] Starting VLM processing (direct mode)...")
|
|
|
|
# Call mineru's doc_analyze with http-client backend
|
|
middle_json, results = doc_analyze(
|
|
pdf_bytes=pdf_bytes,
|
|
image_writer=image_writer,
|
|
backend="http-client",
|
|
server_url=server_url,
|
|
max_concurrency=10,
|
|
http_timeout=self.API_READ_TIMEOUT_PER_PAGE,
|
|
max_retries=3,
|
|
retry_backoff_factor=0.5,
|
|
)
|
|
|
|
if callback:
|
|
callback(0.60, "[MinerU] VLM processing completed, writing output...")
|
|
|
|
# Write content_list.json (required by _read_output)
|
|
content_list = self._convert_middle_json_to_content_list(middle_json)
|
|
content_list_path = local_md_dir / f"{pdf_file_name}_content_list.json"
|
|
with open(content_list_path, "w", encoding="utf-8") as f:
|
|
json.dump(content_list, f, ensure_ascii=False, indent=2)
|
|
|
|
self.logger.info(f"[MinerU] vlm-direct completed, output: {content_list_path}")
|
|
if callback:
|
|
callback(0.70, f"[MinerU] vlm-direct completed successfully")
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"[MinerU] vlm-direct failed: {e}")
|
|
raise RuntimeError(f"[MinerU] vlm-direct mode failed: {e}")
|
|
|
|
def _convert_middle_json_to_content_list(self, middle_json: dict) -> list:
|
|
"""Convert mineru's middle_json format to content_list format."""
|
|
content_list = []
|
|
pdf_info = middle_json.get("pdf_info", [])
|
|
|
|
for page_idx, page_info in enumerate(pdf_info):
|
|
preproc_blocks = page_info.get("preproc_blocks", [])
|
|
for block in preproc_blocks:
|
|
block_type = block.get("type", "text")
|
|
|
|
item = {
|
|
"type": block_type,
|
|
"page_idx": page_idx,
|
|
"bbox": block.get("bbox", [0, 0, 0, 0]),
|
|
}
|
|
|
|
if block_type == "text":
|
|
lines = block.get("lines", [])
|
|
text = " ".join(
|
|
span.get("content", "")
|
|
for line in lines
|
|
for span in line.get("spans", [])
|
|
)
|
|
item["text"] = text
|
|
elif block_type == "table":
|
|
item["table_body"] = block.get("html", "")
|
|
item["table_caption"] = block.get("caption", [])
|
|
item["table_footnote"] = block.get("footnote", [])
|
|
elif block_type == "image":
|
|
item["image_caption"] = block.get("caption", [])
|
|
item["image_footnote"] = block.get("footnote", [])
|
|
item["img_path"] = block.get("img_path", "")
|
|
elif block_type == "equation":
|
|
item["text"] = block.get("latex", "")
|
|
|
|
content_list.append(item)
|
|
|
|
return content_list
|
|
|
|
def _run_mineru_executable(
    self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
):
    """Run the MinerU command-line tool and relay its output and progress.

    stdout lines are logged at INFO and stderr lines at WARNING. Lines from both
    streams are passed to a ``ProgressTracker``, because tqdm writes its progress
    bars to stderr by default. A stall (no progress change for 5 minutes) only
    produces a warning; the process is never killed.

    Args:
        input_path: PDF file passed with ``-p``.
        output_dir: Output directory passed with ``-o``.
        method: Parse method passed with ``-m``.
        backend: Backend passed with ``-b`` when non-empty.
        lang: Language passed with ``-l`` when given.
        server_url: Passed with ``-u`` for the ``vlm-http-client`` / ``vlm-direct`` backends.
        callback: Optional ``callback(progress, message)`` for progress updates.

    Raises:
        RuntimeError: If the process exits with a non-zero code.
    """
    cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
    if backend:
        cmd.extend(["-b", backend])
    if lang:
        cmd.extend(["-l", lang])
    if server_url and backend in ("vlm-http-client", "vlm-direct"):
        cmd.extend(["-u", server_url])

    self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
    if callback:
        callback(0.20, f"[MinerU] Starting {backend} backend...")

    subprocess_kwargs = {
        "stdout": subprocess.PIPE,
        "stderr": subprocess.PIPE,
        "text": True,
        "encoding": "utf-8",
        "errors": "ignore",
        "bufsize": 1,
    }

    if platform.system() == "Windows":
        subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)

    process = subprocess.Popen(cmd, **subprocess_kwargs)
    stdout_queue, stderr_queue = Queue(), Queue()

    # Create progress tracker (stall detection + progress parsing)
    tracker = ProgressTracker(
        callback=callback,
        stall_timeout=300,  # 5 minutes without progress triggers warning
        progress_base=0.25,
        progress_range=0.45,
    )

    def enqueue_output(pipe, queue, prefix):
        # Runs in a reader thread: forward every non-blank line until EOF.
        for line in iter(pipe.readline, ""):
            if line.strip():
                queue.put((prefix, line.strip()))
        pipe.close()

    readers = [
        threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True),
        threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True),
    ]
    for reader in readers:
        reader.start()

    def drain_queues():
        # Log everything queued so far and let the tracker look at each line.
        for q in (stdout_queue, stderr_queue):
            try:
                while True:
                    prefix, line = q.get_nowait()
                    if prefix == "STDOUT":
                        self.logger.info(f"[MinerU] {line}")
                    else:
                        self.logger.warning(f"[MinerU] {line}")
                    # Progress bars may arrive on either stream.
                    tracker.on_output(line, prefix)
            except Empty:
                pass

    while process.poll() is None:
        drain_queues()

        # Check if stalled (progress-based)
        if tracker.is_stalled():
            stall_info = tracker.get_stall_info()
            self.logger.warning(f"[MinerU] Process appears stalled: {stall_info}")
            if callback:
                callback(tracker.progress_base + tracker.last_progress_value * tracker.progress_range,
                         f"[MinerU] Warning: {stall_info}")
            # Note: only warning here, not forcibly terminating the process
            # because some PDFs do take a long time to process certain pages
            tracker.last_progress_time = time.time()  # Reset to avoid repeated warnings

        time.sleep(0.1)

    return_code = process.wait()

    # Give the readers a moment to reach EOF, then flush what was written just
    # before the process exited; otherwise the final lines would be lost.
    for reader in readers:
        reader.join(timeout=5)
    drain_queues()

    if return_code != 0:
        raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
    self.logger.info("[MinerU] Command completed successfully.")
|
|
|
|
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
|
self.page_from = page_from
|
|
self.page_to = page_to
|
|
try:
|
|
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
|
self.pdf = pdf
|
|
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
|
|
except Exception as e:
|
|
self.page_images = None
|
|
self.total_page = 0
|
|
self.logger.exception(e)
|
|
|
|
def _line_tag(self, bx):
|
|
pn = [bx["page_idx"] + 1]
|
|
positions = bx.get("bbox", (0, 0, 0, 0))
|
|
x0, top, x1, bott = positions
|
|
|
|
if hasattr(self, "page_images") and self.page_images and len(self.page_images) > bx["page_idx"]:
|
|
page_width, page_height = self.page_images[bx["page_idx"]].size
|
|
x0 = (x0 / 1000.0) * page_width
|
|
x1 = (x1 / 1000.0) * page_width
|
|
top = (top / 1000.0) * page_height
|
|
bott = (bott / 1000.0) * page_height
|
|
|
|
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
|
|
|
|
def crop(self, text, ZM=1, need_position=False):
    """Build one image from the page regions referenced by position tags in ``text``.

    The tags are parsed with ``extract_positions``. Each referenced region is cut
    from ``self.page_images``; an extra strip of up to 120 px above the first
    region and below the last region is added, and those two strips are blended
    with a half-transparent black overlay. All pieces are stacked vertically with
    a ``GAP`` px spacing on a light grey canvas.

    Args:
        text: Text containing ``@@...##`` position tags.
        ZM: Not referenced in this method body.
        need_position: When True, also return the cropped positions.

    Returns:
        The composed image, or ``(image, positions)`` when ``need_position`` is
        True, where each position is ``(page index + self.page_from, x0, x1, y0, y1)``.
        Returns ``None`` (or ``(None, None)``) when there are no tags, no page
        images, or no valid region.
    """
    imgs = []
    poss = self.extract_positions(text)
    if not poss:
        if need_position:
            return None, None
        return

    if not getattr(self, "page_images", None):
        self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
        if need_position:
            return None, None
        return

    page_count = len(self.page_images)

    # Drop page indices that do not exist in page_images; drop positions left without pages.
    filtered_poss = []
    for pns, left, right, top, bottom in poss:
        if not pns:
            self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
            continue
        valid_pns = [p for p in pns if 0 <= p < page_count]
        if not valid_pns:
            self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
            continue
        filtered_poss.append((valid_pns, left, right, top, bottom))

    poss = filtered_poss
    if not poss:
        self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
        if need_position:
            return None, None
        return

    # Every piece is cut with the same width: the widest region, but at least 6 px.
    max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
    GAP = 6
    # Context strip above the first region (up to 120 px, ending GAP px above it).
    pos = poss[0]
    first_page_idx = pos[0][0]
    poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
    # Context strip below the last region, limited to the height of its page.
    pos = poss[-1]
    last_page_idx = pos[0][-1]
    if not (0 <= last_page_idx < page_count):
        self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
        if need_position:
            return None, None
        return
    last_page_height = self.page_images[last_page_idx].size[1]
    poss.append(
        (
            [last_page_idx],
            pos[1],
            pos[2],
            min(last_page_height, pos[4] + GAP),
            min(last_page_height, pos[4] + 120),
        )
    )

    positions = []
    for ii, (pns, left, right, top, bottom) in enumerate(poss):
        right = left + max_width

        # Guarantee a non-empty region.
        if bottom <= top:
            bottom = top + 2

        # For each continuation page, extend bottom by the height of the page before it.
        for pn in pns[1:]:
            if 0 <= pn - 1 < page_count:
                bottom += self.page_images[pn - 1].size[1]
            else:
                self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")

        if not (0 <= pns[0] < page_count):
            self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
            continue

        # Piece on the first page of the region.
        img0 = self.page_images[pns[0]]
        x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
        crop0 = img0.crop((x0, y0, x1, y1))
        imgs.append(crop0)
        # Positions are recorded for the original regions only, not the two context strips.
        if 0 < ii < len(poss) - 1:
            positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

        # Remaining height carries over to the following pages, each cut from its top.
        bottom -= img0.size[1]
        for pn in pns[1:]:
            if not (0 <= pn < page_count):
                self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
                continue
            page = self.page_images[pn]
            x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
            cimgp = page.crop((x0, y0, x1, y1))
            imgs.append(cimgp)
            if 0 < ii < len(poss) - 1:
                positions.append((pn + self.page_from, x0, x1, y0, y1))
            bottom -= page.size[1]

    if not imgs:
        if need_position:
            return None, None
        return

    # Canvas size: summed piece heights (each plus GAP) by the widest piece.
    height = 0
    for img in imgs:
        height += img.size[1] + GAP
    height = int(height)
    width = int(np.max([i.size[0] for i in imgs]))
    pic = Image.new("RGB", (width, height), (245, 245, 245))
    height = 0
    for ii, img in enumerate(imgs):
        # First and last pieces are blended with a half-transparent black overlay.
        if ii == 0 or ii + 1 == len(imgs):
            img = img.convert("RGBA")
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            img = Image.alpha_composite(img, overlay).convert("RGB")
        pic.paste(img, (0, int(height)))
        height += img.size[1] + GAP

    if need_position:
        return pic, positions
    return pic
|
|
|
|
@staticmethod
|
|
def extract_positions(txt: str):
|
|
poss = []
|
|
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
|
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
|
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
|
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
|
return poss
|
|
|
|
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
    """Locate and load the ``*_content_list.json`` file written for one PDF.

    The file is searched under ``output_dir / file_stem`` in up to three
    sub-directories. For backends whose name starts with ``"vlm-"`` the order
    is ``vlm``, *method*, ``auto``; otherwise it is *method*, ``vlm``,
    ``auto``. A falsy *method* is skipped and duplicates are probed once.
    Inside each sub-directory three locations are tried in turn: the original
    stem, the sanitized stem, and the sanitized stem nested in a folder of
    the same name.

    Args:
        output_dir: Root directory holding the MinerU output.
        file_stem: Stem of the processed PDF file name.
        method: Parse method, used as one of the sub-directory names.
        backend: Backend name; only its ``"vlm-"`` prefix is inspected.

    Returns:
        The decoded JSON content. Non-empty ``img_path``, ``table_img_path``
        and ``equation_img_path`` values are rewritten to absolute paths
        resolved against the directory containing the JSON file.

    Raises:
        FileNotFoundError: if none of the probed locations exists.
    """
    stem_root = output_dir / file_stem

    # Sub-directory names in probing priority; "vlm" and "auto" are always
    # tried, the caller's method only when it is set.
    if backend.startswith("vlm-"):
        ordered_names = ["vlm", method, "auto"]
    else:
        ordered_names = [method, "vlm", "auto"]

    candidates = []
    for name in ordered_names:
        if not name:
            continue
        folder = stem_root / name
        if folder not in candidates:
            candidates.append(folder)

    # mirror MinerU's sanitize_filename to align ZIP naming
    def _sanitize_filename(name: str) -> str:
        cleaned = re.sub(r"[/\\\.]{2,}|[/\\]", "", name)
        cleaned = re.sub(r"[^\w.-]", "_", cleaned, flags=re.UNICODE)
        if cleaned.startswith("."):
            cleaned = "_" + cleaned[1:]
        return cleaned or "unnamed"

    safe_stem = _sanitize_filename(file_stem)
    original_name = f"{file_stem}_content_list.json"
    sanitized_name = f"{safe_stem}_content_list.json"
    allowed_names = {original_name, sanitized_name}
    self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
    self.logger.info(f"[MinerU] Searching output candidates: {', '.join(str(c) for c in candidates)}")

    attempted = []
    json_file = None
    subdir = None
    for folder in candidates:
        # (log label, file to probe, directory that relative asset paths hang off)
        # MinerU API sanitizes non-ASCII filenames inside the ZIP root and file names.
        probes = (
            ("original path", folder / original_name, folder),
            ("sanitized filename", folder / sanitized_name, folder),
            ("sanitized nested path", folder / safe_stem / sanitized_name, folder / safe_stem),
        )
        for label, probe, base in probes:
            self.logger.info(f"[MinerU] Trying {label}: {probe}")
            attempted.append(probe)
            if probe.exists():
                json_file, subdir = probe, base
                break
        if json_file is not None:
            break

    if json_file is None:
        raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Turn asset paths relative to the JSON file into absolute ones.
    path_keys = ("img_path", "table_img_path", "equation_img_path")
    for item in data:
        for key in path_keys:
            if key in item and item[key]:
                item[key] = str((subdir / item[key]).resolve())
    return data
|
|
|
|
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
|
|
sections = []
|
|
for output in outputs:
|
|
match output["type"]:
|
|
case MinerUContentType.TEXT:
|
|
section = output["text"]
|
|
case MinerUContentType.TABLE:
|
|
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", []))
|
|
if not section.strip():
|
|
section = "FAILED TO PARSE TABLE"
|
|
case MinerUContentType.IMAGE:
|
|
section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", []))
|
|
case MinerUContentType.EQUATION:
|
|
section = output["text"]
|
|
case MinerUContentType.CODE:
|
|
section = output["code_body"] + "\n".join(output.get("code_caption", []))
|
|
case MinerUContentType.LIST:
|
|
section = "\n".join(output.get("list_items", []))
|
|
case MinerUContentType.DISCARDED:
|
|
pass
|
|
|
|
if section and parse_method == "manual":
|
|
sections.append((section, output["type"], self._line_tag(output)))
|
|
elif section and parse_method == "paper":
|
|
sections.append((section + self._line_tag(output), output["type"]))
|
|
else:
|
|
sections.append((section, self._line_tag(output)))
|
|
return sections
|
|
|
|
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
    """Return the table entries for *outputs*.

    This implementation ignores *outputs* and always returns a new empty
    list.
    """
    no_tables = list()
    return no_tables
|
|
|
|
def parse_pdf(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    backend: str = "pipeline",
    lang: Optional[str] = None,
    method: str = "auto",
    server_url: Optional[str] = None,
    delete_output: bool = True,
    parse_method: str = "raw",
) -> tuple:
    """Run MinerU on one PDF and return its sections and tables.

    Args:
        filepath: Path of the PDF. Spaces are stripped from its stem; when
            *binary* is falsy and the stripped name differs, the file on disk
            is moved to the space-free name before parsing.
        binary: PDF content as ``bytes`` or ``BytesIO``. When truthy it is
            written to a temporary directory and parsed from there instead of
            reading *filepath*.
        callback: Optional ``callback(progress, message)`` invoked with
            progress updates; ``-1`` is reported when the PDF is missing.
        output_dir: Directory for MinerU output. When omitted a temporary
            directory is created.
        backend: Forwarded to ``_run_mineru`` and ``_read_output``.
        lang: Forwarded to ``_run_mineru``.
        method: Forwarded to ``_run_mineru`` and ``_read_output``.
        server_url: Forwarded to ``_run_mineru``.
        delete_output: Remove the output directory afterwards, but only if
            this call created it as a temporary directory.
        parse_method: Forwarded to ``_transfer_to_sections``.

    Returns:
        ``(sections, tables)`` as produced by ``_transfer_to_sections`` and
        ``_transfer_to_tables``.

    Raises:
        FileNotFoundError: if no *binary* is given and the PDF does not exist,
            or (from ``_read_output``) if MinerU produced no output file.
    """
    import shutil

    temp_dir = None
    out_dir = None
    created_tmp_dir = False

    # remove spaces, or mineru crash, and _read_output fail too
    file_path = Path(filepath)
    pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
    pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name)

    # Everything that can fail after a temporary directory exists runs inside
    # the try block so the finally clause always gets to clean up.
    try:
        if binary:
            temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
            temp_pdf = temp_dir / pdf_file_name
            # file.write() needs a bytes-like object; unwrap a BytesIO first.
            payload = binary.getvalue() if isinstance(binary, BytesIO) else binary
            with open(temp_pdf, "wb") as f:
                f.write(payload)
            pdf = temp_pdf
            self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
            if callback:
                callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
        else:
            # Compare as paths so a PathLike or "./x.pdf"-style argument that
            # already names the target does not trigger a move onto itself.
            if Path(pdf_file_path_valid) != file_path:
                self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
                shutil.move(filepath, pdf_file_path_valid)
            pdf = Path(pdf_file_path_valid)
            if not pdf.exists():
                if callback:
                    callback(-1, f"[MinerU] PDF not found: {pdf}")
                raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")

        if output_dir:
            out_dir = Path(output_dir)
            out_dir.mkdir(parents=True, exist_ok=True)
        else:
            out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
            created_tmp_dir = True

        self.logger.info(f"[MinerU] Output directory: {out_dir}")
        if callback:
            callback(0.15, f"[MinerU] Output directory: {out_dir}")

        self.__images__(pdf, zoomin=1)

        self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
        outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
        self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
        if callback:
            callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

        return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
    finally:
        # Cleanup is best-effort: failures are logged, never raised, so they
        # cannot mask the parse result or the original exception.
        if temp_dir is not None and temp_dir.exists():
            try:
                shutil.rmtree(temp_dir)
            except Exception as exc:
                self.logger.warning(f"[MinerU] Failed to remove temporary PDF directory {temp_dir}: {exc}")
        if delete_output and created_tmp_dir and out_dir is not None and out_dir.exists():
            try:
                shutil.rmtree(out_dir)
            except Exception as exc:
                self.logger.warning(f"[MinerU] Failed to remove output directory {out_dir}: {exc}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test. The PDF path comes from the command line; the
    # previous hard-coded empty path could only fail with FileNotFoundError.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <path-to-pdf>")

    parser = MinerUParser("mineru")
    ok, reason = parser.check_installation()
    print("MinerU available:", ok)
    if not ok:
        # Show why the installation check failed instead of dropping it.
        print("Reason:", reason)

    filepath = sys.argv[1]
    with open(filepath, "rb") as file:
        outputs = parser.parse_pdf(filepath=filepath, binary=file.read())
        for output in outputs:
            print(output)
|