Fix: Generate missing images for MinerU text blocks using local crop

2025-12-09 19:53:56 +08:00 · 2025-12-09 19:53:56 +08:00 · b443d34faf
commit b443d34faf
parent 65a5a56d95
1 changed files with 70 additions and 0 deletions
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@ -51,6 +51,8 @@ class MinerUContentType(StrEnum):
    CODE = "code"
    LIST = "list"
    DISCARDED = "discarded"
    HEADER = "header"
    PAGE_NUMBER = "page_number"
 class MinerUParser(RAGFlowPdfParser):
@ -459,6 +461,67 @@ class MinerUParser(RAGFlowPdfParser):
            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
        return poss
    def _bbox_to_pixels(self, bbox, page_size):
        x0, y0, x1, y1 = bbox
        pw, ph = page_size
        maxv = max(bbox)
        # 经验：MinerU bbox 常为 0~1000 归一化；否则认为已是像素
        if maxv <= 1.5:
            sx, sy = pw, ph
        elif maxv <= 1200:
            sx, sy = pw / 1000.0, ph / 1000.0
        else:
            sx, sy = 1.0, 1.0
        return (
            int(x0 * sx),
            int(y0 * sy),
            int(x1 * sx),
            int(y1 * sy),
        )
    def _generate_missing_images(self, outputs: list[dict[str, Any]], subdir: Path, file_stem: str):
        """Crop fallback images for text-like blocks that have no ``img_path``.

        For every item in ``outputs`` whose type is TEXT, LIST, CODE or HEADER
        and whose ``img_path`` is empty, the block's ``bbox`` is cropped out of
        ``self.page_images[page_idx]`` and saved as a JPEG under
        ``<subdir>/generated_images``. The item's ``img_path`` is then set to
        the absolute path of the saved file (``outputs`` is modified in place).

        Generation is best-effort: an item that cannot be processed is logged
        at debug level and skipped without affecting the remaining items.

        Args:
            outputs: Parsed MinerU content items.
            subdir: Directory under which ``generated_images`` is created.
            file_stem: Prefix used for the generated file names.
        """
        page_images = getattr(self, "page_images", None)
        if not page_images or not subdir:
            return

        img_root = subdir / "generated_images"
        img_root.mkdir(parents=True, exist_ok=True)

        text_types = {MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.CODE, MinerUContentType.HEADER}
        # Modes such as RGBA, LA or P cannot be written as JPEG; anything
        # outside this set is converted to RGB before saving.
        jpeg_modes = {"L", "RGB", "CMYK"}
        generated = 0

        for idx, item in enumerate(outputs):
            if item.get("type") not in text_types:
                continue
            if item.get("img_path"):
                continue
            bbox = item.get("bbox")
            if not bbox or len(bbox) != 4:
                continue

            page_idx = item.get("page_idx", 0)
            # Everything that can fail for a single item lives inside the try,
            # so one malformed item (e.g. page_idx=None, non-numeric bbox)
            # cannot abort generation for the rest of the document.
            try:
                page_idx = int(page_idx)
                if page_idx < 0 or page_idx >= len(page_images):
                    continue
                page_image = page_images[page_idx]
                x0, y0, x1, y1 = self._bbox_to_pixels(bbox, page_image.size)
                # guard invalid / degenerate bbox
                if x1 - x0 < 2 or y1 - y0 < 2:
                    continue
                crop = page_image.crop((x0, y0, x1, y1))
                if crop.mode not in jpeg_modes:
                    crop = crop.convert("RGB")
                out_path = img_root / f"{file_stem}_gen_{idx}.jpg"
                crop.save(out_path, format="JPEG", quality=80)
                item["img_path"] = str(out_path.resolve())
                generated += 1
            except Exception as e:
                self.logger.debug(f"[MinerU] skip image gen idx={idx} page={page_idx}: {e}")
                continue

        if generated:
            self.logger.info(f"[MinerU] generated {generated} fallback images for text blocks")
    def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
        candidates = []
        seen = set()
@ -532,6 +595,13 @@ class MinerUParser(RAGFlowPdfParser):
            for key in ("img_path", "table_img_path", "equation_img_path"):
                if key in item and item[key]:
                    item[key] = str((subdir / item[key]).resolve())
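        # MinerU (vlm-http-client) does not produce images for plain-text blocks; as a fallback, crop them from the local page images so they can be referenced later / stored in MinIO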
        try:
            self._generate_missing_images(data, subdir, file_stem)
        except Exception as e:
            self.logger.warning(f"[MinerU] generate missing images failed: {e}")
        return data
    def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):