feat: enhance MinerU crop() with 3 major improvements

1. Sampling optimization: when the image count exceeds the threshold (20), sample down to 12 images instead of 20.
2. Native image width normalization: re-crop page-width strips for consistent stitching.
   - Preserves the original native images for MinIO storage.
   - Uses the normalized versions only for thumbnail stitching.
3. Low fallback threshold: stitch full-page screenshots when there are between 1 and 3 fallback images.
   - Deduplicates the pages involved and limits them to at most 3.
   - Provides better context for sparse thumbnails.
2025-12-12 15:19:13 +08:00 · 2025-12-12 15:19:13 +08:00 · 58792dfe99
commit 58792dfe99
parent 02a4b79f90
1 changed files with 124 additions and 6 deletions
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@ -385,9 +385,10 @@ class MinerUParser(RAGFlowPdfParser):
            native_img_path = self._find_native_image_path(tag)
            if native_img_path:
                try:
-                    img = Image.open(native_img_path)
+                    # ✅ 使用页宽标准化版本（原生图保留入库MinIO）
                    img = self._normalize_native_image_width(native_img_path, tag)
                    images_to_stitch.append(("native", img, pos, tag))
-                    self.logger.debug(f"[MinerU] Using native image for tag: {tag}")
+                    self.logger.debug(f"[MinerU] Using normalized native image for tag: {tag}")
                    continue
                except Exception as e:
                    self.logger.debug(f"[MinerU] Failed to load native image {native_img_path}: {e}")
@ -417,6 +418,12 @@ class MinerUParser(RAGFlowPdfParser):
                return None, None
            return
        # ✅ 兜底图≤3张时，拼接完整页（去重）
        fallback_count = sum(1 for src, _, _, _ in images_to_stitch if src == "cached")
        if fallback_count <= 3 and fallback_count > 0:
            self.logger.debug(f"[MinerU] Fallback count = {fallback_count}, using full page strategy")
            return self._handle_low_fallback_count(poss, need_position)
        # Step 2: 智能拼接（带阈值控制）
        return self._smart_stitch_with_thresholds(images_to_stitch, need_position)
@ -426,6 +433,116 @@ class MinerUParser(RAGFlowPdfParser):
        native_map = getattr(self, "_native_img_map", {})
        return native_map.get(tag)
    def _normalize_native_image_width(self, native_img_path, tag):
        """
        将Native图标准化为页宽版本（仅用于拼接）
        原理：根据tag中的bbox，从页面重新裁剪页宽条带
        - 横向：0 到 页宽
        - 纵向：bbox的y范围
        Args:
            native_img_path: MinerU原生图路径（保留入库MinIO）
            tag: 包含page_idx和bbox信息的tag字符串
        Returns:
            页宽标准化后的Image对象，失败则返回原生图
        """
        try:
            # 解析tag获取page_idx和bbox
            import re
            match = re.match(r"@@(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)##", tag)
            if not match:
                # 解析失败，返回原生图
                return Image.open(native_img_path)
            page_num, x0_str, x1_str, y0_str, y1_str = match.groups()
            page_idx = int(page_num) - 1  # 转为0-based
            bbox = [float(x0_str), float(y0_str), float(x1_str), float(y1_str)]
            # 检查page_images可用性
            if not hasattr(self, "page_images") or not self.page_images:
                return Image.open(native_img_path)
            if page_idx < 0 or page_idx >= len(self.page_images):
                return Image.open(native_img_path)
            # 获取页面图片
            page_img = self.page_images[page_idx]
            page_width, page_height = page_img.size
            # bbox转像素
            px0, py0, px1, py1 = self._bbox_to_pixels(bbox, (page_width, page_height))
            # 裁剪页宽条带（横向全宽，纵向bbox范围）
            crop_y0 = max(0, min(py0, page_height))
            crop_y1 = max(crop_y0 + 1, min(py1, page_height))
            if crop_y1 - crop_y0 < 2:
                # bbox无效，返回原生图
                return Image.open(native_img_path)
            page_width_img = page_img.crop((0, crop_y0, page_width, crop_y1))
            self.logger.debug(f"[MinerU] Normalized native image to page-width: {page_width}x{crop_y1-crop_y0}px")
            return page_width_img
        except Exception as e:
            self.logger.debug(f"[MinerU] Failed to normalize native image, using original: {e}")
            return Image.open(native_img_path)
    def _handle_low_fallback_count(self, poss, need_position):
        """
        兜底图≤3张时，拼接涉及页面截图（去重）
        策略：
        - 提取所有涉及页码
        - 去重并限制最多3页
        - 拼接这些完整页
        Args:
            poss: positions列表
            need_position: 是否需要返回positions
        Returns:
            拼接的完整页截图，或单页截图
        """
        if not hasattr(self, "page_images") or not self.page_images:
            if need_position:
                return None, None
            return
        # 提取所有涉及页码（0-based），去重并排序
        page_indices = sorted(set(
            pns[0] for pns, _, _, _, _ in poss 
            if pns and 0 <= pns[0] < len(self.page_images)
        ))
        # 限制最多3页
        page_indices = page_indices[:3]
        if not page_indices:
            if need_position:
                return None, None
            return
        self.logger.info(f"[MinerU] Low fallback count, stitching {len(page_indices)} page(s): {[idx+1 for idx in page_indices]}")
        # 单页直接返回
        if len(page_indices) == 1:
            page_img = self.page_images[page_indices[0]]
            if need_position:
                return page_img, [[page_indices[0], 0, page_img.width, 0, page_img.height]]
            return page_img
        # 多页垂直拼接
        page_imgs_with_meta = [
            ("fullpage", self.page_images[idx], ([idx], 0, 0, 0, 0), f"@@{idx+1}\t0\t0\t0\t0##")
            for idx in page_indices
        ]
        return self._stitch_images_vertically(page_imgs_with_meta, need_position, gap=10)
    def _smart_stitch_with_thresholds(self, images_with_metadata, need_position):
        """
        智能拼接：应用阈值控制
@ -435,18 +552,19 @@ class MinerUParser(RAGFlowPdfParser):
        - MAX_HEIGHT: 总高度不超过4000px
        Strategies:
-        - 数量过多: 均匀采样（保留首尾）
+        - 数量过多: 均匀采样到12张（保留首尾）
        - 高度过高: 截断到4000px
        - 不缩放图片（保持高清）
        """
        MAX_COUNT = 20
        SAMPLE_TARGET = 12  # 采样目标数量
        MAX_HEIGHT = 4000
        GAP = 6
-        # 1. 数量控制：如果超过20张，均匀采样
+        # 1. 数量控制：如果超过20张，均匀采样到12张
        if len(images_with_metadata) > MAX_COUNT:
-            self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {MAX_COUNT}")
+            self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {SAMPLE_TARGET}")
-            images_with_metadata = self._sample_images_uniformly(images_with_metadata, MAX_COUNT)
+            images_with_metadata = self._sample_images_uniformly(images_with_metadata, SAMPLE_TARGET)
        # 2. 高度控制：累加到4000px为止
        trimmed_images = []