feat: enhance MinerU crop() with 3 major improvements

1. Sampling optimization: when the image count exceeds the threshold (20), sample down to 12 images instead of 20. 2. Native image width normalization: re-crop page-width strips for consistent stitching - Preserves original native images for MinIO storage - Uses normalized versions only for thumbnail stitching. 3. Low fallback threshold: stitch full-page screenshots when there are 1 to 3 (≤3) fallback images - Deduplicates pages and limits to max 3 pages - Provides better context for sparse thumbnails.
2025-12-12 15:19:13 +08:00 · 2025-12-12 15:19:13 +08:00 · 58792dfe99
commit 58792dfe99
parent 02a4b79f90
1 changed files with 124 additions and 6 deletions
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@ -385,9 +385,10 @@ class MinerUParser(RAGFlowPdfParser):
            native_img_path = self._find_native_image_path(tag)
            if native_img_path:
                try:
-                    img = Image.open(native_img_path)
+                    # ✅ 使用页宽标准化版本（原生图保留入库MinIO）
+                    img = self._normalize_native_image_width(native_img_path, tag)
                    images_to_stitch.append(("native", img, pos, tag))
-                    self.logger.debug(f"[MinerU] Using native image for tag: {tag}")
+                    self.logger.debug(f"[MinerU] Using normalized native image for tag: {tag}")
                    continue
                except Exception as e:
                    self.logger.debug(f"[MinerU] Failed to load native image {native_img_path}: {e}")
@ -417,6 +418,12 @@ class MinerUParser(RAGFlowPdfParser):
                return None, None
            return
        
+        # ✅ 兜底图≤3张时，拼接完整页（去重）
+        fallback_count = sum(1 for src, _, _, _ in images_to_stitch if src == "cached")
+        if fallback_count <= 3 and fallback_count > 0:
+            self.logger.debug(f"[MinerU] Fallback count = {fallback_count}, using full page strategy")
+            return self._handle_low_fallback_count(poss, need_position)
+        
        # Step 2: 智能拼接（带阈值控制）
        return self._smart_stitch_with_thresholds(images_to_stitch, need_position)
    
@ -426,6 +433,116 @@ class MinerUParser(RAGFlowPdfParser):
        native_map = getattr(self, "_native_img_map", {})
        return native_map.get(tag)
    
+    def _normalize_native_image_width(self, native_img_path, tag):
+        """
+        将Native图标准化为页宽版本（仅用于拼接）
+        
+        原理：根据tag中的bbox，从页面重新裁剪页宽条带
+        - 横向：0 到 页宽
+        - 纵向：bbox的y范围
+        
+        Args:
+            native_img_path: MinerU原生图路径（保留入库MinIO）
+            tag: 包含page_idx和bbox信息的tag字符串
+            
+        Returns:
+            页宽标准化后的Image对象，失败则返回原生图
+        """
+        try:
+            # 解析tag获取page_idx和bbox
+            import re
+            match = re.match(r"@@(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)##", tag)
+            if not match:
+                # 解析失败，返回原生图
+                return Image.open(native_img_path)
+            
+            page_num, x0_str, x1_str, y0_str, y1_str = match.groups()
+            page_idx = int(page_num) - 1  # 转为0-based
+            bbox = [float(x0_str), float(y0_str), float(x1_str), float(y1_str)]
+            
+            # 检查page_images可用性
+            if not hasattr(self, "page_images") or not self.page_images:
+                return Image.open(native_img_path)
+            
+            if page_idx < 0 or page_idx >= len(self.page_images):
+                return Image.open(native_img_path)
+            
+            # 获取页面图片
+            page_img = self.page_images[page_idx]
+            page_width, page_height = page_img.size
+            
+            # bbox转像素
+            px0, py0, px1, py1 = self._bbox_to_pixels(bbox, (page_width, page_height))
+            
+            # 裁剪页宽条带（横向全宽，纵向bbox范围）
+            crop_y0 = max(0, min(py0, page_height))
+            crop_y1 = max(crop_y0 + 1, min(py1, page_height))
+            
+            if crop_y1 - crop_y0 < 2:
+                # bbox无效，返回原生图
+                return Image.open(native_img_path)
+            
+            page_width_img = page_img.crop((0, crop_y0, page_width, crop_y1))
+            self.logger.debug(f"[MinerU] Normalized native image to page-width: {page_width}x{crop_y1-crop_y0}px")
+            return page_width_img
+            
+        except Exception as e:
+            self.logger.debug(f"[MinerU] Failed to normalize native image, using original: {e}")
+            return Image.open(native_img_path)
+    
+    def _handle_low_fallback_count(self, poss, need_position):
+        """
+        兜底图≤3张时，拼接涉及页面截图（去重）
+        
+        策略：
+        - 提取所有涉及页码
+        - 去重并限制最多3页
+        - 拼接这些完整页
+        
+        Args:
+            poss: positions列表
+            need_position: 是否需要返回positions
+            
+        Returns:
+            拼接的完整页截图，或单页截图
+        """
+        if not hasattr(self, "page_images") or not self.page_images:
+            if need_position:
+                return None, None
+            return
+        
+        # 提取所有涉及页码（0-based），去重并排序
+        page_indices = sorted(set(
+            pns[0] for pns, _, _, _, _ in poss 
+            if pns and 0 <= pns[0] < len(self.page_images)
+        ))
+        
+        # 限制最多3页
+        page_indices = page_indices[:3]
+        
+        if not page_indices:
+            if need_position:
+                return None, None
+            return
+        
+        self.logger.info(f"[MinerU] Low fallback count, stitching {len(page_indices)} page(s): {[idx+1 for idx in page_indices]}")
+        
+        # 单页直接返回
+        if len(page_indices) == 1:
+            page_img = self.page_images[page_indices[0]]
+            if need_position:
+                return page_img, [[page_indices[0], 0, page_img.width, 0, page_img.height]]
+            return page_img
+        
+        # 多页垂直拼接
+        page_imgs_with_meta = [
+            ("fullpage", self.page_images[idx], ([idx], 0, 0, 0, 0), f"@@{idx+1}\t0\t0\t0\t0##")
+            for idx in page_indices
+        ]
+        
+        return self._stitch_images_vertically(page_imgs_with_meta, need_position, gap=10)
+    
+    
    def _smart_stitch_with_thresholds(self, images_with_metadata, need_position):
        """
        智能拼接：应用阈值控制
@ -435,18 +552,19 @@ class MinerUParser(RAGFlowPdfParser):
        - MAX_HEIGHT: 总高度不超过4000px
        
        Strategies:
-        - 数量过多: 均匀采样（保留首尾）
+        - 数量过多: 均匀采样到12张（保留首尾）
        - 高度过高: 截断到4000px
        - 不缩放图片（保持高清）
        """
        MAX_COUNT = 20
+        SAMPLE_TARGET = 12  # 采样目标数量
        MAX_HEIGHT = 4000
        GAP = 6
        
-        # 1. 数量控制：如果超过20张，均匀采样
+        # 1. 数量控制：如果超过20张，均匀采样到12张
        if len(images_with_metadata) > MAX_COUNT:
-            self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {MAX_COUNT}")
-            images_with_metadata = self._sample_images_uniformly(images_with_metadata, MAX_COUNT)
+            self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {SAMPLE_TARGET}")
+            images_with_metadata = self._sample_images_uniformly(images_with_metadata, SAMPLE_TARGET)
        
        # 2. 高度控制：累加到4000px为止
        trimmed_images = []