feat: enhance MinerU crop() with 3 major improvements
1. Sampling optimization: reduce from 20 to 12 images when the count exceeds the threshold
2. Native image width normalization: re-crop page-width strips for consistent stitching
   - Preserves original native images for MinIO storage
   - Uses normalized versions only for thumbnail stitching
3. Low fallback threshold: stitch full-page screenshots when there are ≤3 fallback images
   - Deduplicates and limits to max 3 pages
   - Provides better context for sparse thumbnails
This commit is contained in:
parent
02a4b79f90
commit
58792dfe99
1 changed files with 124 additions and 6 deletions
|
|
@ -385,9 +385,10 @@ class MinerUParser(RAGFlowPdfParser):
|
|||
native_img_path = self._find_native_image_path(tag)
|
||||
if native_img_path:
|
||||
try:
|
||||
img = Image.open(native_img_path)
|
||||
# ✅ 使用页宽标准化版本(原生图保留入库MinIO)
|
||||
img = self._normalize_native_image_width(native_img_path, tag)
|
||||
images_to_stitch.append(("native", img, pos, tag))
|
||||
self.logger.debug(f"[MinerU] Using native image for tag: {tag}")
|
||||
self.logger.debug(f"[MinerU] Using normalized native image for tag: {tag}")
|
||||
continue
|
||||
except Exception as e:
|
||||
self.logger.debug(f"[MinerU] Failed to load native image {native_img_path}: {e}")
|
||||
|
|
@ -417,6 +418,12 @@ class MinerUParser(RAGFlowPdfParser):
|
|||
return None, None
|
||||
return
|
||||
|
||||
# ✅ 兜底图≤3张时,拼接完整页(去重)
|
||||
fallback_count = sum(1 for src, _, _, _ in images_to_stitch if src == "cached")
|
||||
if fallback_count <= 3 and fallback_count > 0:
|
||||
self.logger.debug(f"[MinerU] Fallback count = {fallback_count}, using full page strategy")
|
||||
return self._handle_low_fallback_count(poss, need_position)
|
||||
|
||||
# Step 2: 智能拼接(带阈值控制)
|
||||
return self._smart_stitch_with_thresholds(images_to_stitch, need_position)
|
||||
|
||||
|
|
@ -426,6 +433,116 @@ class MinerUParser(RAGFlowPdfParser):
|
|||
native_map = getattr(self, "_native_img_map", {})
|
||||
return native_map.get(tag)
|
||||
|
||||
def _normalize_native_image_width(self, native_img_path, tag):
    """
    Build a page-width version of a native image (used for stitching).

    The tag is parsed for a 1-based page number and a bounding box. A
    horizontal strip is then re-cropped from the rendered page image:
    horizontally from 0 to the page width, vertically over the y-range
    that ``_bbox_to_pixels`` returns for the bbox. This method only
    opens ``native_img_path``; it never writes to it.

    Args:
        native_img_path: Path of the native image, opened as the fallback.
        tag: Position tag shaped like ``@@<page> <x0> <x1> <y0> <y1>##``
            (whitespace separated, non-negative decimals).

    Returns:
        The page-width strip cropped from the page image. Falls back to
        the image opened from ``native_img_path`` when the tag does not
        parse, no usable page image exists for that page, the strip would
        be less than 2px tall, or any step raises.
    """
    try:
        import re

        parsed = re.match(r"@@(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)##", tag)
        if parsed is None:
            # Unparseable tag: nothing to crop from, use the native image.
            return Image.open(native_img_path)

        page_no, left, right, top, bottom = parsed.groups()
        zero_based_page = int(page_no) - 1
        # The tag lists x0, x1, y0, y1; the bbox list is ordered x0, y0, x1, y1.
        box = [float(left), float(top), float(right), float(bottom)]

        # No page renders at all, or the page number is out of range.
        pages = getattr(self, "page_images", None)
        if not pages or not (0 <= zero_based_page < len(pages)):
            return Image.open(native_img_path)

        page = pages[zero_based_page]
        page_width, page_height = page.size

        # Only the vertical extent is used; the strip always spans the full width.
        _, pixel_top, _, pixel_bottom = self._bbox_to_pixels(box, (page_width, page_height))

        # Clamp the vertical range into the page.
        strip_top = max(0, min(pixel_top, page_height))
        strip_bottom = max(strip_top + 1, min(pixel_bottom, page_height))
        strip_height = strip_bottom - strip_top

        if strip_height < 2:
            # Degenerate bbox: fall back to the native image.
            return Image.open(native_img_path)

        strip = page.crop((0, strip_top, page_width, strip_bottom))
        self.logger.debug(f"[MinerU] Normalized native image to page-width: {page_width}x{strip_height}px")
        return strip

    except Exception as e:
        self.logger.debug(f"[MinerU] Failed to normalize native image, using original: {e}")
        return Image.open(native_img_path)
|
||||
|
||||
def _handle_low_fallback_count(self, poss, need_position):
    """
    Return full-page screenshot(s) for the pages referenced by ``poss``.

    Strategy:
        - Take the first page index of every position entry, keeping only
          indices that are valid for ``self.page_images``.
        - De-duplicate, sort, and keep at most 3 pages.
        - One page: return that page image directly.
        - Several pages: stitch them vertically via
          ``_stitch_images_vertically`` with a gap of 10.

    Args:
        poss: Iterable of 5-tuples whose first element is a sequence of
            0-based page indices; the other four elements are ignored here.
        need_position: When truthy, positions are returned alongside the image.

    Returns:
        ``(None, None)`` if ``need_position`` else ``None`` when there are
        no page images or no valid page index. For a single page, the page
        image, paired with ``[[page_idx, 0, width, 0, height]]`` when
        ``need_position`` is truthy. Otherwise whatever
        ``_stitch_images_vertically`` returns.
    """
    # Shape of the "nothing to return" result depends on the caller's mode.
    no_result = (None, None) if need_position else None

    pages = getattr(self, "page_images", None)
    if not pages:
        return no_result

    # Collect the distinct, in-range first page index of each position.
    seen = set()
    for pns, _, _, _, _ in poss:
        if pns and 0 <= pns[0] < len(pages):
            seen.add(pns[0])

    # Ascending order, capped at 3 pages.
    page_indices = sorted(seen)[:3]
    if not page_indices:
        return no_result

    self.logger.info(f"[MinerU] Low fallback count, stitching {len(page_indices)} page(s): {[idx+1 for idx in page_indices]}")

    # A single page needs no stitching.
    if len(page_indices) == 1:
        only_idx = page_indices[0]
        page_img = pages[only_idx]
        if not need_position:
            return page_img
        return page_img, [[only_idx, 0, page_img.width, 0, page_img.height]]

    # Several pages: hand them to the vertical stitcher as "fullpage" entries.
    stitch_inputs = []
    for idx in page_indices:
        stitch_inputs.append(
            ("fullpage", pages[idx], ([idx], 0, 0, 0, 0), f"@@{idx+1}\t0\t0\t0\t0##")
        )

    return self._stitch_images_vertically(stitch_inputs, need_position, gap=10)
|
||||
|
||||
|
||||
def _smart_stitch_with_thresholds(self, images_with_metadata, need_position):
|
||||
"""
|
||||
智能拼接:应用阈值控制
|
||||
|
|
@ -435,18 +552,19 @@ class MinerUParser(RAGFlowPdfParser):
|
|||
- MAX_HEIGHT: 总高度不超过4000px
|
||||
|
||||
Strategies:
|
||||
- 数量过多: 均匀采样(保留首尾)
|
||||
- 数量过多: 均匀采样到12张(保留首尾)
|
||||
- 高度过高: 截断到4000px
|
||||
- 不缩放图片(保持高清)
|
||||
"""
|
||||
MAX_COUNT = 20
|
||||
SAMPLE_TARGET = 12 # 采样目标数量
|
||||
MAX_HEIGHT = 4000
|
||||
GAP = 6
|
||||
|
||||
# 1. 数量控制:如果超过20张,均匀采样
|
||||
# 1. 数量控制:如果超过20张,均匀采样到12张
|
||||
if len(images_with_metadata) > MAX_COUNT:
|
||||
self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {MAX_COUNT}")
|
||||
images_with_metadata = self._sample_images_uniformly(images_with_metadata, MAX_COUNT)
|
||||
self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {SAMPLE_TARGET}")
|
||||
images_with_metadata = self._sample_images_uniformly(images_with_metadata, SAMPLE_TARGET)
|
||||
|
||||
# 2. 高度控制:累加到4000px为止
|
||||
trimmed_images = []
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue