From 02a4b79f905007c10fad37c0a76db87d28c53a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Thu, 11 Dec 2025 21:12:00 +0800 Subject: [PATCH] chore: increase image stitching thresholds to 20/4000px - MAX_COUNT: 10 -> 20 images - MAX_HEIGHT: 2000px -> 4000px - Allows more complete chunk thumbnails for long documents --- deepdoc/parser/mineru_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 7d2fa0110..3a4e8bf10 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -431,24 +431,24 @@ class MinerUParser(RAGFlowPdfParser): 智能拼接:应用阈值控制 Thresholds: - - MAX_COUNT: 最多10张图 - - MAX_HEIGHT: 总高度不超过2000px + - MAX_COUNT: 最多20张图 + - MAX_HEIGHT: 总高度不超过4000px Strategies: - 数量过多: 均匀采样(保留首尾) - - 高度过高: 截断到2000px + - 高度过高: 截断到4000px - 不缩放图片(保持高清) """ - MAX_COUNT = 10 - MAX_HEIGHT = 2000 + MAX_COUNT = 20 + MAX_HEIGHT = 4000 GAP = 6 - # 1. 数量控制:如果超过10张,均匀采样 + # 1. 数量控制:如果超过20张,均匀采样 if len(images_with_metadata) > MAX_COUNT: self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {MAX_COUNT}") images_with_metadata = self._sample_images_uniformly(images_with_metadata, MAX_COUNT) - # 2. 高度控制:累加到2000px为止 + # 2. 高度控制:累加到4000px为止 trimmed_images = [] current_height = 0