From b443d34faf4c1619fd51794bd998b21b26b669bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Tue, 9 Dec 2025 19:53:56 +0800 Subject: [PATCH 1/9] Fix: Generate missing images for MinerU text blocks using local crop --- deepdoc/parser/mineru_parser.py | 70 +++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 7e3919bbd..be099bc9e 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -51,6 +51,8 @@ class MinerUContentType(StrEnum): CODE = "code" LIST = "list" DISCARDED = "discarded" + HEADER = "header" + PAGE_NUMBER = "page_number" class MinerUParser(RAGFlowPdfParser): @@ -459,6 +461,67 @@ class MinerUParser(RAGFlowPdfParser): poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) return poss + def _bbox_to_pixels(self, bbox, page_size): + x0, y0, x1, y1 = bbox + pw, ph = page_size + maxv = max(bbox) + # 经验:MinerU bbox 常为 0~1000 归一化;否则认为已是像素 + if maxv <= 1.5: + sx, sy = pw, ph + elif maxv <= 1200: + sx, sy = pw / 1000.0, ph / 1000.0 + else: + sx, sy = 1.0, 1.0 + return ( + int(x0 * sx), + int(y0 * sy), + int(x1 * sx), + int(y1 * sy), + ) + + def _generate_missing_images(self, outputs: list[dict[str, Any]], subdir: Path, file_stem: str): + if not getattr(self, "page_images", None): + return + if not subdir: + return + img_root = subdir / "generated_images" + img_root.mkdir(parents=True, exist_ok=True) + text_types = {MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.CODE, MinerUContentType.HEADER} + generated = 0 + for idx, item in enumerate(outputs): + if item.get("type") not in text_types: + continue + if item.get("img_path"): + continue + + bbox = item.get("bbox") + if not bbox or len(bbox) != 4: + continue + + page_idx = int(item.get("page_idx", 0)) + if page_idx < 0 or page_idx >= len(self.page_images): + continue + + x0, y0, x1, y1 = self._bbox_to_pixels(bbox, self.page_images[page_idx].size) + + # guard invalid bbox + if x1 - x0 < 2 or y1 - y0 < 2: + continue + + try: + crop = self.page_images[page_idx].crop((x0, y0, x1, y1)) + fname = f"{file_stem}_gen_{idx}.jpg" + out_path = img_root / fname + crop.save(out_path, format="JPEG", quality=80) + item["img_path"] = str(out_path.resolve()) + generated += 1 + except Exception as e: + self.logger.debug(f"[MinerU] skip image gen idx={idx} page={page_idx}: {e}") + continue + + if generated: + self.logger.info(f"[MinerU] generated {generated} fallback images for text blocks") + def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: candidates = [] seen = set() @@ -532,6 +595,13 @@ class MinerUParser(RAGFlowPdfParser): for key in ("img_path", "table_img_path", "equation_img_path"): if key in item and item[key]: item[key] = str((subdir / item[key]).resolve()) + + # MinerU(vlm-http-client) 不会为纯文本生成图片,这里兜底用本地页图裁剪生成,方便后续引用/MinIO 存图 + try: + self._generate_missing_images(data, subdir, file_stem) + except Exception as e: + self.logger.warning(f"[MinerU] generate missing images failed: {e}") + return data def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): From eb004b62542d7f13df329c2e4ace56ac1a92b165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Tue, 9 Dec 2025 20:28:47 +0800 Subject: [PATCH 2/9] fix(mineru): use cached img_path in crop() to consume generated_images - Add _img_path_cache dict to cache line_tag -> img_path mapping - 
Populate cache in _generate_missing_images for fallback text block images - Refactor crop() to check cache first, return cached image directly - Fallback to single-position cropping to avoid super-tall merged images - Fix text_types to use both string literals and enums for compatibility - Add bbox clamping to prevent cropping errors --- deepdoc/parser/mineru_parser.py | 90 +++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 21 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index be099bc9e..eaeedbb20 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -63,6 +63,7 @@ class MinerUParser(RAGFlowPdfParser): self.using_api = False self.outlines = [] self.logger = logging.getLogger(self.__class__.__name__) + self._img_path_cache = {} # line_tag -> img_path mapping for crop() lookup def _extract_zip_no_root(self, zip_path, extract_to, root_dir): self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}") @@ -334,13 +335,33 @@ class MinerUParser(RAGFlowPdfParser): return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott) def crop(self, text, ZM=1, need_position=False): - imgs = [] + """Crop image for chunk. Prioritize the cached img_path from MinerU or fallback generation, then fall back to page crop.""" poss = self.extract_positions(text) if not poss: if need_position: return None, None return + + # Prefer the cached img_path (from MinerU or _generate_missing_images) + cache = getattr(self, "_img_path_cache", {}) + for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text): + # Try an exact or approximate match against the cache + if tag in cache: + try: + img = Image.open(cache[tag]) + if need_position: + # Extract position info from the first position + first_pos = poss[0] + pn = first_pos[0][0] if first_pos[0] else 0 + left, right, top, bottom = first_pos[1], first_pos[2], first_pos[3], first_pos[4] + positions = [(pn + getattr(self, "page_from", 0), int(left), int(right), int(top), int(bottom))] + return img, positions + return img + except Exception as e: + self.logger.debug(f"[MinerU] cached img_path load failed: {e}") + break # fallback to crop + # Fallback: crop from page_images if not getattr(self, "page_images", None): self.logger.warning("[MinerU] crop called without page images; skipping image generation.") if need_position: @@ -352,21 +373,22 @@ class MinerUParser(RAGFlowPdfParser): filtered_poss = [] for pns, left, right, top, bottom in poss: if not pns: - self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.") continue valid_pns = [p for p in pns if 0 <= p < page_count] if not valid_pns: - self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.") continue filtered_poss.append((valid_pns, left, right, top, bottom)) poss = filtered_poss if not poss: - self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.") if need_position: return None, None return + # Avoid super-tall stitched images - keep only the first position + if len(poss) > 1: + poss = [poss[0]] + max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6) GAP = 6 pos = poss[0] @@ -486,7 +508,7 @@ class MinerUParser(RAGFlowPdfParser): return img_root = subdir / "generated_images" img_root.mkdir(parents=True, exist_ok=True) - text_types = {MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.CODE, MinerUContentType.HEADER} + text_types = {"text", "list", "header", "code", MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.EQUATION, 
MinerUContentType.CODE} generated = 0 for idx, item in enumerate(outputs): if item.get("type") not in text_types: @@ -504,23 +526,43 @@ class MinerUParser(RAGFlowPdfParser): x0, y0, x1, y1 = self._bbox_to_pixels(bbox, self.page_images[page_idx].size) + # clamp to page boundary + pw, ph = self.page_images[page_idx].size + x0 = max(0, min(x0, pw)) + y0 = max(0, min(y0, ph)) + x1 = max(0, min(x1, pw)) + y1 = max(0, min(y1, ph)) + # guard invalid bbox if x1 - x0 < 2 or y1 - y0 < 2: continue try: - crop = self.page_images[page_idx].crop((x0, y0, x1, y1)) + cropped = self.page_images[page_idx].crop((x0, y0, x1, y1)) fname = f"{file_stem}_gen_{idx}.jpg" out_path = img_root / fname - crop.save(out_path, format="JPEG", quality=80) - item["img_path"] = str(out_path.resolve()) + cropped.save(out_path, format="JPEG", quality=80) + img_path_str = str(out_path.resolve()) + item["img_path"] = img_path_str + + # Cache for crop() lookup: map line_tag to img_path + # 缓存两种格式的 key,确保无论 _transfer_to_sections 怎么生成 tag 都能匹配 + line_tag = self._line_tag(item) + self._img_path_cache[line_tag] = img_path_str + + # 同时缓存原始 bbox 格式 (不依赖 page_images 的归一化坐标) + raw_bbox = item.get("bbox", [0, 0, 0, 0]) + raw_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format( + page_idx + 1, float(raw_bbox[0]), float(raw_bbox[2]), float(raw_bbox[1]), float(raw_bbox[3]) + ) + self._img_path_cache[raw_tag] = img_path_str generated += 1 except Exception as e: self.logger.debug(f"[MinerU] skip image gen idx={idx} page={page_idx}: {e}") continue if generated: - self.logger.info(f"[MinerU] generated {generated} fallback images for text blocks") + self.logger.info(f"[MinerU] generated {generated} fallback images, cached {len(self._img_path_cache)} tags") def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: candidates = [] @@ -607,29 +649,35 @@ class MinerUParser(RAGFlowPdfParser): def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): sections = [] for output in outputs: - match output["type"]: - case MinerUContentType.TEXT: - section = output["text"] - case MinerUContentType.TABLE: + section = None + content_type = output.get("type", "") + + # 使用字符串匹配,兼容 MinerU API 返回的原始类型 + match content_type: + case "text" | MinerUContentType.TEXT: + section = output.get("text", "") + case "table" | MinerUContentType.TABLE: section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", [])) if not section.strip(): section = "FAILED TO PARSE TABLE" - case MinerUContentType.IMAGE: + case "image" | MinerUContentType.IMAGE: section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", [])) - case MinerUContentType.EQUATION: - section = output["text"] - case MinerUContentType.CODE: - section = output["code_body"] + "\n".join(output.get("code_caption", [])) - case MinerUContentType.LIST: + case "equation" | MinerUContentType.EQUATION: + section = output.get("text", "") + case "code" | MinerUContentType.CODE: + section = output.get("code_body", "") + "\n".join(output.get("code_caption", [])) + case "list" | MinerUContentType.LIST: section = "\n".join(output.get("list_items", [])) - case MinerUContentType.DISCARDED: + case "header": + section = output.get("text", "") + case "discarded" | MinerUContentType.DISCARDED: pass if section and parse_method == "manual": sections.append((section, output["type"], self._line_tag(output))) elif section and parse_method 
== "paper": sections.append((section + self._line_tag(output), output["type"])) - else: + elif section: sections.append((section, self._line_tag(output))) return sections From 8049cb9275f46f7e69b8dc9563f32020b589b5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Tue, 9 Dec 2025 22:17:15 +0800 Subject: [PATCH 3/9] fix(mineru): use consistent 0-1000 normalized coords for line_tag cache matching --- deepdoc/parser/mineru_parser.py | 55 ++++++++++++++------------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index eaeedbb20..943cc8cff 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -334,6 +334,13 @@ class MinerUParser(RAGFlowPdfParser): return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott) + def _raw_line_tag(self, bx): + """生成原始归一化坐标(0-1000)的line_tag,用于缓存key匹配""" + pn = bx.get("page_idx", 0) + 1 + bbox = bx.get("bbox", [0, 0, 0, 0]) + x0, y0, x1, y1 = bbox + return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, x0, x1, y0, y1) + def crop(self, text, ZM=1, need_position=False): """Crop image for chunk. Prioritize cached img_path from MinerU/兜底生成, fallback to page crop.""" poss = self.extract_positions(text) @@ -545,16 +552,8 @@ class MinerUParser(RAGFlowPdfParser): img_path_str = str(out_path.resolve()) item["img_path"] = img_path_str - # Cache for crop() lookup: map line_tag to img_path - # 缓存两种格式的 key,确保无论 _transfer_to_sections 怎么生成 tag 都能匹配 - line_tag = self._line_tag(item) - self._img_path_cache[line_tag] = img_path_str - - # 同时缓存原始 bbox 格式 (不依赖 page_images 的归一化坐标) - raw_bbox = item.get("bbox", [0, 0, 0, 0]) - raw_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format( - page_idx + 1, float(raw_bbox[0]), float(raw_bbox[2]), float(raw_bbox[1]), float(raw_bbox[3]) - ) + # Cache for crop() lookup: use raw 0-1000 normalized tag for consistent matching + raw_tag = self._raw_line_tag(item) self._img_path_cache[raw_tag] = img_path_str generated += 1 except Exception as e: @@ -649,36 +648,30 @@ class MinerUParser(RAGFlowPdfParser): def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): sections = [] for output in outputs: - section = None - content_type = output.get("type", "") - - # 使用字符串匹配,兼容 MinerU API 返回的原始类型 - match content_type: - case "text" | MinerUContentType.TEXT: - section = output.get("text", "") - case "table" | MinerUContentType.TABLE: + match output["type"]: + case MinerUContentType.TEXT: + section = output["text"] + case MinerUContentType.TABLE: section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", [])) if not section.strip(): section = "FAILED TO PARSE TABLE" - case "image" | MinerUContentType.IMAGE: + case MinerUContentType.IMAGE: section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", [])) - case "equation" | MinerUContentType.EQUATION: - section = output.get("text", "") - case "code" | MinerUContentType.CODE: - section = output.get("code_body", "") + "\n".join(output.get("code_caption", [])) - case "list" | MinerUContentType.LIST: + case MinerUContentType.EQUATION: + section = output["text"] + case MinerUContentType.CODE: + section = output["code_body"] + "\n".join(output.get("code_caption", [])) + case MinerUContentType.LIST: section = "\n".join(output.get("list_items", [])) - case "header": - section = output.get("text", "") - 
case "discarded" | MinerUContentType.DISCARDED: + case MinerUContentType.DISCARDED: pass if section and parse_method == "manual": - sections.append((section, output["type"], self._line_tag(output))) + sections.append((section, output["type"], self._raw_line_tag(output))) elif section and parse_method == "paper": - sections.append((section + self._line_tag(output), output["type"])) - elif section: - sections.append((section, self._line_tag(output))) + sections.append((section + self._raw_line_tag(output), output["type"])) + else: + sections.append((section, self._raw_line_tag(output))) return sections def _transfer_to_tables(self, outputs: list[dict[str, Any]]): From 1c7bc4757916af4da94b4f9f1c7b55fc85c0d80d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Tue, 9 Dec 2025 23:32:27 +0800 Subject: [PATCH 4/9] fix(mineru): robust coordinate conversion in crop() fallback for 0-1000 tags - Implement coordinate conversion (normalized -> pixels) in crop() fallback loop - Ensures correct cropping from page_images when cache lookup fails - Works consistently with _raw_line_tag (0-1000 normalized) changes --- deepdoc/parser/mineru_parser.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 943cc8cff..dd3370ddb 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -392,6 +392,22 @@ class MinerUParser(RAGFlowPdfParser): return None, None return + # Convert 0-1000 normalized coordinates to pixels using page dimensions + # This ensures compatibility with GAP/padding logic and correct cropping + pixel_poss = [] + for pns, left, right, top, bottom in poss: + if not pns: continue + page_idx = pns[0] + if not (0 <= page_idx < page_count): continue + + W, H = self.page_images[page_idx].size + x0 = left * W / 1000.0 + x1 = right * W / 1000.0 + y0 = top * H / 1000.0 + y1 = bottom * H / 1000.0 + pixel_poss.append((pns, x0, x1, y0, y1)) + poss = pixel_poss + # 避免超长拼接图 - 只取首个位置 if len(poss) > 1: poss = [poss[0]] From 3bc3d82aa8b3de48c4160801431c9d4dcef26bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Wed, 10 Dec 2025 00:48:39 +0800 Subject: [PATCH 5/9] fix: Initialize imgs list in crop() fallback path - Critical bug fix: imgs list was not initialized before use (line 439) - Without this fix, NameError would occur when cache miss triggers fallback - Discovered during reliability audit of MinerU image generation fix --- deepdoc/parser/mineru_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index dd3370ddb..c57a5f98b 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -436,6 +436,7 @@ class MinerUParser(RAGFlowPdfParser): ) positions = [] + imgs = [] for ii, (pns, left, right, top, bottom) in enumerate(poss): right = left + max_width From 8a285d123027cc605891418a42f291fbdec4231e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Wed, 10 Dec 2025 21:19:49 +0800 Subject: [PATCH 6/9] feat(mineru): implement smart crop with page-width fallback and native image mixing - Changed fallback image generation to page-width strips (full horizontal, bbox vertical) - Implemented smart crop() with native+fallback mixing and deduplication - Added thresholds: max 10 images, total height <2000px - Established native_img_map for table/image/equation priority - Removed 120px padding logic that caused super-long stitched thumbnails This fixes the 
issue where chunk thumbnails were either missing or excessively long due to: 1. MinerU not providing images for pure text blocks 2. Official crop() adding 120px padding and stitching across pages 3. manual.py merging multiple sections into one chunk The new approach: - Priority 1: Use MinerU's native high-quality images (tables/equations) - Priority 2: Use page-width fallback strips (consistent width for stitching) - Priority 3: Use full page as last resort - Deduplicates identical bboxes during stitching - Limits output to reasonable dimensions for UX --- deepdoc/parser/mineru_parser.py | 331 ++++++++++++++++++-------------- 1 file changed, 183 insertions(+), 148 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index c57a5f98b..cb24d21a7 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -64,6 +64,7 @@ class MinerUParser(RAGFlowPdfParser): self.outlines = [] self.logger = logging.getLogger(self.__class__.__name__) self._img_path_cache = {} # line_tag -> img_path mapping for crop() lookup + self._native_img_map = {} # line_tag -> native mineru image (image/table/equation) def _extract_zip_no_root(self, zip_path, extract_to, root_dir): self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}") @@ -342,160 +343,171 @@ class MinerUParser(RAGFlowPdfParser): return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, x0, x1, y0, y1) def crop(self, text, ZM=1, need_position=False): - """Crop image for chunk. Prioritize the cached img_path from MinerU or fallback generation, then fall back to page crop.""" + """ + MinerU-specific smart crop: + 1. Mix native images (tables/figures) with fallback images (page-width strips) + 2. Deduplicate while stitching (each identical bbox is used only once) + 3. Threshold control (at most 10 images, total height < 2000px) + 4. Keep full resolution (no scaling) + """ poss = self.extract_positions(text) if not poss: if need_position: return None, None return - # Prefer the cached img_path (from MinerU or _generate_missing_images) - cache = getattr(self, "_img_path_cache", {}) - for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text): - # Try an exact or approximate match against the cache + # Step 1: collect the image behind every tag + images_to_stitch = [] + seen_tags = set() # used for deduplication + + for pos in poss: + # Build the tag used for lookup + pns, left, right, top, bottom = pos + if not pns: + continue + + page_num = pns[0] + 1 # convert to 1-based + tag = f"@@{page_num}\t{left:.1f}\t{right:.1f}\t{top:.1f}\t{bottom:.1f}##" + + # ✅ Dedupe: skip tags that were already handled + if tag in seen_tags: + self.logger.debug(f"[MinerU] Skipping duplicate tag: {tag}") + continue + seen_tags.add(tag) + + # Priority 1: look up MinerU's native image (table/figure/equation) + native_img_path = self._find_native_image_path(tag) + if native_img_path: + try: + img = Image.open(native_img_path) + images_to_stitch.append(("native", img, pos, tag)) + self.logger.debug(f"[MinerU] Using native image for tag: {tag}") + continue + except Exception as e: + self.logger.debug(f"[MinerU] Failed to load native image {native_img_path}: {e}") + + # Priority 2: look up the generated page-width fallback image (cache) + cache = getattr(self, "_img_path_cache", {}) if tag in cache: try: img = Image.open(cache[tag]) - if need_position: - # Extract position info from the first position - first_pos = poss[0] - pn = first_pos[0][0] if first_pos[0] else 0 - left, right, top, bottom = first_pos[1], first_pos[2], first_pos[3], first_pos[4] - positions = [(pn + getattr(self, "page_from", 0), int(left), int(right), int(top), int(bottom))] - return img, positions - return img - except Exception as e: - self.logger.debug(f"[MinerU] cached img_path load failed: {e}") - break # fallback to crop - - # Fallback: crop from page_images - if not getattr(self, "page_images", None): - self.logger.warning("[MinerU] crop called 
without page images; skipping image generation.") - if need_position: - return None, None - return - - page_count = len(self.page_images) - - filtered_poss = [] - for pns, left, right, top, bottom in poss: - if not pns: - continue - valid_pns = [p for p in pns if 0 <= p < page_count] - if not valid_pns: - continue - filtered_poss.append((valid_pns, left, right, top, bottom)) - - poss = filtered_poss - if not poss: - if need_position: - return None, None - return - - # Convert 0-1000 normalized coordinates to pixels using page dimensions - # This ensures compatibility with GAP/padding logic and correct cropping - pixel_poss = [] - for pns, left, right, top, bottom in poss: - if not pns: continue - page_idx = pns[0] - if not (0 <= page_idx < page_count): continue - - W, H = self.page_images[page_idx].size - x0 = left * W / 1000.0 - x1 = right * W / 1000.0 - y0 = top * H / 1000.0 - y1 = bottom * H / 1000.0 - pixel_poss.append((pns, x0, x1, y0, y1)) - poss = pixel_poss - - # 避免超长拼接图 - 只取首个位置 - if len(poss) > 1: - poss = [poss[0]] - - max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6) - GAP = 6 - pos = poss[0] - first_page_idx = pos[0][0] - poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0))) - pos = poss[-1] - last_page_idx = pos[0][-1] - if not (0 <= last_page_idx < page_count): - self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.") - if need_position: - return None, None - return - last_page_height = self.page_images[last_page_idx].size[1] - poss.append( - ( - [last_page_idx], - pos[1], - pos[2], - min(last_page_height, pos[4] + GAP), - min(last_page_height, pos[4] + 120), - ) - ) - - positions = [] - imgs = [] - for ii, (pns, left, right, top, bottom) in enumerate(poss): - right = left + max_width - - if bottom <= top: - bottom = top + 2 - - for pn in pns[1:]: - if 0 <= pn - 1 < page_count: - bottom += self.page_images[pn - 1].size[1] - else: - self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.") - - if not (0 <= pns[0] < page_count): - self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.") - continue - - img0 = self.page_images[pns[0]] - x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1])) - crop0 = img0.crop((x0, y0, x1, y1)) - imgs.append(crop0) - if 0 < ii < len(poss) - 1: - positions.append((pns[0] + self.page_from, x0, x1, y0, y1)) - - bottom -= img0.size[1] - for pn in pns[1:]: - if not (0 <= pn < page_count): - self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.") + images_to_stitch.append(("cached", img, pos, tag)) + self.logger.debug(f"[MinerU] Using cached fallback image for tag: {tag}") continue - page = self.page_images[pn] - x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1])) - cimgp = page.crop((x0, y0, x1, y1)) - imgs.append(cimgp) - if 0 < ii < len(poss) - 1: - positions.append((pn + self.page_from, x0, x1, y0, y1)) - bottom -= page.size[1] - - if not imgs: + except Exception as e: + self.logger.debug(f"[MinerU] Failed to load cached image: {e}") + + # 优先级3: 完整页兜底(如果page_images可用) + if hasattr(self, "page_images") and self.page_images: + page_idx = pns[0] + if 0 <= page_idx < len(self.page_images): + img = self.page_images[page_idx] + images_to_stitch.append(("fullpage", img, pos, 
tag)) + self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}") + + if not images_to_stitch: + self.logger.warning("[MinerU] No images found for chunk") if need_position: return None, None return - - height = 0 - for img in imgs: - height += img.size[1] + GAP - height = int(height) - width = int(np.max([i.size[0] for i in imgs])) - pic = Image.new("RGB", (width, height), (245, 245, 245)) - height = 0 - for ii, img in enumerate(imgs): - if ii == 0 or ii + 1 == len(imgs): - img = img.convert("RGBA") - overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) - overlay.putalpha(128) - img = Image.alpha_composite(img, overlay).convert("RGB") - pic.paste(img, (0, int(height))) - height += img.size[1] + GAP - + + # Step 2: smart stitching (with threshold control) + return self._smart_stitch_with_thresholds(images_to_stitch, need_position) + + def _find_native_image_path(self, tag): + """Look up the path of MinerU's native image (table/figure/equation)""" + # The tag → native_img_path mapping must be built in _read_output + native_map = getattr(self, "_native_img_map", {}) + return native_map.get(tag) + + def _smart_stitch_with_thresholds(self, images_with_metadata, need_position): + """ + Smart stitching with threshold control + + Thresholds: + - MAX_COUNT: at most 10 images + - MAX_HEIGHT: total height must not exceed 2000px + + Strategies: + - Too many images: sample uniformly (keep first and last) + - Too tall: truncate at 2000px + - Never scale images (keep full resolution) + """ + MAX_COUNT = 10 + MAX_HEIGHT = 2000 + GAP = 6 + + # 1. Count control: if there are more than 10 images, sample uniformly + if len(images_with_metadata) > MAX_COUNT: + self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {MAX_COUNT}") + images_with_metadata = self._sample_images_uniformly(images_with_metadata, MAX_COUNT) + + # 2. Height control: accumulate images until 2000px is reached + trimmed_images = [] + current_height = 0 + + for src, img, pos, tag in images_with_metadata: + if current_height + img.height > MAX_HEIGHT: + self.logger.info(f"[MinerU] Reached max height {MAX_HEIGHT}px at {len(trimmed_images)} images, stopping") + break + trimmed_images.append((src, img, pos, tag)) + current_height += img.height + GAP + + # Keep at least one image + if not trimmed_images and images_with_metadata: + trimmed_images = [images_with_metadata[0]] + + # 3. Vertical stitching (no scaling)
+ return self._stitch_images_vertically(trimmed_images, need_position, GAP) + + def _sample_images_uniformly(self, images, target_count): + """Uniform sampling: keep the first and last images, pick the middle evenly""" + if len(images) <= target_count: + return images + + sampled = [images[0]] # first image + step = len(images) / (target_count - 1) + for i in range(1, target_count - 1): + idx = int(i * step) + sampled.append(images[idx]) + sampled.append(images[-1]) # last image + return sampled + + def _stitch_images_vertically(self, images_with_metadata, need_position, gap): + """Stitch images vertically (no padding, no scaling)""" + if not images_with_metadata: + if need_position: + return None, None + return + + imgs = [img for _, img, _, _ in images_with_metadata] + positions_list = [pos for _, _, pos, _ in images_with_metadata] + + # Compute the canvas size + total_height = sum(img.height for img in imgs) + gap * (len(imgs) - 1) + max_width = max(img.width for img in imgs) + + # Create the canvas + pic = Image.new("RGB", (max_width, total_height), (245, 245, 245)) + + # Paste the images one by one (stacked vertically) + current_y = 0 + positions = [] + + for idx, (img, pos) in enumerate(zip(imgs, positions_list)): + pic.paste(img, (0, current_y)) + + # Extract position info + if pos and len(pos) >= 5: + pns, left, right, top, bottom = pos + if pns: + page_num = pns[0] + getattr(self, "page_from", 0) + positions.append((page_num, int(left), int(right), int(top), int(bottom))) + + current_y += img.height + gap + if need_position: - return pic, positions + return pic, positions if positions else [(0, 0, max_width, 0, total_height)] return pic @staticmethod @@ -526,6 +538,7 @@ class MinerUParser(RAGFlowPdfParser): ) def _generate_missing_images(self, outputs: list[dict[str, Any]], subdir: Path, file_stem: str): + """Generate fallback images as page-width strips (full page width horizontally, bbox range vertically)""" if not getattr(self, "page_images", None): return if not subdir: return @@ -550,19 +563,24 @@ class MinerUParser(RAGFlowPdfParser): x0, y0, x1, y1 = self._bbox_to_pixels(bbox, self.page_images[page_idx].size) - # clamp to page boundary + # Get the page size pw, ph = self.page_images[page_idx].size - x0 = max(0, min(x0, pw)) - y0 = max(0, min(y0, ph)) - x1 = max(0, min(x1, pw)) - y1 = max(0, min(y1, ph)) + + # ✅ Generate page-width strips instead: horizontal = full page width, vertical = bbox range + # x: 0 to page width + # y: bbox y0 to y1 (clamped to the page) + crop_x0 = 0 + crop_x1 = pw + crop_y0 = max(0, min(y0, ph)) + crop_y1 = max(0, min(y1, ph)) # guard invalid bbox - if x1 - x0 < 2 or y1 - y0 < 2: + if crop_y1 - crop_y0 < 2: continue try: - cropped = self.page_images[page_idx].crop((x0, y0, x1, y1)) + # Crop the page-width strip + cropped = self.page_images[page_idx].crop((crop_x0, crop_y0, crop_x1, crop_y1)) fname = f"{file_stem}_gen_{idx}.jpg" out_path = img_root / fname cropped.save(out_path, format="JPEG", quality=80) @@ -578,7 +596,7 @@ class MinerUParser(RAGFlowPdfParser): continue if generated: - self.logger.info(f"[MinerU] generated {generated} fallback images, cached {len(self._img_path_cache)} tags") + self.logger.info(f"[MinerU] generated {generated} page-width fallback images, cached {len(self._img_path_cache)} tags") def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: candidates = [] @@ -649,10 +667,24 @@ class MinerUParser(RAGFlowPdfParser): with open(json_file, "r", encoding="utf-8") as f: data = json.load(f) + # Build the tag → native img_path mapping (tables/figures/equations) + self._native_img_map = {} + for item in data: + # Resolve the paths to absolute form for key in ("img_path", "table_img_path", "equation_img_path"): if key in item and item[key]: item[key] = str((subdir / item[key]).resolve()) + + # Build the mapping: tag → native_img_path + try: + tag = self._raw_line_tag(item) + 
self._native_img_map[tag] = item[key] + self.logger.debug(f"[MinerU] Mapped native image: {tag} → {item[key]}") + except Exception as e: + self.logger.debug(f"[MinerU] Failed to map native image: {e}") + + break # 只需要第一个找到的图片路径 # MinerU(vlm-http-client) 不会为纯文本生成图片,这里兜底用本地页图裁剪生成,方便后续引用/MinIO 存图 try: @@ -712,6 +744,9 @@ class MinerUParser(RAGFlowPdfParser): temp_pdf = None created_tmp_dir = False + # per-task cache reset to avoid stale images across documents + self._img_path_cache = {} + self._native_img_map = {} # remove spaces, or mineru crash, and _read_output fail too file_path = Path(filepath) From 2d4750535f78a88e7aa2e3ad4e91d2ce69a29b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Wed, 10 Dec 2025 23:43:01 +0800 Subject: [PATCH 7/9] fix: MinerU crop tag matching and manual.py bbox parsing - Fixed crop() to extract original tags from text instead of reconstructing - Added MinerU-specific logic in manual.py to handle space/tab separated tags - Removed redundant import re that caused UnboundLocalError - Ensures correct bbox coordinates for native images, fallback images, and page selection --- deepdoc/parser/mineru_parser.py | 22 +++++---- rag/app/manual.py | 85 +++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index cb24d21a7..7d2fa0110 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -350,25 +350,31 @@ class MinerUParser(RAGFlowPdfParser): 3. 阈值控制(最多10张,总高<2000px) 4. 保持高清(不缩放) """ + # 从text中提取原始tags(保持1-based页码) + original_tags = re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text) poss = self.extract_positions(text) - if not poss: + + if not poss or not original_tags: if need_position: return None, None return + # 确保tags和poss数量一致 + if len(original_tags) != len(poss): + self.logger.warning(f"[MinerU] Tag count ({len(original_tags)}) != position count ({len(poss)}), using first {min(len(original_tags), len(poss))} items") + min_len = min(len(original_tags), len(poss)) + original_tags = original_tags[:min_len] + poss = poss[:min_len] + # Step 1: 收集所有tag对应的图片 images_to_stitch = [] seen_tags = set() # 用于去重 - for pos in poss: - # 构造tag用于查找 + for tag, pos in zip(original_tags, poss): pns, left, right, top, bottom = pos if not pns: continue - page_num = pns[0] + 1 # 转为1-based - tag = f"@@{page_num}\t{left:.1f}\t{right:.1f}\t{top:.1f}\t{bottom:.1f}##" - # ✅ 去重:如果tag已处理过,跳过 if tag in seen_tags: self.logger.debug(f"[MinerU] Skipping duplicate tag: {tag}") @@ -399,11 +405,11 @@ class MinerUParser(RAGFlowPdfParser): # 优先级3: 完整页兜底(如果page_images可用) if hasattr(self, "page_images") and self.page_images: - page_idx = pns[0] + page_idx = pns[0] # pns[0]是0-based的页索引 if 0 <= page_idx < len(self.page_images): img = self.page_images[page_idx] images_to_stitch.append(("fullpage", img, pos, tag)) - self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}") + self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}, page_idx={page_idx}") if not images_to_stitch: self.logger.warning("[MinerU] No images found for chunk") diff --git a/rag/app/manual.py b/rag/app/manual.py index 54a05f192..7c049f059 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -20,7 +20,7 @@ import re from common.constants import ParserType from io import BytesIO -from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context +from rag.nlp import 
rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from common.token_utils import num_tokens_from_string from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper @@ -155,7 +155,7 @@ class Docx(DocxParser): sum_question = '\n'.join(question_stack) if sum_question: ti_list.append((f'{sum_question}\n{last_answer}', last_image)) - + tbls = [] for tb in self.doc.tables: html= "" @@ -213,40 +213,61 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang = lang, callback = callback, pdf_cls = Pdf, - layout_recognizer = layout_recognizer, - parse_method = "manual", **kwargs ) - def _normalize_section(section): - # pad section to length 3: (txt, sec_id, poss) - if len(section) == 1: - section = (section[0], "", []) - elif len(section) == 2: - section = (section[0], "", section[1]) - elif len(section) != 3: - raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})") - - txt, layoutno, poss = section - if isinstance(poss, str): - poss = pdf_parser.extract_positions(poss) - if poss: - first = poss[0] # tuple: ([pn], x1, x2, y1, y2) - pn = first[0] - if isinstance(pn, list) and pn: - pn = pn[0] # [pn] -> pn - poss[0] = (pn, *first[1:]) - - return (txt, layoutno, poss) - - sections = [_normalize_section(sec) for sec in sections] - if not sections and not tbls: return [] if name in ["tcadp", "docling", "mineru"]: parser_config["chunk_token_num"] = 0 + # Normalize sections to (text, layout, positions) even if parser only returns (text, tag) + def _extract_positions_from_tag(tag: str): + import re + poss = [] + for t in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", tag or ""): + pn, left, right, top, bottom = t.strip("#").strip("@").split("\t") + poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom))) + return poss + + normalized_sections = [] + # 🎯 MinerU专用逻辑:直接使用已有的positions,不重新解析tag + is_mineru = name == "mineru" + + for item in sections: + if len(item) >= 3: + # 已经是(text, layout, positions)格式 + normalized_sections.append(item) + continue + + txt, tag = item[0], item[1] if len(item) > 1 else "" + + # ✅ MinerU: 如果tag包含完整的bbox信息,直接解析并使用 + if is_mineru and tag: + poss = _extract_positions_from_tag(tag) + if not poss: + # 如果解析失败,尝试从tag字符串中手动提取(处理格式问题) + try: + # 更宽松的正则:允许空格或tab分隔 + matches = re.findall(r"@@([0-9-]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)##", tag) + if matches: + for match in matches: + pn, left, right, top, bottom = match + poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom))) + except Exception as e: + pass + else: + # 非MinerU:正常解析tag + poss = _extract_positions_from_tag(tag) + + # 如果还是没有positions,使用默认值 + if not poss: + poss = [(max(from_page, 0) + 1, 0.0, 0.0, 0.0, 0.0)] + + normalized_sections.append((txt, "", poss)) + sections = normalized_sections + callback(0.8, "Finish parsing.") if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03: @@ -309,10 +330,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) res = tokenize_table(tbls, doc, eng) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) - table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) - image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) - if table_ctx or image_ctx: - attach_media_context(res, table_ctx, 
image_ctx) return res elif re.search(r"\.docx?$", filename, re.IGNORECASE): @@ -328,14 +345,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, d["doc_type_kwd"] = "image" tokenize(d, text, eng) res.append(d) - table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) - image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) - if table_ctx or image_ctx: - attach_media_context(res, table_ctx, image_ctx) return res else: raise NotImplementedError("file type not supported yet(pdf and docx supported)") - + if __name__ == "__main__": import sys From 02a4b79f905007c10fad37c0a76db87d28c53a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Thu, 11 Dec 2025 21:12:00 +0800 Subject: [PATCH 8/9] chore: increase image stitching thresholds to 20/4000px - MAX_COUNT: 10 -> 20 images - MAX_HEIGHT: 2000px -> 4000px - Allows more complete chunk thumbnails for long documents --- deepdoc/parser/mineru_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 7d2fa0110..3a4e8bf10 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -431,24 +431,24 @@ class MinerUParser(RAGFlowPdfParser): 智能拼接:应用阈值控制 Thresholds: - - MAX_COUNT: 最多10张图 - - MAX_HEIGHT: 总高度不超过2000px + - MAX_COUNT: 最多20张图 + - MAX_HEIGHT: 总高度不超过4000px Strategies: - 数量过多: 均匀采样(保留首尾) - - 高度过高: 截断到2000px + - 高度过高: 截断到4000px - 不缩放图片(保持高清) """ - MAX_COUNT = 10 - MAX_HEIGHT = 2000 + MAX_COUNT = 20 + MAX_HEIGHT = 4000 GAP = 6 - # 1. 数量控制:如果超过10张,均匀采样 + # 1. 数量控制:如果超过20张,均匀采样 if len(images_with_metadata) > MAX_COUNT: self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {MAX_COUNT}") images_with_metadata = self._sample_images_uniformly(images_with_metadata, MAX_COUNT) - # 2. 高度控制:累加到2000px为止 + # 2. 高度控制:累加到4000px为止 trimmed_images = [] current_height = 0 From 58792dfe994eb6aba4d2b6618d693a59d624dcbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Fri, 12 Dec 2025 15:19:13 +0800 Subject: [PATCH 9/9] feat: enhance MinerU crop() with 3 major improvements 1. Sampling optimization: reduce from 20 to 12 images when exceeding threshold 2. Native image width normalization: re-crop page-width strips for consistent stitching - Preserves original native images for MinIO storage - Uses normalized versions only for thumbnail stitching 3. 
Low fallback threshold: stitch full-page screenshots when there are at most 3 fallback images - Deduplicates and limits to max 3 pages - Provides better context for sparse thumbnails --- deepdoc/parser/mineru_parser.py | 130 ++++++++++++++++++++++++++++++-- 1 file changed, 124 insertions(+), 6 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 3a4e8bf10..9da2e0917 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -385,9 +385,10 @@ class MinerUParser(RAGFlowPdfParser): native_img_path = self._find_native_image_path(tag) if native_img_path: try: - img = Image.open(native_img_path) + # ✅ Use the page-width normalized version for stitching (the native image is still stored to MinIO) + img = self._normalize_native_image_width(native_img_path, tag) images_to_stitch.append(("native", img, pos, tag)) - self.logger.debug(f"[MinerU] Using native image for tag: {tag}") + self.logger.debug(f"[MinerU] Using normalized native image for tag: {tag}") continue except Exception as e: self.logger.debug(f"[MinerU] Failed to load native image {native_img_path}: {e}") @@ -417,6 +418,12 @@ class MinerUParser(RAGFlowPdfParser): return None, None return + # ✅ When there are ≤3 fallback images, stitch the full pages instead (deduplicated) + fallback_count = sum(1 for src, _, _, _ in images_to_stitch if src == "cached") + if fallback_count <= 3 and fallback_count > 0: + self.logger.debug(f"[MinerU] Fallback count = {fallback_count}, using full page strategy") + return self._handle_low_fallback_count(poss, need_position) + # Step 2: smart stitching (with threshold control) return self._smart_stitch_with_thresholds(images_to_stitch, need_position) @@ -426,6 +433,116 @@ class MinerUParser(RAGFlowPdfParser): return native_map.get(tag) + def _normalize_native_image_width(self, native_img_path, tag): + """ + Normalize a native image to a page-width version (used only for stitching) + + How: re-crop a page-width strip from the page according to the bbox in the tag + - horizontal: 0 to page width + - vertical: the bbox y range + + Args: + native_img_path: path of the MinerU native image (still stored to MinIO) + tag: tag string containing page_idx and bbox info + + Returns: + the page-width normalized Image, or the native image if normalization fails + """ + try: + # Parse the tag to get page_idx and bbox + import re + match = re.match(r"@@(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)##", tag) + if not match: + # Parsing failed, return the native image + return Image.open(native_img_path) + + page_num, x0_str, x1_str, y0_str, y1_str = match.groups() + page_idx = int(page_num) - 1 # convert to 0-based + bbox = [float(x0_str), float(y0_str), float(x1_str), float(y1_str)] + + # Check that page_images is available + if not hasattr(self, "page_images") or not self.page_images: + return Image.open(native_img_path) + + if page_idx < 0 or page_idx >= len(self.page_images): + return Image.open(native_img_path) + + # Get the page image + page_img = self.page_images[page_idx] + page_width, page_height = page_img.size + + # Convert the bbox to pixels + px0, py0, px1, py1 = self._bbox_to_pixels(bbox, (page_width, page_height)) + + # Crop a page-width strip (full width horizontally, bbox range vertically) + crop_y0 = max(0, min(py0, page_height)) + crop_y1 = max(crop_y0 + 1, min(py1, page_height)) + + if crop_y1 - crop_y0 < 2: + # Invalid bbox, return the native image + return Image.open(native_img_path) + + page_width_img = page_img.crop((0, crop_y0, page_width, crop_y1)) + self.logger.debug(f"[MinerU] Normalized native image to page-width: {page_width}x{crop_y1-crop_y0}px") + return page_width_img + + except Exception as e: + self.logger.debug(f"[MinerU] Failed to normalize native image, using original: {e}") + return Image.open(native_img_path) + + def _handle_low_fallback_count(self, poss, need_position): + """ + When there are ≤3 fallback images, stitch screenshots of the pages involved (deduplicated) + + Strategy: + - collect all page indices involved + - deduplicate and cap at 3 pages + - stitch those full pages + + Args: + poss: list of positions + need_position: whether positions should be returned + + Returns: + the stitched full-page screenshot, or a single-page screenshot + """ + if not 
hasattr(self, "page_images") or not self.page_images: + if need_position: + return None, None + return + + # Collect all page indices involved (0-based), deduplicated and sorted + page_indices = sorted(set( + pns[0] for pns, _, _, _, _ in poss + if pns and 0 <= pns[0] < len(self.page_images) + )) + + # Cap at 3 pages + page_indices = page_indices[:3] + + if not page_indices: + if need_position: + return None, None + return + + self.logger.info(f"[MinerU] Low fallback count, stitching {len(page_indices)} page(s): {[idx+1 for idx in page_indices]}") + + # Single page: return it directly + if len(page_indices) == 1: + page_img = self.page_images[page_indices[0]] + if need_position: + return page_img, [[page_indices[0], 0, page_img.width, 0, page_img.height]] + return page_img + + # Multiple pages: stitch vertically + page_imgs_with_meta = [ + ("fullpage", self.page_images[idx], ([idx], 0, 0, 0, 0), f"@@{idx+1}\t0\t0\t0\t0##") + for idx in page_indices + ] + + return self._stitch_images_vertically(page_imgs_with_meta, need_position, gap=10) + + def _smart_stitch_with_thresholds(self, images_with_metadata, need_position): """ Smart stitching with threshold control @@ -435,18 +552,19 @@ - MAX_HEIGHT: total height must not exceed 4000px Strategies: - - Too many images: sample uniformly (keep first and last) + - Too many images: sample uniformly down to 12 (keep first and last) - Too tall: truncate at 4000px - Never scale images (keep full resolution) """ MAX_COUNT = 20 + SAMPLE_TARGET = 12 # sampling target count MAX_HEIGHT = 4000 GAP = 6 - # 1. Count control: if there are more than 20 images, sample uniformly + # 1. Count control: if there are more than 20 images, sample uniformly down to 12 if len(images_with_metadata) > MAX_COUNT: self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {SAMPLE_TARGET}") images_with_metadata = self._sample_images_uniformly(images_with_metadata, SAMPLE_TARGET) # 2. Height control: accumulate images until 4000px is reached trimmed_images = []