From b443d34faf4c1619fd51794bd998b21b26b669bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Tue, 9 Dec 2025 19:53:56 +0800 Subject: [PATCH 1/9] Fix: Generate missing images for MinerU text blocks using local crop --- deepdoc/parser/mineru_parser.py | 70 +++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 7e3919bbd..be099bc9e 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -51,6 +51,8 @@ class MinerUContentType(StrEnum): CODE = "code" LIST = "list" DISCARDED = "discarded" + HEADER = "header" + PAGE_NUMBER = "page_number" class MinerUParser(RAGFlowPdfParser): @@ -459,6 +461,67 @@ class MinerUParser(RAGFlowPdfParser): poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) return poss + def _bbox_to_pixels(self, bbox, page_size): + x0, y0, x1, y1 = bbox + pw, ph = page_size + maxv = max(bbox) + # 经验:MinerU bbox 常为 0~1000 归一化;否则认为已是像素 + if maxv <= 1.5: + sx, sy = pw, ph + elif maxv <= 1200: + sx, sy = pw / 1000.0, ph / 1000.0 + else: + sx, sy = 1.0, 1.0 + return ( + int(x0 * sx), + int(y0 * sy), + int(x1 * sx), + int(y1 * sy), + ) + + def _generate_missing_images(self, outputs: list[dict[str, Any]], subdir: Path, file_stem: str): + if not getattr(self, "page_images", None): + return + if not subdir: + return + img_root = subdir / "generated_images" + img_root.mkdir(parents=True, exist_ok=True) + text_types = {MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.CODE, MinerUContentType.HEADER} + generated = 0 + for idx, item in enumerate(outputs): + if item.get("type") not in text_types: + continue + if item.get("img_path"): + continue + + bbox = item.get("bbox") + if not bbox or len(bbox) != 4: + continue + + page_idx = int(item.get("page_idx", 0)) + if page_idx < 0 or page_idx >= len(self.page_images): + continue + + x0, y0, x1, y1 = self._bbox_to_pixels(bbox, self.page_images[page_idx].size) + + # guard invalid bbox + if x1 - x0 < 2 or y1 - y0 < 2: + continue + + try: + crop = self.page_images[page_idx].crop((x0, y0, x1, y1)) + fname = f"{file_stem}_gen_{idx}.jpg" + out_path = img_root / fname + crop.save(out_path, format="JPEG", quality=80) + item["img_path"] = str(out_path.resolve()) + generated += 1 + except Exception as e: + self.logger.debug(f"[MinerU] skip image gen idx={idx} page={page_idx}: {e}") + continue + + if generated: + self.logger.info(f"[MinerU] generated {generated} fallback images for text blocks") + def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: candidates = [] seen = set() @@ -532,6 +595,13 @@ class MinerUParser(RAGFlowPdfParser): for key in ("img_path", "table_img_path", "equation_img_path"): if key in item and item[key]: item[key] = str((subdir / item[key]).resolve()) + + # MinerU(vlm-http-client) 不会为纯文本生成图片,这里兜底用本地页图裁剪生成,方便后续引用/MinIO 存图 + try: + self._generate_missing_images(data, subdir, file_stem) + except Exception as e: + self.logger.warning(f"[MinerU] generate missing images failed: {e}") + return data def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): From eb004b62542d7f13df329c2e4ace56ac1a92b165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Tue, 9 Dec 2025 20:28:47 +0800 Subject: [PATCH 2/9] fix(mineru): use cached img_path in crop() to consume generated_images - Add _img_path_cache dict to cache line_tag -> img_path mapping - 
Populate cache in _generate_missing_images for fallback text block images - Refactor crop() to check cache first, return cached image directly - Fallback to single-position cropping to avoid super-tall merged images - Fix text_types to use both string literals and enums for compatibility - Add bbox clamping to prevent cropping errors --- deepdoc/parser/mineru_parser.py | 90 +++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 21 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index be099bc9e..eaeedbb20 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -63,6 +63,7 @@ class MinerUParser(RAGFlowPdfParser): self.using_api = False self.outlines = [] self.logger = logging.getLogger(self.__class__.__name__) + self._img_path_cache = {} # line_tag -> img_path mapping for crop() lookup def _extract_zip_no_root(self, zip_path, extract_to, root_dir): self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}") @@ -334,13 +335,33 @@ class MinerUParser(RAGFlowPdfParser): return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott) def crop(self, text, ZM=1, need_position=False): - imgs = [] + """Crop image for chunk. Prioritize the cached img_path from MinerU or fallback generation, then fall back to page crop.""" poss = self.extract_positions(text) if not poss: if need_position: return None, None return + + # Prefer the cached img_path (from MinerU or _generate_missing_images) + cache = getattr(self, "_img_path_cache", {}) + for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text): + # Try an exact or approximate match against the cache + if tag in cache: + try: + img = Image.open(cache[tag]) + if need_position: + # Extract position info from the first position + first_pos = poss[0] + pn = first_pos[0][0] if first_pos[0] else 0 + left, right, top, bottom = first_pos[1], first_pos[2], first_pos[3], first_pos[4] + positions = [(pn + getattr(self, "page_from", 0), int(left), int(right), int(top), int(bottom))] + return img, positions + return img + except Exception as e: + self.logger.debug(f"[MinerU] cached img_path load failed: {e}") + break # fallback to crop + # Fallback: crop from page_images if not getattr(self, "page_images", None): self.logger.warning("[MinerU] crop called without page images; skipping image generation.") if need_position: @@ -352,21 +373,22 @@ class MinerUParser(RAGFlowPdfParser): filtered_poss = [] for pns, left, right, top, bottom in poss: if not pns: - self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.") continue valid_pns = [p for p in pns if 0 <= p < page_count] if not valid_pns: - self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.") continue filtered_poss.append((valid_pns, left, right, top, bottom)) poss = filtered_poss if not poss: - self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.") if need_position: return None, None return + # Avoid super-tall stitched images - keep only the first position + if len(poss) > 1: + poss = [poss[0]] + max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6) GAP = 6 pos = poss[0] @@ -486,7 +508,7 @@ class MinerUParser(RAGFlowPdfParser): return img_root = subdir / "generated_images" img_root.mkdir(parents=True, exist_ok=True) - text_types = {MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.CODE, MinerUContentType.HEADER} + text_types = {"text", "list", "header", "code", MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.EQUATION, 
MinerUContentType.CODE} generated = 0 for idx, item in enumerate(outputs): if item.get("type") not in text_types: @@ -504,23 +526,43 @@ class MinerUParser(RAGFlowPdfParser): x0, y0, x1, y1 = self._bbox_to_pixels(bbox, self.page_images[page_idx].size) + # clamp to page boundary + pw, ph = self.page_images[page_idx].size + x0 = max(0, min(x0, pw)) + y0 = max(0, min(y0, ph)) + x1 = max(0, min(x1, pw)) + y1 = max(0, min(y1, ph)) + # guard invalid bbox if x1 - x0 < 2 or y1 - y0 < 2: continue try: - crop = self.page_images[page_idx].crop((x0, y0, x1, y1)) + cropped = self.page_images[page_idx].crop((x0, y0, x1, y1)) fname = f"{file_stem}_gen_{idx}.jpg" out_path = img_root / fname - crop.save(out_path, format="JPEG", quality=80) - item["img_path"] = str(out_path.resolve()) + cropped.save(out_path, format="JPEG", quality=80) + img_path_str = str(out_path.resolve()) + item["img_path"] = img_path_str + + # Cache for crop() lookup: map line_tag to img_path + # 缓存两种格式的 key,确保无论 _transfer_to_sections 怎么生成 tag 都能匹配 + line_tag = self._line_tag(item) + self._img_path_cache[line_tag] = img_path_str + + # 同时缓存原始 bbox 格式 (不依赖 page_images 的归一化坐标) + raw_bbox = item.get("bbox", [0, 0, 0, 0]) + raw_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format( + page_idx + 1, float(raw_bbox[0]), float(raw_bbox[2]), float(raw_bbox[1]), float(raw_bbox[3]) + ) + self._img_path_cache[raw_tag] = img_path_str generated += 1 except Exception as e: self.logger.debug(f"[MinerU] skip image gen idx={idx} page={page_idx}: {e}") continue if generated: - self.logger.info(f"[MinerU] generated {generated} fallback images for text blocks") + self.logger.info(f"[MinerU] generated {generated} fallback images, cached {len(self._img_path_cache)} tags") def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: candidates = [] @@ -607,29 +649,35 @@ class MinerUParser(RAGFlowPdfParser): def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): sections = [] for output in outputs: - match output["type"]: - case MinerUContentType.TEXT: - section = output["text"] - case MinerUContentType.TABLE: + section = None + content_type = output.get("type", "") + + # 使用字符串匹配,兼容 MinerU API 返回的原始类型 + match content_type: + case "text" | MinerUContentType.TEXT: + section = output.get("text", "") + case "table" | MinerUContentType.TABLE: section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", [])) if not section.strip(): section = "FAILED TO PARSE TABLE" - case MinerUContentType.IMAGE: + case "image" | MinerUContentType.IMAGE: section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", [])) - case MinerUContentType.EQUATION: - section = output["text"] - case MinerUContentType.CODE: - section = output["code_body"] + "\n".join(output.get("code_caption", [])) - case MinerUContentType.LIST: + case "equation" | MinerUContentType.EQUATION: + section = output.get("text", "") + case "code" | MinerUContentType.CODE: + section = output.get("code_body", "") + "\n".join(output.get("code_caption", [])) + case "list" | MinerUContentType.LIST: section = "\n".join(output.get("list_items", [])) - case MinerUContentType.DISCARDED: + case "header": + section = output.get("text", "") + case "discarded" | MinerUContentType.DISCARDED: pass if section and parse_method == "manual": sections.append((section, output["type"], self._line_tag(output))) elif section and parse_method 
== "paper": sections.append((section + self._line_tag(output), output["type"])) - else: + elif section: sections.append((section, self._line_tag(output))) return sections From 8049cb9275f46f7e69b8dc9563f32020b589b5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Tue, 9 Dec 2025 22:17:15 +0800 Subject: [PATCH 3/9] fix(mineru): use consistent 0-1000 normalized coords for line_tag cache matching --- deepdoc/parser/mineru_parser.py | 55 ++++++++++++++------------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index eaeedbb20..943cc8cff 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -334,6 +334,13 @@ class MinerUParser(RAGFlowPdfParser): return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott) + def _raw_line_tag(self, bx): + """生成原始归一化坐标(0-1000)的line_tag,用于缓存key匹配""" + pn = bx.get("page_idx", 0) + 1 + bbox = bx.get("bbox", [0, 0, 0, 0]) + x0, y0, x1, y1 = bbox + return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, x0, x1, y0, y1) + def crop(self, text, ZM=1, need_position=False): """Crop image for chunk. Prioritize cached img_path from MinerU/兜底生成, fallback to page crop.""" poss = self.extract_positions(text) @@ -545,16 +552,8 @@ class MinerUParser(RAGFlowPdfParser): img_path_str = str(out_path.resolve()) item["img_path"] = img_path_str - # Cache for crop() lookup: map line_tag to img_path - # 缓存两种格式的 key,确保无论 _transfer_to_sections 怎么生成 tag 都能匹配 - line_tag = self._line_tag(item) - self._img_path_cache[line_tag] = img_path_str - - # 同时缓存原始 bbox 格式 (不依赖 page_images 的归一化坐标) - raw_bbox = item.get("bbox", [0, 0, 0, 0]) - raw_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format( - page_idx + 1, float(raw_bbox[0]), float(raw_bbox[2]), float(raw_bbox[1]), float(raw_bbox[3]) - ) + # Cache for crop() lookup: use raw 0-1000 normalized tag for consistent matching + raw_tag = self._raw_line_tag(item) self._img_path_cache[raw_tag] = img_path_str generated += 1 except Exception as e: @@ -649,36 +648,30 @@ class MinerUParser(RAGFlowPdfParser): def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): sections = [] for output in outputs: - section = None - content_type = output.get("type", "") - - # 使用字符串匹配,兼容 MinerU API 返回的原始类型 - match content_type: - case "text" | MinerUContentType.TEXT: - section = output.get("text", "") - case "table" | MinerUContentType.TABLE: + match output["type"]: + case MinerUContentType.TEXT: + section = output["text"] + case MinerUContentType.TABLE: section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", [])) if not section.strip(): section = "FAILED TO PARSE TABLE" - case "image" | MinerUContentType.IMAGE: + case MinerUContentType.IMAGE: section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", [])) - case "equation" | MinerUContentType.EQUATION: - section = output.get("text", "") - case "code" | MinerUContentType.CODE: - section = output.get("code_body", "") + "\n".join(output.get("code_caption", [])) - case "list" | MinerUContentType.LIST: + case MinerUContentType.EQUATION: + section = output["text"] + case MinerUContentType.CODE: + section = output["code_body"] + "\n".join(output.get("code_caption", [])) + case MinerUContentType.LIST: section = "\n".join(output.get("list_items", [])) - case "header": - section = output.get("text", "") - 
case "discarded" | MinerUContentType.DISCARDED: + case MinerUContentType.DISCARDED: pass if section and parse_method == "manual": - sections.append((section, output["type"], self._line_tag(output))) + sections.append((section, output["type"], self._raw_line_tag(output))) elif section and parse_method == "paper": - sections.append((section + self._line_tag(output), output["type"])) - elif section: - sections.append((section, self._line_tag(output))) + sections.append((section + self._raw_line_tag(output), output["type"])) + else: + sections.append((section, self._raw_line_tag(output))) return sections def _transfer_to_tables(self, outputs: list[dict[str, Any]]): From 1c7bc4757916af4da94b4f9f1c7b55fc85c0d80d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Tue, 9 Dec 2025 23:32:27 +0800 Subject: [PATCH 4/9] fix(mineru): robust coordinate conversion in crop() fallback for 0-1000 tags - Implement coordinate conversion (normalized -> pixels) in crop() fallback loop - Ensures correct cropping from page_images when cache lookup fails - Works consistently with _raw_line_tag (0-1000 normalized) changes --- deepdoc/parser/mineru_parser.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 943cc8cff..dd3370ddb 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -392,6 +392,22 @@ class MinerUParser(RAGFlowPdfParser): return None, None return + # Convert 0-1000 normalized coordinates to pixels using page dimensions + # This ensures compatibility with GAP/padding logic and correct cropping + pixel_poss = [] + for pns, left, right, top, bottom in poss: + if not pns: continue + page_idx = pns[0] + if not (0 <= page_idx < page_count): continue + + W, H = self.page_images[page_idx].size + x0 = left * W / 1000.0 + x1 = right * W / 1000.0 + y0 = top * H / 1000.0 + y1 = bottom * H / 1000.0 + pixel_poss.append((pns, x0, x1, y0, y1)) + poss = pixel_poss + # 避免超长拼接图 - 只取首个位置 if len(poss) > 1: poss = [poss[0]] From 3bc3d82aa8b3de48c4160801431c9d4dcef26bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Wed, 10 Dec 2025 00:48:39 +0800 Subject: [PATCH 5/9] fix: Initialize imgs list in crop() fallback path - Critical bug fix: imgs list was not initialized before use (line 439) - Without this fix, NameError would occur when cache miss triggers fallback - Discovered during reliability audit of MinerU image generation fix --- deepdoc/parser/mineru_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index dd3370ddb..c57a5f98b 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -436,6 +436,7 @@ class MinerUParser(RAGFlowPdfParser): ) positions = [] + imgs = [] for ii, (pns, left, right, top, bottom) in enumerate(poss): right = left + max_width From 8a285d123027cc605891418a42f291fbdec4231e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Wed, 10 Dec 2025 21:19:49 +0800 Subject: [PATCH 6/9] feat(mineru): implement smart crop with page-width fallback and native image mixing - Changed fallback image generation to page-width strips (full horizontal, bbox vertical) - Implemented smart crop() with native+fallback mixing and deduplication - Added thresholds: max 10 images, total height <2000px - Established native_img_map for table/image/equation priority - Removed 120px padding logic that caused super-long stitched thumbnails This fixes the 
issue where chunk thumbnails were either missing or excessively long due to: 1. MinerU not providing images for pure text blocks 2. Official crop() adding 120px padding and stitching across pages 3. manual.py merging multiple sections into one chunk The new approach: - Priority 1: Use MinerU's native high-quality images (tables/equations) - Priority 2: Use page-width fallback strips (consistent width for stitching) - Priority 3: Use full page as last resort - Deduplicates identical bboxes during stitching - Limits output to reasonable dimensions for UX --- deepdoc/parser/mineru_parser.py | 331 ++++++++++++++++++-------------- 1 file changed, 183 insertions(+), 148 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index c57a5f98b..cb24d21a7 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -64,6 +64,7 @@ class MinerUParser(RAGFlowPdfParser): self.outlines = [] self.logger = logging.getLogger(self.__class__.__name__) self._img_path_cache = {} # line_tag -> img_path mapping for crop() lookup + self._native_img_map = {} # line_tag -> native mineru image (image/table/equation) def _extract_zip_no_root(self, zip_path, extract_to, root_dir): self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}") @@ -342,160 +343,171 @@ class MinerUParser(RAGFlowPdfParser): return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, x0, x1, y0, y1) def crop(self, text, ZM=1, need_position=False): - """Crop image for chunk. Prioritize the cached img_path from MinerU or fallback generation, then fall back to page crop.""" + """ + MinerU-specific smart crop: + 1. Mix native images (tables/figures) with fallback images (page-width strips) + 2. Deduplicate while stitching (each identical bbox is used only once) + 3. Threshold control (at most 10 images, total height < 2000px) + 4. Keep full resolution (no scaling) + """ poss = self.extract_positions(text) if not poss: if need_position: return None, None return - # Prefer the cached img_path (from MinerU or _generate_missing_images) - cache = getattr(self, "_img_path_cache", {}) - for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text): - # Try an exact or approximate match against the cache + # Step 1: collect the image behind every tag + images_to_stitch = [] + seen_tags = set() # used for deduplication + + for pos in poss: + # Build the tag used for lookup + pns, left, right, top, bottom = pos + if not pns: + continue + + page_num = pns[0] + 1 # convert to 1-based + tag = f"@@{page_num}\t{left:.1f}\t{right:.1f}\t{top:.1f}\t{bottom:.1f}##" + + # ✅ Dedupe: skip tags that were already handled + if tag in seen_tags: + self.logger.debug(f"[MinerU] Skipping duplicate tag: {tag}") + continue + seen_tags.add(tag) + + # Priority 1: look up MinerU's native image (table/figure/equation) + native_img_path = self._find_native_image_path(tag) + if native_img_path: + try: + img = Image.open(native_img_path) + images_to_stitch.append(("native", img, pos, tag)) + self.logger.debug(f"[MinerU] Using native image for tag: {tag}") + continue + except Exception as e: + self.logger.debug(f"[MinerU] Failed to load native image {native_img_path}: {e}") + + # Priority 2: look up the generated page-width fallback image (cache) + cache = getattr(self, "_img_path_cache", {}) if tag in cache: try: img = Image.open(cache[tag]) - if need_position: - # Extract position info from the first position - first_pos = poss[0] - pn = first_pos[0][0] if first_pos[0] else 0 - left, right, top, bottom = first_pos[1], first_pos[2], first_pos[3], first_pos[4] - positions = [(pn + getattr(self, "page_from", 0), int(left), int(right), int(top), int(bottom))] - return img, positions - return img - except Exception as e: - self.logger.debug(f"[MinerU] cached img_path load failed: {e}") - break # fallback to crop - - # Fallback: crop from page_images - if not getattr(self, "page_images", None): - self.logger.warning("[MinerU] crop called 
without page images; skipping image generation.") - if need_position: - return None, None - return - - page_count = len(self.page_images) - - filtered_poss = [] - for pns, left, right, top, bottom in poss: - if not pns: - continue - valid_pns = [p for p in pns if 0 <= p < page_count] - if not valid_pns: - continue - filtered_poss.append((valid_pns, left, right, top, bottom)) - - poss = filtered_poss - if not poss: - if need_position: - return None, None - return - - # Convert 0-1000 normalized coordinates to pixels using page dimensions - # This ensures compatibility with GAP/padding logic and correct cropping - pixel_poss = [] - for pns, left, right, top, bottom in poss: - if not pns: continue - page_idx = pns[0] - if not (0 <= page_idx < page_count): continue - - W, H = self.page_images[page_idx].size - x0 = left * W / 1000.0 - x1 = right * W / 1000.0 - y0 = top * H / 1000.0 - y1 = bottom * H / 1000.0 - pixel_poss.append((pns, x0, x1, y0, y1)) - poss = pixel_poss - - # 避免超长拼接图 - 只取首个位置 - if len(poss) > 1: - poss = [poss[0]] - - max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6) - GAP = 6 - pos = poss[0] - first_page_idx = pos[0][0] - poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0))) - pos = poss[-1] - last_page_idx = pos[0][-1] - if not (0 <= last_page_idx < page_count): - self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.") - if need_position: - return None, None - return - last_page_height = self.page_images[last_page_idx].size[1] - poss.append( - ( - [last_page_idx], - pos[1], - pos[2], - min(last_page_height, pos[4] + GAP), - min(last_page_height, pos[4] + 120), - ) - ) - - positions = [] - imgs = [] - for ii, (pns, left, right, top, bottom) in enumerate(poss): - right = left + max_width - - if bottom <= top: - bottom = top + 2 - - for pn in pns[1:]: - if 0 <= pn - 1 < page_count: - bottom += self.page_images[pn - 1].size[1] - else: - self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.") - - if not (0 <= pns[0] < page_count): - self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.") - continue - - img0 = self.page_images[pns[0]] - x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1])) - crop0 = img0.crop((x0, y0, x1, y1)) - imgs.append(crop0) - if 0 < ii < len(poss) - 1: - positions.append((pns[0] + self.page_from, x0, x1, y0, y1)) - - bottom -= img0.size[1] - for pn in pns[1:]: - if not (0 <= pn < page_count): - self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.") + images_to_stitch.append(("cached", img, pos, tag)) + self.logger.debug(f"[MinerU] Using cached fallback image for tag: {tag}") continue - page = self.page_images[pn] - x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1])) - cimgp = page.crop((x0, y0, x1, y1)) - imgs.append(cimgp) - if 0 < ii < len(poss) - 1: - positions.append((pn + self.page_from, x0, x1, y0, y1)) - bottom -= page.size[1] - - if not imgs: + except Exception as e: + self.logger.debug(f"[MinerU] Failed to load cached image: {e}") + + # 优先级3: 完整页兜底(如果page_images可用) + if hasattr(self, "page_images") and self.page_images: + page_idx = pns[0] + if 0 <= page_idx < len(self.page_images): + img = self.page_images[page_idx] + images_to_stitch.append(("fullpage", img, pos, 
tag)) + self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}") + + if not images_to_stitch: + self.logger.warning("[MinerU] No images found for chunk") if need_position: return None, None return - - height = 0 - for img in imgs: - height += img.size[1] + GAP - height = int(height) - width = int(np.max([i.size[0] for i in imgs])) - pic = Image.new("RGB", (width, height), (245, 245, 245)) - height = 0 - for ii, img in enumerate(imgs): - if ii == 0 or ii + 1 == len(imgs): - img = img.convert("RGBA") - overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) - overlay.putalpha(128) - img = Image.alpha_composite(img, overlay).convert("RGB") - pic.paste(img, (0, int(height))) - height += img.size[1] + GAP - + + # Step 2: smart stitching (with threshold control) + return self._smart_stitch_with_thresholds(images_to_stitch, need_position) + + def _find_native_image_path(self, tag): + """Look up the path of MinerU's native image (table/figure/equation)""" + # The tag → native_img_path mapping must be built in _read_output + native_map = getattr(self, "_native_img_map", {}) + return native_map.get(tag) + + def _smart_stitch_with_thresholds(self, images_with_metadata, need_position): + """ + Smart stitching with threshold control + + Thresholds: + - MAX_COUNT: at most 10 images + - MAX_HEIGHT: total height must not exceed 2000px + + Strategies: + - Too many images: sample uniformly (keep first and last) + - Too tall: truncate at 2000px + - Never scale images (keep full resolution) + """ + MAX_COUNT = 10 + MAX_HEIGHT = 2000 + GAP = 6 + + # 1. Count control: if there are more than 10 images, sample uniformly + if len(images_with_metadata) > MAX_COUNT: + self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {MAX_COUNT}") + images_with_metadata = self._sample_images_uniformly(images_with_metadata, MAX_COUNT) + + # 2. Height control: accumulate images until 2000px is reached + trimmed_images = [] + current_height = 0 + + for src, img, pos, tag in images_with_metadata: + if current_height + img.height > MAX_HEIGHT: + self.logger.info(f"[MinerU] Reached max height {MAX_HEIGHT}px at {len(trimmed_images)} images, stopping") + break + trimmed_images.append((src, img, pos, tag)) + current_height += img.height + GAP + + # Keep at least one image + if not trimmed_images and images_with_metadata: + trimmed_images = [images_with_metadata[0]] + + # 3. Vertical stitching (no scaling)
+ return self._stitch_images_vertically(trimmed_images, need_position, GAP) + + def _sample_images_uniformly(self, images, target_count): + """Uniform sampling: keep the first and last images, pick the middle evenly""" + if len(images) <= target_count: + return images + + sampled = [images[0]] # first image + step = len(images) / (target_count - 1) + for i in range(1, target_count - 1): + idx = int(i * step) + sampled.append(images[idx]) + sampled.append(images[-1]) # last image + return sampled + + def _stitch_images_vertically(self, images_with_metadata, need_position, gap): + """Stitch images vertically (no padding, no scaling)""" + if not images_with_metadata: + if need_position: + return None, None + return + + imgs = [img for _, img, _, _ in images_with_metadata] + positions_list = [pos for _, _, pos, _ in images_with_metadata] + + # Compute the canvas size + total_height = sum(img.height for img in imgs) + gap * (len(imgs) - 1) + max_width = max(img.width for img in imgs) + + # Create the canvas + pic = Image.new("RGB", (max_width, total_height), (245, 245, 245)) + + # Paste the images one by one (stacked vertically) + current_y = 0 + positions = [] + + for idx, (img, pos) in enumerate(zip(imgs, positions_list)): + pic.paste(img, (0, current_y)) + + # Extract position info + if pos and len(pos) >= 5: + pns, left, right, top, bottom = pos + if pns: + page_num = pns[0] + getattr(self, "page_from", 0) + positions.append((page_num, int(left), int(right), int(top), int(bottom))) + + current_y += img.height + gap + if need_position: - return pic, positions + return pic, positions if positions else [(0, 0, max_width, 0, total_height)] return pic @staticmethod @@ -526,6 +538,7 @@ class MinerUParser(RAGFlowPdfParser): ) def _generate_missing_images(self, outputs: list[dict[str, Any]], subdir: Path, file_stem: str): + """Generate fallback images as page-width strips (full page width horizontally, bbox range vertically)""" if not getattr(self, "page_images", None): return if not subdir: return @@ -550,19 +563,24 @@ class MinerUParser(RAGFlowPdfParser): x0, y0, x1, y1 = self._bbox_to_pixels(bbox, self.page_images[page_idx].size) - # clamp to page boundary + # Get the page size pw, ph = self.page_images[page_idx].size - x0 = max(0, min(x0, pw)) - y0 = max(0, min(y0, ph)) - x1 = max(0, min(x1, pw)) - y1 = max(0, min(y1, ph)) + + # ✅ Generate page-width strips instead: horizontal = full page width, vertical = bbox range + # x: 0 to page width + # y: bbox y0 to y1 (clamped to the page) + crop_x0 = 0 + crop_x1 = pw + crop_y0 = max(0, min(y0, ph)) + crop_y1 = max(0, min(y1, ph)) # guard invalid bbox - if x1 - x0 < 2 or y1 - y0 < 2: + if crop_y1 - crop_y0 < 2: continue try: - cropped = self.page_images[page_idx].crop((x0, y0, x1, y1)) + # Crop the page-width strip + cropped = self.page_images[page_idx].crop((crop_x0, crop_y0, crop_x1, crop_y1)) fname = f"{file_stem}_gen_{idx}.jpg" out_path = img_root / fname cropped.save(out_path, format="JPEG", quality=80) @@ -578,7 +596,7 @@ class MinerUParser(RAGFlowPdfParser): continue if generated: - self.logger.info(f"[MinerU] generated {generated} fallback images, cached {len(self._img_path_cache)} tags") + self.logger.info(f"[MinerU] generated {generated} page-width fallback images, cached {len(self._img_path_cache)} tags") def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: candidates = [] @@ -649,10 +667,24 @@ class MinerUParser(RAGFlowPdfParser): with open(json_file, "r", encoding="utf-8") as f: data = json.load(f) + # Build the tag → native img_path mapping (tables/figures/equations) + self._native_img_map = {} + for item in data: + # Resolve the paths to absolute form for key in ("img_path", "table_img_path", "equation_img_path"): if key in item and item[key]: item[key] = str((subdir / item[key]).resolve()) + + # Build the mapping: tag → native_img_path + try: + tag = self._raw_line_tag(item) + 
self._native_img_map[tag] = item[key] + self.logger.debug(f"[MinerU] Mapped native image: {tag} → {item[key]}") + except Exception as e: + self.logger.debug(f"[MinerU] Failed to map native image: {e}") + + break # 只需要第一个找到的图片路径 # MinerU(vlm-http-client) 不会为纯文本生成图片,这里兜底用本地页图裁剪生成,方便后续引用/MinIO 存图 try: @@ -712,6 +744,9 @@ class MinerUParser(RAGFlowPdfParser): temp_pdf = None created_tmp_dir = False + # per-task cache reset to avoid stale images across documents + self._img_path_cache = {} + self._native_img_map = {} # remove spaces, or mineru crash, and _read_output fail too file_path = Path(filepath) From 2d4750535f78a88e7aa2e3ad4e91d2ce69a29b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Wed, 10 Dec 2025 23:43:01 +0800 Subject: [PATCH 7/9] fix: MinerU crop tag matching and manual.py bbox parsing - Fixed crop() to extract original tags from text instead of reconstructing - Added MinerU-specific logic in manual.py to handle space/tab separated tags - Removed redundant import re that caused UnboundLocalError - Ensures correct bbox coordinates for native images, fallback images, and page selection --- deepdoc/parser/mineru_parser.py | 22 +++++---- rag/app/manual.py | 85 +++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index cb24d21a7..7d2fa0110 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -350,25 +350,31 @@ class MinerUParser(RAGFlowPdfParser): 3. 阈值控制(最多10张,总高<2000px) 4. 保持高清(不缩放) """ + # 从text中提取原始tags(保持1-based页码) + original_tags = re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text) poss = self.extract_positions(text) - if not poss: + + if not poss or not original_tags: if need_position: return None, None return + # 确保tags和poss数量一致 + if len(original_tags) != len(poss): + self.logger.warning(f"[MinerU] Tag count ({len(original_tags)}) != position count ({len(poss)}), using first {min(len(original_tags), len(poss))} items") + min_len = min(len(original_tags), len(poss)) + original_tags = original_tags[:min_len] + poss = poss[:min_len] + # Step 1: 收集所有tag对应的图片 images_to_stitch = [] seen_tags = set() # 用于去重 - for pos in poss: - # 构造tag用于查找 + for tag, pos in zip(original_tags, poss): pns, left, right, top, bottom = pos if not pns: continue - page_num = pns[0] + 1 # 转为1-based - tag = f"@@{page_num}\t{left:.1f}\t{right:.1f}\t{top:.1f}\t{bottom:.1f}##" - # ✅ 去重:如果tag已处理过,跳过 if tag in seen_tags: self.logger.debug(f"[MinerU] Skipping duplicate tag: {tag}") @@ -399,11 +405,11 @@ class MinerUParser(RAGFlowPdfParser): # 优先级3: 完整页兜底(如果page_images可用) if hasattr(self, "page_images") and self.page_images: - page_idx = pns[0] + page_idx = pns[0] # pns[0]是0-based的页索引 if 0 <= page_idx < len(self.page_images): img = self.page_images[page_idx] images_to_stitch.append(("fullpage", img, pos, tag)) - self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}") + self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}, page_idx={page_idx}") if not images_to_stitch: self.logger.warning("[MinerU] No images found for chunk") diff --git a/rag/app/manual.py b/rag/app/manual.py index 54a05f192..7c049f059 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -20,7 +20,7 @@ import re from common.constants import ParserType from io import BytesIO -from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context +from rag.nlp import 
rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from common.token_utils import num_tokens_from_string from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper @@ -155,7 +155,7 @@ class Docx(DocxParser): sum_question = '\n'.join(question_stack) if sum_question: ti_list.append((f'{sum_question}\n{last_answer}', last_image)) - + tbls = [] for tb in self.doc.tables: html= "" @@ -213,40 +213,61 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang = lang, callback = callback, pdf_cls = Pdf, - layout_recognizer = layout_recognizer, - parse_method = "manual", **kwargs ) - def _normalize_section(section): - # pad section to length 3: (txt, sec_id, poss) - if len(section) == 1: - section = (section[0], "", []) - elif len(section) == 2: - section = (section[0], "", section[1]) - elif len(section) != 3: - raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})") - - txt, layoutno, poss = section - if isinstance(poss, str): - poss = pdf_parser.extract_positions(poss) - if poss: - first = poss[0] # tuple: ([pn], x1, x2, y1, y2) - pn = first[0] - if isinstance(pn, list) and pn: - pn = pn[0] # [pn] -> pn - poss[0] = (pn, *first[1:]) - - return (txt, layoutno, poss) - - sections = [_normalize_section(sec) for sec in sections] - if not sections and not tbls: return [] if name in ["tcadp", "docling", "mineru"]: parser_config["chunk_token_num"] = 0 + # Normalize sections to (text, layout, positions) even if parser only returns (text, tag) + def _extract_positions_from_tag(tag: str): + import re + poss = [] + for t in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", tag or ""): + pn, left, right, top, bottom = t.strip("#").strip("@").split("\t") + poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom))) + return poss + + normalized_sections = [] + # 🎯 MinerU专用逻辑:直接使用已有的positions,不重新解析tag + is_mineru = name == "mineru" + + for item in sections: + if len(item) >= 3: + # 已经是(text, layout, positions)格式 + normalized_sections.append(item) + continue + + txt, tag = item[0], item[1] if len(item) > 1 else "" + + # ✅ MinerU: 如果tag包含完整的bbox信息,直接解析并使用 + if is_mineru and tag: + poss = _extract_positions_from_tag(tag) + if not poss: + # 如果解析失败,尝试从tag字符串中手动提取(处理格式问题) + try: + # 更宽松的正则:允许空格或tab分隔 + matches = re.findall(r"@@([0-9-]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)##", tag) + if matches: + for match in matches: + pn, left, right, top, bottom = match + poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom))) + except Exception as e: + pass + else: + # 非MinerU:正常解析tag + poss = _extract_positions_from_tag(tag) + + # 如果还是没有positions,使用默认值 + if not poss: + poss = [(max(from_page, 0) + 1, 0.0, 0.0, 0.0, 0.0)] + + normalized_sections.append((txt, "", poss)) + sections = normalized_sections + callback(0.8, "Finish parsing.") if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03: @@ -309,10 +330,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) res = tokenize_table(tbls, doc, eng) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) - table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) - image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) - if table_ctx or image_ctx: - attach_media_context(res, table_ctx, 
image_ctx) return res elif re.search(r"\.docx?$", filename, re.IGNORECASE): @@ -328,14 +345,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, d["doc_type_kwd"] = "image" tokenize(d, text, eng) res.append(d) - table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) - image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) - if table_ctx or image_ctx: - attach_media_context(res, table_ctx, image_ctx) return res else: raise NotImplementedError("file type not supported yet(pdf and docx supported)") - + if __name__ == "__main__": import sys From 02a4b79f905007c10fad37c0a76db87d28c53a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Thu, 11 Dec 2025 21:12:00 +0800 Subject: [PATCH 8/9] chore: increase image stitching thresholds to 20/4000px - MAX_COUNT: 10 -> 20 images - MAX_HEIGHT: 2000px -> 4000px - Allows more complete chunk thumbnails for long documents --- deepdoc/parser/mineru_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 7d2fa0110..3a4e8bf10 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -431,24 +431,24 @@ class MinerUParser(RAGFlowPdfParser): 智能拼接:应用阈值控制 Thresholds: - - MAX_COUNT: 最多10张图 - - MAX_HEIGHT: 总高度不超过2000px + - MAX_COUNT: 最多20张图 + - MAX_HEIGHT: 总高度不超过4000px Strategies: - 数量过多: 均匀采样(保留首尾) - - 高度过高: 截断到2000px + - 高度过高: 截断到4000px - 不缩放图片(保持高清) """ - MAX_COUNT = 10 - MAX_HEIGHT = 2000 + MAX_COUNT = 20 + MAX_HEIGHT = 4000 GAP = 6 - # 1. 数量控制:如果超过10张,均匀采样 + # 1. 数量控制:如果超过20张,均匀采样 if len(images_with_metadata) > MAX_COUNT: self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {MAX_COUNT}") images_with_metadata = self._sample_images_uniformly(images_with_metadata, MAX_COUNT) - # 2. 高度控制:累加到2000px为止 + # 2. 高度控制:累加到4000px为止 trimmed_images = [] current_height = 0 From 58792dfe994eb6aba4d2b6618d693a59d624dcbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= Date: Fri, 12 Dec 2025 15:19:13 +0800 Subject: [PATCH 9/9] feat: enhance MinerU crop() with 3 major improvements 1. Sampling optimization: reduce from 20 to 12 images when exceeding threshold 2. Native image width normalization: re-crop page-width strips for consistent stitching - Preserves original native images for MinIO storage - Uses normalized versions only for thumbnail stitching 3. 
Low fallback threshold: stitch full-page screenshots when there are at most 3 fallback images - Deduplicates and limits to max 3 pages - Provides better context for sparse thumbnails --- deepdoc/parser/mineru_parser.py | 130 ++++++++++++++++++++++++++++++-- 1 file changed, 124 insertions(+), 6 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 3a4e8bf10..9da2e0917 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -385,9 +385,10 @@ class MinerUParser(RAGFlowPdfParser): native_img_path = self._find_native_image_path(tag) if native_img_path: try: - img = Image.open(native_img_path) + # ✅ Use the page-width normalized version for stitching (the native image is still stored to MinIO) + img = self._normalize_native_image_width(native_img_path, tag) images_to_stitch.append(("native", img, pos, tag)) - self.logger.debug(f"[MinerU] Using native image for tag: {tag}") + self.logger.debug(f"[MinerU] Using normalized native image for tag: {tag}") continue except Exception as e: self.logger.debug(f"[MinerU] Failed to load native image {native_img_path}: {e}") @@ -417,6 +418,12 @@ class MinerUParser(RAGFlowPdfParser): return None, None return + # ✅ When there are ≤3 fallback images, stitch the full pages instead (deduplicated) + fallback_count = sum(1 for src, _, _, _ in images_to_stitch if src == "cached") + if fallback_count <= 3 and fallback_count > 0: + self.logger.debug(f"[MinerU] Fallback count = {fallback_count}, using full page strategy") + return self._handle_low_fallback_count(poss, need_position) + # Step 2: smart stitching (with threshold control) return self._smart_stitch_with_thresholds(images_to_stitch, need_position) @@ -426,6 +433,116 @@ class MinerUParser(RAGFlowPdfParser): return native_map.get(tag) + def _normalize_native_image_width(self, native_img_path, tag): + """ + Normalize a native image to a page-width version (used only for stitching) + + How: re-crop a page-width strip from the page according to the bbox in the tag + - horizontal: 0 to page width + - vertical: the bbox y range + + Args: + native_img_path: path of the MinerU native image (still stored to MinIO) + tag: tag string containing page_idx and bbox info + + Returns: + the page-width normalized Image, or the native image if normalization fails + """ + try: + # Parse the tag to get page_idx and bbox + import re + match = re.match(r"@@(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)##", tag) + if not match: + # Parsing failed, return the native image + return Image.open(native_img_path) + + page_num, x0_str, x1_str, y0_str, y1_str = match.groups() + page_idx = int(page_num) - 1 # convert to 0-based + bbox = [float(x0_str), float(y0_str), float(x1_str), float(y1_str)] + + # Check that page_images is available + if not hasattr(self, "page_images") or not self.page_images: + return Image.open(native_img_path) + + if page_idx < 0 or page_idx >= len(self.page_images): + return Image.open(native_img_path) + + # Get the page image + page_img = self.page_images[page_idx] + page_width, page_height = page_img.size + + # Convert the bbox to pixels + px0, py0, px1, py1 = self._bbox_to_pixels(bbox, (page_width, page_height)) + + # Crop a page-width strip (full width horizontally, bbox range vertically) + crop_y0 = max(0, min(py0, page_height)) + crop_y1 = max(crop_y0 + 1, min(py1, page_height)) + + if crop_y1 - crop_y0 < 2: + # Invalid bbox, return the native image + return Image.open(native_img_path) + + page_width_img = page_img.crop((0, crop_y0, page_width, crop_y1)) + self.logger.debug(f"[MinerU] Normalized native image to page-width: {page_width}x{crop_y1-crop_y0}px") + return page_width_img + + except Exception as e: + self.logger.debug(f"[MinerU] Failed to normalize native image, using original: {e}") + return Image.open(native_img_path) + + def _handle_low_fallback_count(self, poss, need_position): + """ + When there are ≤3 fallback images, stitch screenshots of the pages involved (deduplicated) + + Strategy: + - collect all page indices involved + - deduplicate and cap at 3 pages + - stitch those full pages + + Args: + poss: list of positions + need_position: whether positions should be returned + + Returns: + the stitched full-page screenshot, or a single-page screenshot + """ + if not 
hasattr(self, "page_images") or not self.page_images: + if need_position: + return None, None + return + + # Collect all page indices involved (0-based), deduplicated and sorted + page_indices = sorted(set( + pns[0] for pns, _, _, _, _ in poss + if pns and 0 <= pns[0] < len(self.page_images) + )) + + # Cap at 3 pages + page_indices = page_indices[:3] + + if not page_indices: + if need_position: + return None, None + return + + self.logger.info(f"[MinerU] Low fallback count, stitching {len(page_indices)} page(s): {[idx+1 for idx in page_indices]}") + + # Single page: return it directly + if len(page_indices) == 1: + page_img = self.page_images[page_indices[0]] + if need_position: + return page_img, [[page_indices[0], 0, page_img.width, 0, page_img.height]] + return page_img + + # Multiple pages: stitch vertically + page_imgs_with_meta = [ + ("fullpage", self.page_images[idx], ([idx], 0, 0, 0, 0), f"@@{idx+1}\t0\t0\t0\t0##") + for idx in page_indices + ] + + return self._stitch_images_vertically(page_imgs_with_meta, need_position, gap=10) + + def _smart_stitch_with_thresholds(self, images_with_metadata, need_position): """ Smart stitching with threshold control @@ -435,18 +552,19 @@ - MAX_HEIGHT: total height must not exceed 4000px Strategies: - - Too many images: sample uniformly (keep first and last) + - Too many images: sample uniformly down to 12 (keep first and last) - Too tall: truncate at 4000px - Never scale images (keep full resolution) """ MAX_COUNT = 20 + SAMPLE_TARGET = 12 # sampling target count MAX_HEIGHT = 4000 GAP = 6 - # 1. Count control: if there are more than 20 images, sample uniformly + # 1. Count control: if there are more than 20 images, sample uniformly down to 12 if len(images_with_metadata) > MAX_COUNT: self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {SAMPLE_TARGET}") images_with_metadata = self._sample_images_uniformly(images_with_metadata, SAMPLE_TARGET) # 2. Height control: accumulate images until 4000px is reached trimmed_images = []