fix(mineru): use consistent 0-1000 normalized coords for line_tag cache matching

This commit is contained in:
少卿 2025-12-09 22:17:15 +08:00
parent eb004b6254
commit 8049cb9275

View file

@ -334,6 +334,13 @@ class MinerUParser(RAGFlowPdfParser):
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott) return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
def _raw_line_tag(self, bx):
    """Build a line tag from a block's raw bbox, for use as a cache key.

    The tag uses the block's ``bbox`` values as-is (the original author
    documents them as 0-1000 normalized coordinates) instead of rescaling
    them, so the same block always yields the same tag.

    Args:
        bx: Block dict; ``page_idx`` (0-based) and ``bbox`` given as
            ``[x0, y0, x1, y1]`` are read from it.

    Returns:
        A tag string ``"@@<page>\\t<x0>\\t<x1>\\t<y0>\\t<y1>##"`` with a
        1-based page number and one decimal place per coordinate.
    """
    # A missing or None page index is treated as the first page.
    page_no = (bx.get("page_idx") or 0) + 1
    try:
        # float() keeps numeric strings working, as the inline code this
        # helper replaced did; unpacking enforces exactly four values.
        x0, y0, x1, y1 = (float(v) for v in bx.get("bbox"))
    except (TypeError, ValueError):
        # Missing, None, wrong-length or non-numeric bbox: use the same
        # all-zero box that a missing key has always produced.
        x0 = y0 = x1 = y1 = 0.0
    # Field order is x0, x1, top, bottom (not bbox order).
    return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(page_no, x0, x1, y0, y1)
def crop(self, text, ZM=1, need_position=False): def crop(self, text, ZM=1, need_position=False):
"""Crop image for chunk. Prioritize cached img_path from MinerU/兜底生成, fallback to page crop.""" """Crop image for chunk. Prioritize cached img_path from MinerU/兜底生成, fallback to page crop."""
poss = self.extract_positions(text) poss = self.extract_positions(text)
@ -545,16 +552,8 @@ class MinerUParser(RAGFlowPdfParser):
img_path_str = str(out_path.resolve()) img_path_str = str(out_path.resolve())
item["img_path"] = img_path_str item["img_path"] = img_path_str
# Cache for crop() lookup: map line_tag to img_path # Cache for crop() lookup: use raw 0-1000 normalized tag for consistent matching
# 缓存两种格式的 key,确保无论 _transfer_to_sections 怎么生成 tag 都能匹配 raw_tag = self._raw_line_tag(item)
line_tag = self._line_tag(item)
self._img_path_cache[line_tag] = img_path_str
# 同时缓存原始 bbox 格式 (不依赖 page_images 的归一化坐标)
raw_bbox = item.get("bbox", [0, 0, 0, 0])
raw_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(
page_idx + 1, float(raw_bbox[0]), float(raw_bbox[2]), float(raw_bbox[1]), float(raw_bbox[3])
)
self._img_path_cache[raw_tag] = img_path_str self._img_path_cache[raw_tag] = img_path_str
generated += 1 generated += 1
except Exception as e: except Exception as e:
@ -649,36 +648,30 @@ class MinerUParser(RAGFlowPdfParser):
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
sections = [] sections = []
for output in outputs: for output in outputs:
section = None match output["type"]:
content_type = output.get("type", "") case MinerUContentType.TEXT:
section = output["text"]
# 使用字符串匹配,兼容 MinerU API 返回的原始类型 case MinerUContentType.TABLE:
match content_type:
case "text" | MinerUContentType.TEXT:
section = output.get("text", "")
case "table" | MinerUContentType.TABLE:
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", [])) section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", []))
if not section.strip(): if not section.strip():
section = "FAILED TO PARSE TABLE" section = "FAILED TO PARSE TABLE"
case "image" | MinerUContentType.IMAGE: case MinerUContentType.IMAGE:
section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", [])) section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", []))
case "equation" | MinerUContentType.EQUATION: case MinerUContentType.EQUATION:
section = output.get("text", "") section = output["text"]
case "code" | MinerUContentType.CODE: case MinerUContentType.CODE:
section = output.get("code_body", "") + "\n".join(output.get("code_caption", [])) section = output["code_body"] + "\n".join(output.get("code_caption", []))
case "list" | MinerUContentType.LIST: case MinerUContentType.LIST:
section = "\n".join(output.get("list_items", [])) section = "\n".join(output.get("list_items", []))
case "header": case MinerUContentType.DISCARDED:
section = output.get("text", "")
case "discarded" | MinerUContentType.DISCARDED:
pass pass
if section and parse_method == "manual": if section and parse_method == "manual":
sections.append((section, output["type"], self._line_tag(output))) sections.append((section, output["type"], self._raw_line_tag(output)))
elif section and parse_method == "paper": elif section and parse_method == "paper":
sections.append((section + self._line_tag(output), output["type"])) sections.append((section + self._raw_line_tag(output), output["type"]))
elif section: else:
sections.append((section, self._line_tag(output))) sections.append((section, self._raw_line_tag(output)))
return sections return sections
def _transfer_to_tables(self, outputs: list[dict[str, Any]]): def _transfer_to_tables(self, outputs: list[dict[str, Any]]):