Fix: Generate missing images for MinerU text blocks using local crop
This commit is contained in:
parent
65a5a56d95
commit
b443d34faf
1 changed files with 70 additions and 0 deletions
|
|
@ -51,6 +51,8 @@ class MinerUContentType(StrEnum):
|
||||||
CODE = "code"
|
CODE = "code"
|
||||||
LIST = "list"
|
LIST = "list"
|
||||||
DISCARDED = "discarded"
|
DISCARDED = "discarded"
|
||||||
|
HEADER = "header"
|
||||||
|
PAGE_NUMBER = "page_number"
|
||||||
|
|
||||||
|
|
||||||
class MinerUParser(RAGFlowPdfParser):
|
class MinerUParser(RAGFlowPdfParser):
|
||||||
|
|
@ -459,6 +461,67 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||||||
return poss
|
return poss
|
||||||
|
|
||||||
|
def _bbox_to_pixels(self, bbox, page_size):
|
||||||
|
x0, y0, x1, y1 = bbox
|
||||||
|
pw, ph = page_size
|
||||||
|
maxv = max(bbox)
|
||||||
|
# 经验:MinerU bbox 常为 0~1000 归一化;否则认为已是像素
|
||||||
|
if maxv <= 1.5:
|
||||||
|
sx, sy = pw, ph
|
||||||
|
elif maxv <= 1200:
|
||||||
|
sx, sy = pw / 1000.0, ph / 1000.0
|
||||||
|
else:
|
||||||
|
sx, sy = 1.0, 1.0
|
||||||
|
return (
|
||||||
|
int(x0 * sx),
|
||||||
|
int(y0 * sy),
|
||||||
|
int(x1 * sx),
|
||||||
|
int(y1 * sy),
|
||||||
|
)
|
||||||
|
|
||||||
|
def _generate_missing_images(self, outputs: list[dict[str, Any]], subdir: Path, file_stem: str):
|
||||||
|
if not getattr(self, "page_images", None):
|
||||||
|
return
|
||||||
|
if not subdir:
|
||||||
|
return
|
||||||
|
img_root = subdir / "generated_images"
|
||||||
|
img_root.mkdir(parents=True, exist_ok=True)
|
||||||
|
text_types = {MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.CODE, MinerUContentType.HEADER}
|
||||||
|
generated = 0
|
||||||
|
for idx, item in enumerate(outputs):
|
||||||
|
if item.get("type") not in text_types:
|
||||||
|
continue
|
||||||
|
if item.get("img_path"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
bbox = item.get("bbox")
|
||||||
|
if not bbox or len(bbox) != 4:
|
||||||
|
continue
|
||||||
|
|
||||||
|
page_idx = int(item.get("page_idx", 0))
|
||||||
|
if page_idx < 0 or page_idx >= len(self.page_images):
|
||||||
|
continue
|
||||||
|
|
||||||
|
x0, y0, x1, y1 = self._bbox_to_pixels(bbox, self.page_images[page_idx].size)
|
||||||
|
|
||||||
|
# guard invalid bbox
|
||||||
|
if x1 - x0 < 2 or y1 - y0 < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
crop = self.page_images[page_idx].crop((x0, y0, x1, y1))
|
||||||
|
fname = f"{file_stem}_gen_{idx}.jpg"
|
||||||
|
out_path = img_root / fname
|
||||||
|
crop.save(out_path, format="JPEG", quality=80)
|
||||||
|
item["img_path"] = str(out_path.resolve())
|
||||||
|
generated += 1
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.debug(f"[MinerU] skip image gen idx={idx} page={page_idx}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if generated:
|
||||||
|
self.logger.info(f"[MinerU] generated {generated} fallback images for text blocks")
|
||||||
|
|
||||||
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
|
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
|
||||||
candidates = []
|
candidates = []
|
||||||
seen = set()
|
seen = set()
|
||||||
|
|
@ -532,6 +595,13 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
for key in ("img_path", "table_img_path", "equation_img_path"):
|
for key in ("img_path", "table_img_path", "equation_img_path"):
|
||||||
if key in item and item[key]:
|
if key in item and item[key]:
|
||||||
item[key] = str((subdir / item[key]).resolve())
|
item[key] = str((subdir / item[key]).resolve())
|
||||||
|
|
||||||
|
# MinerU(vlm-http-client) 不会为纯文本生成图片,这里兜底用本地页图裁剪生成,方便后续引用/MinIO 存图
|
||||||
|
try:
|
||||||
|
self._generate_missing_images(data, subdir, file_stem)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"[MinerU] generate missing images failed: {e}")
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
|
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue