Merge 58792dfe99 into 81eb03d230
This commit is contained in:
commit
073c94a34c
2 changed files with 443 additions and 143 deletions
|
|
@ -51,6 +51,8 @@ class MinerUContentType(StrEnum):
|
||||||
CODE = "code"
|
CODE = "code"
|
||||||
LIST = "list"
|
LIST = "list"
|
||||||
DISCARDED = "discarded"
|
DISCARDED = "discarded"
|
||||||
|
HEADER = "header"
|
||||||
|
PAGE_NUMBER = "page_number"
|
||||||
|
|
||||||
|
|
||||||
class MinerUParser(RAGFlowPdfParser):
|
class MinerUParser(RAGFlowPdfParser):
|
||||||
|
|
@ -61,6 +63,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
self.using_api = False
|
self.using_api = False
|
||||||
self.outlines = []
|
self.outlines = []
|
||||||
self.logger = logging.getLogger(self.__class__.__name__)
|
self.logger = logging.getLogger(self.__class__.__name__)
|
||||||
|
self._img_path_cache = {} # line_tag -> img_path mapping for crop() lookup
|
||||||
|
self._native_img_map = {} # line_tag -> native mineru image (image/table/equation)
|
||||||
|
|
||||||
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
||||||
self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
|
self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
|
||||||
|
|
@ -333,123 +337,303 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
|
|
||||||
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
|
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
|
||||||
|
|
||||||
|
def _raw_line_tag(self, bx):
|
||||||
|
"""生成原始归一化坐标(0-1000)的line_tag,用于缓存key匹配"""
|
||||||
|
pn = bx.get("page_idx", 0) + 1
|
||||||
|
bbox = bx.get("bbox", [0, 0, 0, 0])
|
||||||
|
x0, y0, x1, y1 = bbox
|
||||||
|
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, x0, x1, y0, y1)
|
||||||
|
|
||||||
def crop(self, text, ZM=1, need_position=False):
|
def crop(self, text, ZM=1, need_position=False):
|
||||||
imgs = []
|
"""
|
||||||
|
MinerU专用智能crop:
|
||||||
|
1. 混合使用原生图(表格/图片)+ 兜底图(页宽条带)
|
||||||
|
2. 拼接时去重(相同bbox的图只用一次)
|
||||||
|
3. 阈值控制(最多10张,总高<2000px)
|
||||||
|
4. 保持高清(不缩放)
|
||||||
|
"""
|
||||||
|
# 从text中提取原始tags(保持1-based页码)
|
||||||
|
original_tags = re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text)
|
||||||
poss = self.extract_positions(text)
|
poss = self.extract_positions(text)
|
||||||
if not poss:
|
|
||||||
|
if not poss or not original_tags:
|
||||||
if need_position:
|
if need_position:
|
||||||
return None, None
|
return None, None
|
||||||
return
|
return
|
||||||
|
|
||||||
if not getattr(self, "page_images", None):
|
# 确保tags和poss数量一致
|
||||||
self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
|
if len(original_tags) != len(poss):
|
||||||
if need_position:
|
self.logger.warning(f"[MinerU] Tag count ({len(original_tags)}) != position count ({len(poss)}), using first {min(len(original_tags), len(poss))} items")
|
||||||
return None, None
|
min_len = min(len(original_tags), len(poss))
|
||||||
return
|
original_tags = original_tags[:min_len]
|
||||||
|
poss = poss[:min_len]
|
||||||
|
|
||||||
page_count = len(self.page_images)
|
# Step 1: 收集所有tag对应的图片
|
||||||
|
images_to_stitch = []
|
||||||
|
seen_tags = set() # 用于去重
|
||||||
|
|
||||||
filtered_poss = []
|
for tag, pos in zip(original_tags, poss):
|
||||||
for pns, left, right, top, bottom in poss:
|
pns, left, right, top, bottom = pos
|
||||||
if not pns:
|
if not pns:
|
||||||
self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
|
|
||||||
continue
|
|
||||||
valid_pns = [p for p in pns if 0 <= p < page_count]
|
|
||||||
if not valid_pns:
|
|
||||||
self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
|
|
||||||
continue
|
|
||||||
filtered_poss.append((valid_pns, left, right, top, bottom))
|
|
||||||
|
|
||||||
poss = filtered_poss
|
|
||||||
if not poss:
|
|
||||||
self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
|
|
||||||
if need_position:
|
|
||||||
return None, None
|
|
||||||
return
|
|
||||||
|
|
||||||
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
|
|
||||||
GAP = 6
|
|
||||||
pos = poss[0]
|
|
||||||
first_page_idx = pos[0][0]
|
|
||||||
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
|
||||||
pos = poss[-1]
|
|
||||||
last_page_idx = pos[0][-1]
|
|
||||||
if not (0 <= last_page_idx < page_count):
|
|
||||||
self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
|
|
||||||
if need_position:
|
|
||||||
return None, None
|
|
||||||
return
|
|
||||||
last_page_height = self.page_images[last_page_idx].size[1]
|
|
||||||
poss.append(
|
|
||||||
(
|
|
||||||
[last_page_idx],
|
|
||||||
pos[1],
|
|
||||||
pos[2],
|
|
||||||
min(last_page_height, pos[4] + GAP),
|
|
||||||
min(last_page_height, pos[4] + 120),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
positions = []
|
|
||||||
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
|
||||||
right = left + max_width
|
|
||||||
|
|
||||||
if bottom <= top:
|
|
||||||
bottom = top + 2
|
|
||||||
|
|
||||||
for pn in pns[1:]:
|
|
||||||
if 0 <= pn - 1 < page_count:
|
|
||||||
bottom += self.page_images[pn - 1].size[1]
|
|
||||||
else:
|
|
||||||
self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
|
|
||||||
|
|
||||||
if not (0 <= pns[0] < page_count):
|
|
||||||
self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
img0 = self.page_images[pns[0]]
|
# ✅ 去重:如果tag已处理过,跳过
|
||||||
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
|
if tag in seen_tags:
|
||||||
crop0 = img0.crop((x0, y0, x1, y1))
|
self.logger.debug(f"[MinerU] Skipping duplicate tag: {tag}")
|
||||||
imgs.append(crop0)
|
continue
|
||||||
if 0 < ii < len(poss) - 1:
|
seen_tags.add(tag)
|
||||||
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
|
|
||||||
|
|
||||||
bottom -= img0.size[1]
|
# 优先级1: 查找MinerU原生图(表格/图片/公式)
|
||||||
for pn in pns[1:]:
|
native_img_path = self._find_native_image_path(tag)
|
||||||
if not (0 <= pn < page_count):
|
if native_img_path:
|
||||||
self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
|
try:
|
||||||
|
# ✅ 使用页宽标准化版本(原生图保留入库MinIO)
|
||||||
|
img = self._normalize_native_image_width(native_img_path, tag)
|
||||||
|
images_to_stitch.append(("native", img, pos, tag))
|
||||||
|
self.logger.debug(f"[MinerU] Using normalized native image for tag: {tag}")
|
||||||
continue
|
continue
|
||||||
page = self.page_images[pn]
|
except Exception as e:
|
||||||
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
|
self.logger.debug(f"[MinerU] Failed to load native image {native_img_path}: {e}")
|
||||||
cimgp = page.crop((x0, y0, x1, y1))
|
|
||||||
imgs.append(cimgp)
|
|
||||||
if 0 < ii < len(poss) - 1:
|
|
||||||
positions.append((pn + self.page_from, x0, x1, y0, y1))
|
|
||||||
bottom -= page.size[1]
|
|
||||||
|
|
||||||
if not imgs:
|
# 优先级2: 查找兜底生成的页宽图(缓存)
|
||||||
|
cache = getattr(self, "_img_path_cache", {})
|
||||||
|
if tag in cache:
|
||||||
|
try:
|
||||||
|
img = Image.open(cache[tag])
|
||||||
|
images_to_stitch.append(("cached", img, pos, tag))
|
||||||
|
self.logger.debug(f"[MinerU] Using cached fallback image for tag: {tag}")
|
||||||
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.debug(f"[MinerU] Failed to load cached image: {e}")
|
||||||
|
|
||||||
|
# 优先级3: 完整页兜底(如果page_images可用)
|
||||||
|
if hasattr(self, "page_images") and self.page_images:
|
||||||
|
page_idx = pns[0] # pns[0]是0-based的页索引
|
||||||
|
if 0 <= page_idx < len(self.page_images):
|
||||||
|
img = self.page_images[page_idx]
|
||||||
|
images_to_stitch.append(("fullpage", img, pos, tag))
|
||||||
|
self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}, page_idx={page_idx}")
|
||||||
|
|
||||||
|
if not images_to_stitch:
|
||||||
|
self.logger.warning("[MinerU] No images found for chunk")
|
||||||
if need_position:
|
if need_position:
|
||||||
return None, None
|
return None, None
|
||||||
return
|
return
|
||||||
|
|
||||||
height = 0
|
# ✅ 兜底图≤3张时,拼接完整页(去重)
|
||||||
for img in imgs:
|
fallback_count = sum(1 for src, _, _, _ in images_to_stitch if src == "cached")
|
||||||
height += img.size[1] + GAP
|
if fallback_count <= 3 and fallback_count > 0:
|
||||||
height = int(height)
|
self.logger.debug(f"[MinerU] Fallback count = {fallback_count}, using full page strategy")
|
||||||
width = int(np.max([i.size[0] for i in imgs]))
|
return self._handle_low_fallback_count(poss, need_position)
|
||||||
pic = Image.new("RGB", (width, height), (245, 245, 245))
|
|
||||||
height = 0
|
# Step 2: 智能拼接(带阈值控制)
|
||||||
for ii, img in enumerate(imgs):
|
return self._smart_stitch_with_thresholds(images_to_stitch, need_position)
|
||||||
if ii == 0 or ii + 1 == len(imgs):
|
|
||||||
img = img.convert("RGBA")
|
def _find_native_image_path(self, tag):
|
||||||
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
|
"""查找MinerU原生图片路径(表格/图片/公式)"""
|
||||||
overlay.putalpha(128)
|
# 需要在_read_output时建立 tag → native_img_path 的映射
|
||||||
img = Image.alpha_composite(img, overlay).convert("RGB")
|
native_map = getattr(self, "_native_img_map", {})
|
||||||
pic.paste(img, (0, int(height)))
|
return native_map.get(tag)
|
||||||
height += img.size[1] + GAP
|
|
||||||
|
def _normalize_native_image_width(self, native_img_path, tag):
|
||||||
|
"""
|
||||||
|
将Native图标准化为页宽版本(仅用于拼接)
|
||||||
|
|
||||||
|
原理:根据tag中的bbox,从页面重新裁剪页宽条带
|
||||||
|
- 横向:0 到 页宽
|
||||||
|
- 纵向:bbox的y范围
|
||||||
|
|
||||||
|
Args:
|
||||||
|
native_img_path: MinerU原生图路径(保留入库MinIO)
|
||||||
|
tag: 包含page_idx和bbox信息的tag字符串
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
页宽标准化后的Image对象,失败则返回原生图
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# 解析tag获取page_idx和bbox
|
||||||
|
import re
|
||||||
|
match = re.match(r"@@(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)##", tag)
|
||||||
|
if not match:
|
||||||
|
# 解析失败,返回原生图
|
||||||
|
return Image.open(native_img_path)
|
||||||
|
|
||||||
|
page_num, x0_str, x1_str, y0_str, y1_str = match.groups()
|
||||||
|
page_idx = int(page_num) - 1 # 转为0-based
|
||||||
|
bbox = [float(x0_str), float(y0_str), float(x1_str), float(y1_str)]
|
||||||
|
|
||||||
|
# 检查page_images可用性
|
||||||
|
if not hasattr(self, "page_images") or not self.page_images:
|
||||||
|
return Image.open(native_img_path)
|
||||||
|
|
||||||
|
if page_idx < 0 or page_idx >= len(self.page_images):
|
||||||
|
return Image.open(native_img_path)
|
||||||
|
|
||||||
|
# 获取页面图片
|
||||||
|
page_img = self.page_images[page_idx]
|
||||||
|
page_width, page_height = page_img.size
|
||||||
|
|
||||||
|
# bbox转像素
|
||||||
|
px0, py0, px1, py1 = self._bbox_to_pixels(bbox, (page_width, page_height))
|
||||||
|
|
||||||
|
# 裁剪页宽条带(横向全宽,纵向bbox范围)
|
||||||
|
crop_y0 = max(0, min(py0, page_height))
|
||||||
|
crop_y1 = max(crop_y0 + 1, min(py1, page_height))
|
||||||
|
|
||||||
|
if crop_y1 - crop_y0 < 2:
|
||||||
|
# bbox无效,返回原生图
|
||||||
|
return Image.open(native_img_path)
|
||||||
|
|
||||||
|
page_width_img = page_img.crop((0, crop_y0, page_width, crop_y1))
|
||||||
|
self.logger.debug(f"[MinerU] Normalized native image to page-width: {page_width}x{crop_y1-crop_y0}px")
|
||||||
|
return page_width_img
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.debug(f"[MinerU] Failed to normalize native image, using original: {e}")
|
||||||
|
return Image.open(native_img_path)
|
||||||
|
|
||||||
|
def _handle_low_fallback_count(self, poss, need_position):
|
||||||
|
"""
|
||||||
|
兜底图≤3张时,拼接涉及页面截图(去重)
|
||||||
|
|
||||||
|
策略:
|
||||||
|
- 提取所有涉及页码
|
||||||
|
- 去重并限制最多3页
|
||||||
|
- 拼接这些完整页
|
||||||
|
|
||||||
|
Args:
|
||||||
|
poss: positions列表
|
||||||
|
need_position: 是否需要返回positions
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
拼接的完整页截图,或单页截图
|
||||||
|
"""
|
||||||
|
if not hasattr(self, "page_images") or not self.page_images:
|
||||||
|
if need_position:
|
||||||
|
return None, None
|
||||||
|
return
|
||||||
|
|
||||||
|
# 提取所有涉及页码(0-based),去重并排序
|
||||||
|
page_indices = sorted(set(
|
||||||
|
pns[0] for pns, _, _, _, _ in poss
|
||||||
|
if pns and 0 <= pns[0] < len(self.page_images)
|
||||||
|
))
|
||||||
|
|
||||||
|
# 限制最多3页
|
||||||
|
page_indices = page_indices[:3]
|
||||||
|
|
||||||
|
if not page_indices:
|
||||||
|
if need_position:
|
||||||
|
return None, None
|
||||||
|
return
|
||||||
|
|
||||||
|
self.logger.info(f"[MinerU] Low fallback count, stitching {len(page_indices)} page(s): {[idx+1 for idx in page_indices]}")
|
||||||
|
|
||||||
|
# 单页直接返回
|
||||||
|
if len(page_indices) == 1:
|
||||||
|
page_img = self.page_images[page_indices[0]]
|
||||||
|
if need_position:
|
||||||
|
return page_img, [[page_indices[0], 0, page_img.width, 0, page_img.height]]
|
||||||
|
return page_img
|
||||||
|
|
||||||
|
# 多页垂直拼接
|
||||||
|
page_imgs_with_meta = [
|
||||||
|
("fullpage", self.page_images[idx], ([idx], 0, 0, 0, 0), f"@@{idx+1}\t0\t0\t0\t0##")
|
||||||
|
for idx in page_indices
|
||||||
|
]
|
||||||
|
|
||||||
|
return self._stitch_images_vertically(page_imgs_with_meta, need_position, gap=10)
|
||||||
|
|
||||||
|
|
||||||
|
def _smart_stitch_with_thresholds(self, images_with_metadata, need_position):
|
||||||
|
"""
|
||||||
|
智能拼接:应用阈值控制
|
||||||
|
|
||||||
|
Thresholds:
|
||||||
|
- MAX_COUNT: 最多20张图
|
||||||
|
- MAX_HEIGHT: 总高度不超过4000px
|
||||||
|
|
||||||
|
Strategies:
|
||||||
|
- 数量过多: 均匀采样到12张(保留首尾)
|
||||||
|
- 高度过高: 截断到4000px
|
||||||
|
- 不缩放图片(保持高清)
|
||||||
|
"""
|
||||||
|
MAX_COUNT = 20
|
||||||
|
SAMPLE_TARGET = 12 # 采样目标数量
|
||||||
|
MAX_HEIGHT = 4000
|
||||||
|
GAP = 6
|
||||||
|
|
||||||
|
# 1. 数量控制:如果超过20张,均匀采样到12张
|
||||||
|
if len(images_with_metadata) > MAX_COUNT:
|
||||||
|
self.logger.info(f"[MinerU] Too many images ({len(images_with_metadata)}), sampling to {SAMPLE_TARGET}")
|
||||||
|
images_with_metadata = self._sample_images_uniformly(images_with_metadata, SAMPLE_TARGET)
|
||||||
|
|
||||||
|
# 2. 高度控制:累加到4000px为止
|
||||||
|
trimmed_images = []
|
||||||
|
current_height = 0
|
||||||
|
|
||||||
|
for src, img, pos, tag in images_with_metadata:
|
||||||
|
if current_height + img.height > MAX_HEIGHT:
|
||||||
|
self.logger.info(f"[MinerU] Reached max height {MAX_HEIGHT}px at {len(trimmed_images)} images, stopping")
|
||||||
|
break
|
||||||
|
trimmed_images.append((src, img, pos, tag))
|
||||||
|
current_height += img.height + GAP
|
||||||
|
|
||||||
|
# 至少保留一张图
|
||||||
|
if not trimmed_images and images_with_metadata:
|
||||||
|
trimmed_images = [images_with_metadata[0]]
|
||||||
|
|
||||||
|
# 3. 垂直拼接(不缩放)
|
||||||
|
return self._stitch_images_vertically(trimmed_images, need_position, GAP)
|
||||||
|
|
||||||
|
def _sample_images_uniformly(self, images, target_count):
|
||||||
|
"""均匀采样:保留首尾,均匀抽取中间"""
|
||||||
|
if len(images) <= target_count:
|
||||||
|
return images
|
||||||
|
|
||||||
|
sampled = [images[0]] # 首张
|
||||||
|
step = len(images) / (target_count - 1)
|
||||||
|
for i in range(1, target_count - 1):
|
||||||
|
idx = int(i * step)
|
||||||
|
sampled.append(images[idx])
|
||||||
|
sampled.append(images[-1]) # 末张
|
||||||
|
return sampled
|
||||||
|
|
||||||
|
def _stitch_images_vertically(self, images_with_metadata, need_position, gap):
|
||||||
|
"""垂直拼接图片(不加补丁,不缩放)"""
|
||||||
|
if not images_with_metadata:
|
||||||
|
if need_position:
|
||||||
|
return None, None
|
||||||
|
return
|
||||||
|
|
||||||
|
imgs = [img for _, img, _, _ in images_with_metadata]
|
||||||
|
positions_list = [pos for _, _, pos, _ in images_with_metadata]
|
||||||
|
|
||||||
|
# 计算画布尺寸
|
||||||
|
total_height = sum(img.height for img in imgs) + gap * (len(imgs) - 1)
|
||||||
|
max_width = max(img.width for img in imgs)
|
||||||
|
|
||||||
|
# 创建画布
|
||||||
|
pic = Image.new("RGB", (max_width, total_height), (245, 245, 245))
|
||||||
|
|
||||||
|
# 逐张粘贴(垂直堆叠)
|
||||||
|
current_y = 0
|
||||||
|
positions = []
|
||||||
|
|
||||||
|
for idx, (img, pos) in enumerate(zip(imgs, positions_list)):
|
||||||
|
pic.paste(img, (0, current_y))
|
||||||
|
|
||||||
|
# 提取position信息
|
||||||
|
if pos and len(pos) >= 5:
|
||||||
|
pns, left, right, top, bottom = pos
|
||||||
|
if pns:
|
||||||
|
page_num = pns[0] + getattr(self, "page_from", 0)
|
||||||
|
positions.append((page_num, int(left), int(right), int(top), int(bottom)))
|
||||||
|
|
||||||
|
current_y += img.height + gap
|
||||||
|
|
||||||
if need_position:
|
if need_position:
|
||||||
return pic, positions
|
return pic, positions if positions else [(0, 0, max_width, 0, total_height)]
|
||||||
return pic
|
return pic
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -461,6 +645,85 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||||||
return poss
|
return poss
|
||||||
|
|
||||||
|
def _bbox_to_pixels(self, bbox, page_size):
|
||||||
|
x0, y0, x1, y1 = bbox
|
||||||
|
pw, ph = page_size
|
||||||
|
maxv = max(bbox)
|
||||||
|
# 经验:MinerU bbox 常为 0~1000 归一化;否则认为已是像素
|
||||||
|
if maxv <= 1.5:
|
||||||
|
sx, sy = pw, ph
|
||||||
|
elif maxv <= 1200:
|
||||||
|
sx, sy = pw / 1000.0, ph / 1000.0
|
||||||
|
else:
|
||||||
|
sx, sy = 1.0, 1.0
|
||||||
|
return (
|
||||||
|
int(x0 * sx),
|
||||||
|
int(y0 * sy),
|
||||||
|
int(x1 * sx),
|
||||||
|
int(y1 * sy),
|
||||||
|
)
|
||||||
|
|
||||||
|
def _generate_missing_images(self, outputs: list[dict[str, Any]], subdir: Path, file_stem: str):
|
||||||
|
"""生成兜底图:按页宽(横向全宽,纵向按bbox)"""
|
||||||
|
if not getattr(self, "page_images", None):
|
||||||
|
return
|
||||||
|
if not subdir:
|
||||||
|
return
|
||||||
|
img_root = subdir / "generated_images"
|
||||||
|
img_root.mkdir(parents=True, exist_ok=True)
|
||||||
|
text_types = {"text", "list", "header", "code", MinerUContentType.TEXT, MinerUContentType.LIST, MinerUContentType.EQUATION, MinerUContentType.CODE}
|
||||||
|
generated = 0
|
||||||
|
for idx, item in enumerate(outputs):
|
||||||
|
if item.get("type") not in text_types:
|
||||||
|
continue
|
||||||
|
if item.get("img_path"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
bbox = item.get("bbox")
|
||||||
|
if not bbox or len(bbox) != 4:
|
||||||
|
continue
|
||||||
|
|
||||||
|
page_idx = int(item.get("page_idx", 0))
|
||||||
|
if page_idx < 0 or page_idx >= len(self.page_images):
|
||||||
|
continue
|
||||||
|
|
||||||
|
x0, y0, x1, y1 = self._bbox_to_pixels(bbox, self.page_images[page_idx].size)
|
||||||
|
|
||||||
|
# 获取页面尺寸
|
||||||
|
pw, ph = self.page_images[page_idx].size
|
||||||
|
|
||||||
|
# ✅ 改为按页宽生成:横向=整页宽度,纵向=bbox范围
|
||||||
|
# x坐标:0 到 页宽
|
||||||
|
# y坐标:bbox的y0到y1(clamp到页面内)
|
||||||
|
crop_x0 = 0
|
||||||
|
crop_x1 = pw
|
||||||
|
crop_y0 = max(0, min(y0, ph))
|
||||||
|
crop_y1 = max(0, min(y1, ph))
|
||||||
|
|
||||||
|
# guard invalid bbox
|
||||||
|
if crop_y1 - crop_y0 < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 裁剪页宽条带
|
||||||
|
cropped = self.page_images[page_idx].crop((crop_x0, crop_y0, crop_x1, crop_y1))
|
||||||
|
fname = f"{file_stem}_gen_{idx}.jpg"
|
||||||
|
out_path = img_root / fname
|
||||||
|
cropped.save(out_path, format="JPEG", quality=80)
|
||||||
|
img_path_str = str(out_path.resolve())
|
||||||
|
item["img_path"] = img_path_str
|
||||||
|
|
||||||
|
# Cache for crop() lookup: use raw 0-1000 normalized tag for consistent matching
|
||||||
|
raw_tag = self._raw_line_tag(item)
|
||||||
|
self._img_path_cache[raw_tag] = img_path_str
|
||||||
|
generated += 1
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.debug(f"[MinerU] skip image gen idx={idx} page={page_idx}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if generated:
|
||||||
|
self.logger.info(f"[MinerU] generated {generated} page-width fallback images, cached {len(self._img_path_cache)} tags")
|
||||||
|
|
||||||
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
|
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
|
||||||
candidates = []
|
candidates = []
|
||||||
seen = set()
|
seen = set()
|
||||||
|
|
@ -530,10 +793,31 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
with open(json_file, "r", encoding="utf-8") as f:
|
with open(json_file, "r", encoding="utf-8") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
|
|
||||||
|
# 建立 tag → 原生img_path 的映射(表格/图片/公式)
|
||||||
|
self._native_img_map = {}
|
||||||
|
|
||||||
for item in data:
|
for item in data:
|
||||||
|
# 解析并补全路径
|
||||||
for key in ("img_path", "table_img_path", "equation_img_path"):
|
for key in ("img_path", "table_img_path", "equation_img_path"):
|
||||||
if key in item and item[key]:
|
if key in item and item[key]:
|
||||||
item[key] = str((subdir / item[key]).resolve())
|
item[key] = str((subdir / item[key]).resolve())
|
||||||
|
|
||||||
|
# 建立映射: tag → native_img_path
|
||||||
|
try:
|
||||||
|
tag = self._raw_line_tag(item)
|
||||||
|
self._native_img_map[tag] = item[key]
|
||||||
|
self.logger.debug(f"[MinerU] Mapped native image: {tag} → {item[key]}")
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.debug(f"[MinerU] Failed to map native image: {e}")
|
||||||
|
|
||||||
|
break # 只需要第一个找到的图片路径
|
||||||
|
|
||||||
|
# MinerU(vlm-http-client) 不会为纯文本生成图片,这里兜底用本地页图裁剪生成,方便后续引用/MinIO 存图
|
||||||
|
try:
|
||||||
|
self._generate_missing_images(data, subdir, file_stem)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"[MinerU] generate missing images failed: {e}")
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
|
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
|
||||||
|
|
@ -558,11 +842,11 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if section and parse_method == "manual":
|
if section and parse_method == "manual":
|
||||||
sections.append((section, output["type"], self._line_tag(output)))
|
sections.append((section, output["type"], self._raw_line_tag(output)))
|
||||||
elif section and parse_method == "paper":
|
elif section and parse_method == "paper":
|
||||||
sections.append((section + self._line_tag(output), output["type"]))
|
sections.append((section + self._raw_line_tag(output), output["type"]))
|
||||||
else:
|
else:
|
||||||
sections.append((section, self._line_tag(output)))
|
sections.append((section, self._raw_line_tag(output)))
|
||||||
return sections
|
return sections
|
||||||
|
|
||||||
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
|
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
|
||||||
|
|
@ -586,6 +870,9 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
|
|
||||||
temp_pdf = None
|
temp_pdf = None
|
||||||
created_tmp_dir = False
|
created_tmp_dir = False
|
||||||
|
# per-task cache reset to avoid stale images across documents
|
||||||
|
self._img_path_cache = {}
|
||||||
|
self._native_img_map = {}
|
||||||
|
|
||||||
# remove spaces, or mineru crash, and _read_output fail too
|
# remove spaces, or mineru crash, and _read_output fail too
|
||||||
file_path = Path(filepath)
|
file_path = Path(filepath)
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ import re
|
||||||
|
|
||||||
from common.constants import ParserType
|
from common.constants import ParserType
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
|
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
|
||||||
from common.token_utils import num_tokens_from_string
|
from common.token_utils import num_tokens_from_string
|
||||||
from deepdoc.parser import PdfParser, DocxParser
|
from deepdoc.parser import PdfParser, DocxParser
|
||||||
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
|
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
|
||||||
|
|
@ -213,40 +213,61 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||||
lang = lang,
|
lang = lang,
|
||||||
callback = callback,
|
callback = callback,
|
||||||
pdf_cls = Pdf,
|
pdf_cls = Pdf,
|
||||||
layout_recognizer = layout_recognizer,
|
|
||||||
parse_method = "manual",
|
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
def _normalize_section(section):
|
|
||||||
# pad section to length 3: (txt, sec_id, poss)
|
|
||||||
if len(section) == 1:
|
|
||||||
section = (section[0], "", [])
|
|
||||||
elif len(section) == 2:
|
|
||||||
section = (section[0], "", section[1])
|
|
||||||
elif len(section) != 3:
|
|
||||||
raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
|
|
||||||
|
|
||||||
txt, layoutno, poss = section
|
|
||||||
if isinstance(poss, str):
|
|
||||||
poss = pdf_parser.extract_positions(poss)
|
|
||||||
if poss:
|
|
||||||
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
|
|
||||||
pn = first[0]
|
|
||||||
if isinstance(pn, list) and pn:
|
|
||||||
pn = pn[0] # [pn] -> pn
|
|
||||||
poss[0] = (pn, *first[1:])
|
|
||||||
|
|
||||||
return (txt, layoutno, poss)
|
|
||||||
|
|
||||||
sections = [_normalize_section(sec) for sec in sections]
|
|
||||||
|
|
||||||
if not sections and not tbls:
|
if not sections and not tbls:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if name in ["tcadp", "docling", "mineru"]:
|
if name in ["tcadp", "docling", "mineru"]:
|
||||||
parser_config["chunk_token_num"] = 0
|
parser_config["chunk_token_num"] = 0
|
||||||
|
|
||||||
|
# Normalize sections to (text, layout, positions) even if parser only returns (text, tag)
|
||||||
|
def _extract_positions_from_tag(tag: str):
|
||||||
|
import re
|
||||||
|
poss = []
|
||||||
|
for t in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", tag or ""):
|
||||||
|
pn, left, right, top, bottom = t.strip("#").strip("@").split("\t")
|
||||||
|
poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom)))
|
||||||
|
return poss
|
||||||
|
|
||||||
|
normalized_sections = []
|
||||||
|
# 🎯 MinerU专用逻辑:直接使用已有的positions,不重新解析tag
|
||||||
|
is_mineru = name == "mineru"
|
||||||
|
|
||||||
|
for item in sections:
|
||||||
|
if len(item) >= 3:
|
||||||
|
# 已经是(text, layout, positions)格式
|
||||||
|
normalized_sections.append(item)
|
||||||
|
continue
|
||||||
|
|
||||||
|
txt, tag = item[0], item[1] if len(item) > 1 else ""
|
||||||
|
|
||||||
|
# ✅ MinerU: 如果tag包含完整的bbox信息,直接解析并使用
|
||||||
|
if is_mineru and tag:
|
||||||
|
poss = _extract_positions_from_tag(tag)
|
||||||
|
if not poss:
|
||||||
|
# 如果解析失败,尝试从tag字符串中手动提取(处理格式问题)
|
||||||
|
try:
|
||||||
|
# 更宽松的正则:允许空格或tab分隔
|
||||||
|
matches = re.findall(r"@@([0-9-]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)##", tag)
|
||||||
|
if matches:
|
||||||
|
for match in matches:
|
||||||
|
pn, left, right, top, bottom = match
|
||||||
|
poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom)))
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# 非MinerU:正常解析tag
|
||||||
|
poss = _extract_positions_from_tag(tag)
|
||||||
|
|
||||||
|
# 如果还是没有positions,使用默认值
|
||||||
|
if not poss:
|
||||||
|
poss = [(max(from_page, 0) + 1, 0.0, 0.0, 0.0, 0.0)]
|
||||||
|
|
||||||
|
normalized_sections.append((txt, "", poss))
|
||||||
|
sections = normalized_sections
|
||||||
|
|
||||||
callback(0.8, "Finish parsing.")
|
callback(0.8, "Finish parsing.")
|
||||||
|
|
||||||
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
|
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
|
||||||
|
|
@ -309,10 +330,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||||
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
|
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
|
||||||
res = tokenize_table(tbls, doc, eng)
|
res = tokenize_table(tbls, doc, eng)
|
||||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||||
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
|
|
||||||
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
|
|
||||||
if table_ctx or image_ctx:
|
|
||||||
attach_media_context(res, table_ctx, image_ctx)
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
|
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
|
||||||
|
|
@ -328,10 +345,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||||
d["doc_type_kwd"] = "image"
|
d["doc_type_kwd"] = "image"
|
||||||
tokenize(d, text, eng)
|
tokenize(d, text, eng)
|
||||||
res.append(d)
|
res.append(d)
|
||||||
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
|
|
||||||
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
|
|
||||||
if table_ctx or image_ctx:
|
|
||||||
attach_media_context(res, table_ctx, image_ctx)
|
|
||||||
return res
|
return res
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("file type not supported yet(pdf and docx supported)")
|
raise NotImplementedError("file type not supported yet(pdf and docx supported)")
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue