fix: MinerU crop tag matching and manual.py bbox parsing
- Fixed crop() to extract the original tags from the text instead of reconstructing them
- Added MinerU-specific logic in manual.py to handle space/tab-separated tags
- Removed a redundant import re that caused an UnboundLocalError
- Ensures correct bbox coordinates for native images, fallback images, and page selection
parent 3ce7b02d50
commit 2d4750535f
2 changed files with 63 additions and 44 deletions
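For reference, the position tags both files parse are inline markers of the form `@@<page>\t<left>\t<right>\t<top>\t<bottom>##` embedded in the chunk text. A minimal sketch of the round trip, with a made-up tag (the format is inferred from the regexes in this diff):

```python
import re

# Hypothetical chunk text carrying one position tag:
# @@<1-based page>\t<left>\t<right>\t<top>\t<bottom>##
text = "Figure caption ...@@3\t10.5\t200.0\t15.2\t80.9##"

# Same pattern crop() now uses to pull the original tags out of the text.
for t in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
    pn, left, right, top, bottom = t.strip("#").strip("@").split("\t")
    # 1-based page number plus bbox coordinates
    print(int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom))
```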
```diff
@@ -350,25 +350,31 @@ class MinerUParser(RAGFlowPdfParser):
         3. Threshold control (at most 10 images, total height < 2000 px)
         4. Keep full resolution (no scaling)
         """
+        # Extract the original tags from the text (keeps 1-based page numbers)
+        original_tags = re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text)
         poss = self.extract_positions(text)
 
-        if not poss:
+        if not poss or not original_tags:
             if need_position:
                 return None, None
             return
 
+        # Keep tags and positions aligned
+        if len(original_tags) != len(poss):
+            self.logger.warning(f"[MinerU] Tag count ({len(original_tags)}) != position count ({len(poss)}), using first {min(len(original_tags), len(poss))} items")
+            min_len = min(len(original_tags), len(poss))
+            original_tags = original_tags[:min_len]
+            poss = poss[:min_len]
+
         # Step 1: collect the image for every tag
         images_to_stitch = []
         seen_tags = set()  # for de-duplication
 
-        for pos in poss:
-            # Rebuild the tag for lookup
+        for tag, pos in zip(original_tags, poss):
             pns, left, right, top, bottom = pos
             if not pns:
                 continue
 
-            page_num = pns[0] + 1  # convert to 1-based
-            tag = f"@@{page_num}\t{left:.1f}\t{right:.1f}\t{top:.1f}\t{bottom:.1f}##"
-
             # De-duplicate: skip tags that have already been processed
             if tag in seen_tags:
                 self.logger.debug(f"[MinerU] Skipping duplicate tag: {tag}")
```
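Why extract the original tags instead of rebuilding them: formatting the parsed coordinates back through `:.1f` can yield a string that no longer matches the tag embedded in the text, so any lookup keyed on the rebuilt tag silently misses. A toy illustration with made-up coordinates:

```python
# Original tag as it appears in the text (two decimal places).
original = "@@3\t10.55\t200.0\t15.2\t80.9##"

# The old code rebuilt the tag from the parsed floats with :.1f formatting.
left = 10.55
rebuilt = f"@@3\t{left:.1f}\t200.0\t15.2\t80.9##"

print(rebuilt == original)  # False: "10.6" (rounded) != "10.55"
```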
```diff
@@ -399,11 +405,11 @@ class MinerUParser(RAGFlowPdfParser):
 
             # Priority 3: full-page fallback (if page_images is available)
             if hasattr(self, "page_images") and self.page_images:
-                page_idx = pns[0]
+                page_idx = pns[0]  # pns[0] is a 0-based page index
                 if 0 <= page_idx < len(self.page_images):
                     img = self.page_images[page_idx]
                     images_to_stitch.append(("fullpage", img, pos, tag))
-                    self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}")
+                    self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}, page_idx={page_idx}")
 
         if not images_to_stitch:
             self.logger.warning("[MinerU] No images found for chunk")
```
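The clarified comment matters because pns[0] is already a 0-based index into page_images; applying the 1-based conversion used for tag text would fetch the wrong page or fall out of range. A toy example with stand-in data:

```python
# Stand-ins for rendered page images (index 0 = page 1).
page_images = ["img_page1", "img_page2", "img_page3"]
pns = [2]  # parsed position: 0-based index of page 3

page_idx = pns[0]  # use as-is; no +1 conversion for list indexing
if 0 <= page_idx < len(page_images):
    print(page_images[page_idx])  # img_page3

# The off-by-one this guards against: page_images[pns[0] + 1]
# would be out of range here.
```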
```diff
@@ -20,7 +20,7 @@ import re
 
 from common.constants import ParserType
 from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
 from common.token_utils import num_tokens_from_string
 from deepdoc.parser import PdfParser, DocxParser
 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
```
```diff
@@ -155,7 +155,7 @@ class Docx(DocxParser):
                 sum_question = '\n'.join(question_stack)
                 if sum_question:
                     ti_list.append((f'{sum_question}\n{last_answer}', last_image))
 
         tbls = []
         for tb in self.doc.tables:
             html = "<table>"
```
```diff
@@ -213,40 +213,61 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             lang = lang,
             callback = callback,
             pdf_cls = Pdf,
             layout_recognizer = layout_recognizer,
             parse_method = "manual",
             **kwargs
         )
 
-        def _normalize_section(section):
-            # pad section to length 3: (txt, sec_id, poss)
-            if len(section) == 1:
-                section = (section[0], "", [])
-            elif len(section) == 2:
-                section = (section[0], "", section[1])
-            elif len(section) != 3:
-                raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
-
-            txt, layoutno, poss = section
-            if isinstance(poss, str):
-                poss = pdf_parser.extract_positions(poss)
-            if poss:
-                first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
-                pn = first[0]
-                if isinstance(pn, list) and pn:
-                    pn = pn[0]  # [pn] -> pn
-                poss[0] = (pn, *first[1:])
-
-            return (txt, layoutno, poss)
-
-        sections = [_normalize_section(sec) for sec in sections]
-
         if not sections and not tbls:
             return []
 
         if name in ["tcadp", "docling", "mineru"]:
             parser_config["chunk_token_num"] = 0
 
+        # Normalize sections to (text, layout, positions) even if the parser only returns (text, tag)
+        def _extract_positions_from_tag(tag: str):
+            import re
+            poss = []
+            for t in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", tag or ""):
+                pn, left, right, top, bottom = t.strip("#").strip("@").split("\t")
+                poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom)))
+            return poss
+
+        normalized_sections = []
+        # MinerU-specific logic: use the positions we already have instead of re-parsing tags
+        is_mineru = name == "mineru"
+
+        for item in sections:
+            if len(item) >= 3:
+                # Already in (text, layout, positions) form
+                normalized_sections.append(item)
+                continue
+
+            txt, tag = item[0], item[1] if len(item) > 1 else ""
+
+            # MinerU: if the tag carries full bbox info, parse and use it directly
+            if is_mineru and tag:
+                poss = _extract_positions_from_tag(tag)
+                if not poss:
+                    # If parsing failed, try manual extraction from the tag string (handles format quirks)
+                    try:
+                        # More lenient regex: allow space- or tab-separated fields
+                        matches = re.findall(r"@@([0-9-]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)##", tag)
+                        if matches:
+                            for match in matches:
+                                pn, left, right, top, bottom = match
+                                poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom)))
+                    except Exception:
+                        pass
+            else:
+                # Non-MinerU: parse the tag normally
+                poss = _extract_positions_from_tag(tag)
+
+            # If there are still no positions, fall back to a default
+            if not poss:
+                poss = [(max(from_page, 0) + 1, 0.0, 0.0, 0.0, 0.0)]
+
+            normalized_sections.append((txt, "", poss))
+        sections = normalized_sections
 
         callback(0.8, "Finish parsing.")
 
         if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
```
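The strict pattern requires literal tabs between fields, so a tag whose separators were normalized to spaces somewhere upstream parses to nothing; the lenient `\s+` fallback accepts either. A quick demonstration with fabricated tags:

```python
import re

strict = r"@@[0-9-]+\t[0-9.\t]+##"
lenient = r"@@([0-9-]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)##"

tab_tag = "@@3\t10.5\t200.0\t15.2\t80.9##"
space_tag = "@@3 10.5 200.0 15.2 80.9##"  # tabs lost upstream

print(re.findall(strict, tab_tag))     # matches
print(re.findall(strict, space_tag))   # [] -- the strict pattern misses it
print(re.findall(lenient, space_tag))  # [('3', '10.5', '200.0', '15.2', '80.9')]
```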
```diff
@@ -309,10 +330,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
         res = tokenize_table(tbls, doc, eng)
         res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
-        table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
-        image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
-        if table_ctx or image_ctx:
-            attach_media_context(res, table_ctx, image_ctx)
         return res
 
     elif re.search(r"\.docx?$", filename, re.IGNORECASE):
```
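Aside: the removed context-attachment lines used a defensive coercion idiom, max(0, int(value or 0)), which maps missing keys, None, empty strings, and negative settings to a safe non-negative int. A standalone sketch of the pattern with hypothetical config values:

```python
def ctx_size(parser_config: dict, key: str) -> int:
    # None, missing, 0, and "" all collapse to 0; negatives are clamped.
    return max(0, int(parser_config.get(key, 0) or 0))

print(ctx_size({}, "table_context_size"))                             # 0 (missing)
print(ctx_size({"table_context_size": None}, "table_context_size"))   # 0 (None)
print(ctx_size({"table_context_size": -3}, "table_context_size"))     # 0 (clamped)
print(ctx_size({"table_context_size": "2"}, "table_context_size"))    # 2 (string coerced)
```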
```diff
@@ -328,14 +345,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             d["doc_type_kwd"] = "image"
             tokenize(d, text, eng)
             res.append(d)
-        table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
-        image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
-        if table_ctx or image_ctx:
-            attach_media_context(res, table_ctx, image_ctx)
         return res
     else:
         raise NotImplementedError("file type not supported yet(pdf and docx supported)")
 
 
 if __name__ == "__main__":
     import sys
```