diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index cb24d21a7..7d2fa0110 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -350,25 +350,31 @@ class MinerUParser(RAGFlowPdfParser): 3. 阈值控制(最多10张,总高<2000px) 4. 保持高清(不缩放) """ + # 从text中提取原始tags(保持1-based页码) + original_tags = re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text) poss = self.extract_positions(text) - if not poss: + + if not poss or not original_tags: if need_position: return None, None return + # 确保tags和poss数量一致 + if len(original_tags) != len(poss): + self.logger.warning(f"[MinerU] Tag count ({len(original_tags)}) != position count ({len(poss)}), using first {min(len(original_tags), len(poss))} items") + min_len = min(len(original_tags), len(poss)) + original_tags = original_tags[:min_len] + poss = poss[:min_len] + # Step 1: 收集所有tag对应的图片 images_to_stitch = [] seen_tags = set() # 用于去重 - for pos in poss: - # 构造tag用于查找 + for tag, pos in zip(original_tags, poss): pns, left, right, top, bottom = pos if not pns: continue - page_num = pns[0] + 1 # 转为1-based - tag = f"@@{page_num}\t{left:.1f}\t{right:.1f}\t{top:.1f}\t{bottom:.1f}##" - # ✅ 去重:如果tag已处理过,跳过 if tag in seen_tags: self.logger.debug(f"[MinerU] Skipping duplicate tag: {tag}") @@ -399,11 +405,11 @@ class MinerUParser(RAGFlowPdfParser): # 优先级3: 完整页兜底(如果page_images可用) if hasattr(self, "page_images") and self.page_images: - page_idx = pns[0] + page_idx = pns[0] # pns[0]是0-based的页索引 if 0 <= page_idx < len(self.page_images): img = self.page_images[page_idx] images_to_stitch.append(("fullpage", img, pos, tag)) - self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}") + self.logger.debug(f"[MinerU] Using full page fallback for tag: {tag}, page_idx={page_idx}") if not images_to_stitch: self.logger.warning("[MinerU] No images found for chunk") diff --git a/rag/app/manual.py b/rag/app/manual.py index 54a05f192..7c049f059 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -20,7 +20,7 @@ import re from common.constants import ParserType from io import BytesIO -from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context +from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from common.token_utils import num_tokens_from_string from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper @@ -155,7 +155,7 @@ class Docx(DocxParser): sum_question = '\n'.join(question_stack) if sum_question: ti_list.append((f'{sum_question}\n{last_answer}', last_image)) - + tbls = [] for tb in self.doc.tables: html= "" @@ -213,40 +213,61 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang = lang, callback = callback, pdf_cls = Pdf, - layout_recognizer = layout_recognizer, - parse_method = "manual", **kwargs ) - def _normalize_section(section): - # pad section to length 3: (txt, sec_id, poss) - if len(section) == 1: - section = (section[0], "", []) - elif len(section) == 2: - section = (section[0], "", section[1]) - elif len(section) != 3: - raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})") - - txt, layoutno, poss = section - if isinstance(poss, str): - poss = pdf_parser.extract_positions(poss) - if poss: - first = poss[0] # tuple: ([pn], x1, x2, y1, y2) - pn = first[0] - if isinstance(pn, list) and pn: - pn = pn[0] # [pn] -> pn - poss[0] = (pn, *first[1:]) - - return (txt, layoutno, poss) - - sections = [_normalize_section(sec) for sec in sections] - if not sections and not tbls: return [] if name in ["tcadp", "docling", "mineru"]: parser_config["chunk_token_num"] = 0 + # Normalize sections to (text, layout, positions) even if parser only returns (text, tag) + def _extract_positions_from_tag(tag: str): + import re + poss = [] + for t in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", tag or ""): + pn, left, right, top, bottom = t.strip("#").strip("@").split("\t") + poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom))) + return poss + + normalized_sections = [] + # 🎯 MinerU专用逻辑:直接使用已有的positions,不重新解析tag + is_mineru = name == "mineru" + + for item in sections: + if len(item) >= 3: + # 已经是(text, layout, positions)格式 + normalized_sections.append(item) + continue + + txt, tag = item[0], item[1] if len(item) > 1 else "" + + # ✅ MinerU: 如果tag包含完整的bbox信息,直接解析并使用 + if is_mineru and tag: + poss = _extract_positions_from_tag(tag) + if not poss: + # 如果解析失败,尝试从tag字符串中手动提取(处理格式问题) + try: + # 更宽松的正则:允许空格或tab分隔 + matches = re.findall(r"@@([0-9-]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)##", tag) + if matches: + for match in matches: + pn, left, right, top, bottom = match + poss.append((int(pn.split("-")[0]), float(left), float(right), float(top), float(bottom))) + except Exception as e: + pass + else: + # 非MinerU:正常解析tag + poss = _extract_positions_from_tag(tag) + + # 如果还是没有positions,使用默认值 + if not poss: + poss = [(max(from_page, 0) + 1, 0.0, 0.0, 0.0, 0.0)] + + normalized_sections.append((txt, "", poss)) + sections = normalized_sections + callback(0.8, "Finish parsing.") if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03: @@ -309,10 +330,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) res = tokenize_table(tbls, doc, eng) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) - table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) - image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) - if table_ctx or image_ctx: - attach_media_context(res, table_ctx, image_ctx) return res elif re.search(r"\.docx?$", filename, re.IGNORECASE): @@ -328,14 +345,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, d["doc_type_kwd"] = "image" tokenize(d, text, eng) res.append(d) - table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) - image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) - if table_ctx or image_ctx: - attach_media_context(res, table_ctx, image_ctx) return res else: raise NotImplementedError("file type not supported yet(pdf and docx supported)") - + if __name__ == "__main__": import sys