From 4ba17361e91bca253d840fd20a23912f9b8e39a2 Mon Sep 17 00:00:00 2001
From: rommy2017
Date: Tue, 2 Dec 2025 17:35:14 +0800
Subject: [PATCH] feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table formatting after parsing: it only ran
OCR and joined the text boxes of each page, so any table on a slide was
flattened into plain lines. The reworked parser runs layout analysis, table
analysis and table/figure extraction (forcing HTML output), then re-assembles
each page by sorting text boxes and table/figure blocks by their position on
the page.
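
In rough terms the re-assembly works like the sketch below. This is a
simplified illustration, not the code in the patch: `assemble_pages` and the
shapes of `text_boxes` / `table_blocks` are made up for the example, while the
real implementation reads `self.boxes` and the output of
`_extract_table_figure()` inside `Pdf.__call__`.

```python
from collections import defaultdict


def assemble_pages(text_boxes, table_blocks, page_images):
    """Merge OCR text boxes and extracted table/figure blocks per page,
    ordered by their position on the page (top first, then left)."""
    page_items = defaultdict(list)

    # OCR text boxes carry their page number and coordinates.
    for box in text_boxes:
        page_items[box["page_number"]].append(
            {"top": box["top"], "x0": box["x0"], "text": box["text"]})

    # Table/figure blocks keep their HTML (or plain text) content, so the
    # table structure survives into the page text.
    for block in table_blocks:
        page_items[block["page_number"]].append(
            {"top": block["top"], "x0": block["x0"], "text": block["html"]})

    pages = []
    for pn, img in enumerate(page_images, start=1):
        items = sorted(page_items.get(pn, []),
                       key=lambda x: (x["top"], x["x0"]))
        pages.append(("\n\n".join(item["text"] for item in items), img))
    return pages
```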

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
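
To try it locally, the module's `__main__` block already does essentially
`chunk(sys.argv[1], callback=dummy)`; a slightly fuller sketch looks like the
following, where the file path and the `progress()` helper are placeholders:

```python
# Manual smoke test: run the presentation chunker over a slide-deck PDF.
from rag.app.presentation import chunk


def progress(prog=None, msg=""):
    # chunk() reports progress through this callback, sometimes with only a
    # message and sometimes with a completion ratio as the first argument.
    print(prog, msg)


for d in chunk("slides.pdf", callback=progress):
    # Each chunk dict carries the page image plus the page/position fields
    # set in this patch; tokenize() fills in the tokenized text fields.
    print(d["page_num_int"], d["position_int"])
```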

---
 rag/app/presentation.py | 150 +++++++++++++++++++++++++++++-----------
 1 file changed, 110 insertions(+), 40 deletions(-)

diff --git a/rag/app/presentation.py b/rag/app/presentation.py
index 6a872528f..1404ac19b 100644
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -16,15 +16,17 @@
 import copy
 import re
+from collections import defaultdict
 from io import BytesIO
 
 from PIL import Image
-
-from rag.nlp import tokenize, is_english
-from rag.nlp import rag_tokenizer
-from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
+
+from deepdoc.parser import PdfParser, PptParser, PlainParser
 from rag.app.naive import by_plaintext, PARSERS
+from rag.nlp import rag_tokenizer
+from rag.nlp import tokenize, is_english
+
 
 
 class Ppt(PptParser):
     def __call__(self, fnm, from_page, to_page, callback=None):
@@ -44,42 +46,106 @@ class Ppt(PptParser):
                     buffered.seek(0)
                     imgs.append(Image.open(buffered).copy())
                 except RuntimeError as e:
-                    raise RuntimeError(f'ppt parse error at page {i+1}, original error: {str(e)}') from e
+                    raise RuntimeError(
+                        f'ppt parse error at page {i + 1}, original error: {str(e)}') from e
         assert len(imgs) == len(
-            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
+            txts), "Slides text and image do not match: {} vs. {}".format(
+            len(imgs), len(txts))
         callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]
 
+
 class Pdf(PdfParser):
     def __init__(self):
         super().__init__()
 
-    def __garbage(self, txt):
-        txt = txt.lower().strip()
-        if re.match(r"[0-9\.,%/-]+$", txt):
-            return True
-        if len(txt) < 3:
-            return True
-        return False
-
     def __call__(self, filename, binary=None, from_page=0,
-                 to_page=100000, zoomin=3, callback=None):
-        from timeit import default_timer as timer
-        start = timer()
+                 to_page=100000, zoomin=3, callback=None, **kwargs):
+        # 1. OCR
         callback(msg="OCR started")
-        self.__images__(filename if not binary else binary,
-                        zoomin, from_page, to_page, callback)
-        callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start))
-        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
-            len(self.boxes), len(self.page_images))
+        self.__images__(filename if not binary else binary, zoomin, from_page,
+                        to_page, callback)
+
+        # 2. Layout Analysis
+        callback(msg="Layout Analysis")
+        self._layouts_rec(zoomin)
+
+        # 3. Table Analysis
+        callback(msg="Table Analysis")
+        self._table_transformer_job(zoomin)
+
+        # 4. Text Merge
+        self._text_merge()
+
+        # 5. Extract Tables (Force HTML)
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+
+        # 6. Re-assemble Page Content
+        page_items = defaultdict(list)
+
+        # (A) Add text
+        for b in self.boxes:
+            if not (from_page < b["page_number"] <= to_page + from_page):
+                continue
+            page_items[b["page_number"]].append({
+                "top": b["top"],
+                "x0": b["x0"],
+                "text": b["text"],
+                "type": "text"
+            })
+
+        # (B) Add table and figure
+        for (img, content), positions in tbls:
+            if not positions:
+                continue
+
+            # Handle content type (list vs str)
+            if isinstance(content, list):
+                final_text = "\n".join(content)
+            elif isinstance(content, str):
+                final_text = content
+            else:
+                final_text = str(content)
+
+            try:
+                # Parse positions
+                pn_index = positions[0][0]
+                if isinstance(pn_index, list):
+                    pn_index = pn_index[0]
+                current_page_num = int(pn_index) + 1
+            except Exception as e:
+                print(f"Error parsing position: {e}")
+                continue
+
+            if not (from_page < current_page_num <= to_page + from_page):
+                continue
+
+            top = positions[0][3]
+            left = positions[0][1]
+
+            page_items[current_page_num].append({
+                "top": top,
+                "x0": left,
+                "text": final_text,
+                "type": "table_or_figure"
+            })
+
+        # 7. Generate result
         res = []
-        for i in range(len(self.boxes)):
-            lines = "\n".join([b["text"] for b in self.boxes[i]
-                               if not self.__garbage(b["text"])])
-            res.append((lines, self.page_images[i]))
-        callback(0.9, "Page {}~{}: Parsing finished".format(
-            from_page, min(to_page, self.total_page)))
+        for i in range(len(self.page_images)):
+            current_pn = from_page + i + 1
+            items = page_items.get(current_pn, [])
+            # Sort by vertical position
+            items.sort(key=lambda x: (x["top"], x["x0"]))
+            full_page_text = "\n\n".join([item["text"] for item in items])
+            if not full_page_text.strip():
+                full_page_text = f"[No text or data found in Page {current_pn}]"
+            page_img = self.page_images[i]
+            res.append((full_page_text, page_img))
+
+        callback(0.9, "Parsing finished")
+
         return res, []
 
 
@@ -106,14 +172,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     eng = lang.lower() == "english"
     doc = {
         "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        "title_tks": rag_tokenizer.tokenize(
+            re.sub(r"\.[a-zA-Z]+$", "", filename))
     }
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
         ppt_parser = Ppt()
         for pn, (txt, img) in enumerate(ppt_parser(
-                filename if not binary else binary, from_page, 1000000, callback)):
+                filename if not binary else binary, from_page, 1000000,
+                callback)):
             d = copy.deepcopy(doc)
             pn += from_page
             d["image"] = img
@@ -135,14 +203,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         callback(0.1, "Start to parse.")
 
     sections, _, _ = parser(
-        filename = filename,
-        binary = binary,
-        from_page = from_page,
-        to_page = to_page,
-        lang = lang,
-        callback = callback,
-        pdf_cls = Pdf,
-        layout_recognizer = layout_recognizer,
+        filename=filename,
+        binary=binary,
+        from_page=from_page,
+        to_page=to_page,
+        lang=lang,
+        callback=callback,
+        pdf_cls=Pdf,
+        layout_recognizer=layout_recognizer,
         **kwargs
     )
 
@@ -151,7 +219,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     if name in ["tcadp", "docling", "mineru"]:
         parser_config["chunk_token_num"] = 0
-
+
     callback(0.8, "Finish parsing.")
 
     for pn, (txt, img) in enumerate(sections):
@@ -161,7 +229,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         d["image"] = img
         d["page_num_int"] = [pn + 1]
         d["top_int"] = [0]
-        d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+        d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0,
+                              img.size[1] if img else 0)]
         tokenize(d, txt, eng)
         res.append(d)
     return res
@@ -175,4 +244,5 @@ if __name__ == "__main__":
 
     def dummy(a, b):
         pass
+
     chunk(sys.argv[1], callback=dummy)