diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 6a872528f..96ad61a1d 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -16,15 +16,16 @@ import copy import re +from collections import defaultdict from io import BytesIO from PIL import Image - -from rag.nlp import tokenize, is_english -from rag.nlp import rag_tokenizer -from deepdoc.parser import PdfParser, PptParser, PlainParser from PyPDF2 import PdfReader as pdf2_read -from rag.app.naive import by_plaintext, PARSERS + +from deepdoc.parser import PdfParser, PptParser, PlainParser +from rag.nlp import rag_tokenizer +from rag.nlp import tokenize, is_english + class Ppt(PptParser): def __call__(self, fnm, from_page, to_page, callback=None): @@ -51,35 +52,90 @@ class Ppt(PptParser): self.is_english = is_english(txts) return [(txts[i], imgs[i]) for i in range(len(txts))] + class Pdf(PdfParser): def __init__(self): super().__init__() - def __garbage(self, txt): - txt = txt.lower().strip() - if re.match(r"[0-9\.,%/-]+$", txt): - return True - if len(txt) < 3: - return True - return False - def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - from timeit import default_timer as timer - start = timer() + # 1. OCR callback(msg="OCR started") - self.__images__(filename if not binary else binary, - zoomin, from_page, to_page, callback) - callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start)) - assert len(self.boxes) == len(self.page_images), "{} vs. {}".format( - len(self.boxes), len(self.page_images)) + self.__images__(filename if not binary else binary, zoomin, from_page, + to_page, callback) + + # 2. Layout Analysis + callback(msg="Layout Analysis") + self._layouts_rec(zoomin) + + # 3. Table Analysis + callback(msg="Table Analysis") + self._table_transformer_job(zoomin) + + # 4. Text Merge + self._text_merge() + + # 5. Extract Tables + tbls = self._extract_table_figure(True, zoomin, True, True) + + # 6. Re-assemble Page Content + page_items = defaultdict(list) + + # 7. Add text + for b in self.boxes: + if not (from_page < b["page_number"] <= to_page + from_page): + continue + page_items[b["page_number"]].append({ + "top": b["top"], + "x0": b["x0"], + "text": b["text"], + "type": "text" + }) + + # 8. Add table and figure + for (img, content), positions in tbls: + if not positions: + continue + if isinstance(content, list): + final_text = "\n".join(content) + elif isinstance(content, str): + final_text = content + else: + final_text = str(content) + try: + # deal positions: [ (pn, left, right, top, bottom), ... ] + pn_index = positions[0][0] + if isinstance(pn_index, list): + pn_index = pn_index[0] + current_page_num = int(pn_index) + 1 + except Exception as e: + print(f"Error parsing position: {e}") + continue + + if not (from_page < current_page_num <= to_page + from_page): + continue + + top = positions[0][3] + left = positions[0][1] + + page_items[current_page_num].append({ + "top": top, + "x0": left, + "text": final_text, # 确保这里一定是字符串 + "type": "table_or_figure" + }) + + # 9. Generate result res = [] - for i in range(len(self.boxes)): - lines = "\n".join([b["text"] for b in self.boxes[i] - if not self.__garbage(b["text"])]) - res.append((lines, self.page_images[i])) - callback(0.9, "Page {}~{}: Parsing finished".format( - from_page, min(to_page, self.total_page))) + for i in range(len(self.page_images)): + current_pn = from_page + i + 1 + items = page_items.get(current_pn, []) + items.sort(key=lambda x: (x["top"], x["x0"])) + full_page_text = "\n\n".join([item["text"] for item in items]) + page_img = self.page_images[i] + res.append((full_page_text, page_img)) + + callback(0.9, "Parsing finished") return res, [] @@ -125,33 +181,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res.append(d) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + parser = Pdf() - if isinstance(layout_recognizer, bool): - layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" - - name = layout_recognizer.strip().lower() - parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") - sections, _, _ = parser( - filename = filename, - binary = binary, - from_page = from_page, - to_page = to_page, - lang = lang, - callback = callback, - pdf_cls = Pdf, - layout_recognizer = layout_recognizer, - **kwargs + sections, _ = parser( + filename=filename, + binary=binary, + from_page=from_page, + to_page=to_page, + zoomin=3, + callback=callback ) - if not sections: - return [] - - if name in ["tcadp", "docling", "mineru"]: - parser_config["chunk_token_num"] = 0 - callback(0.8, "Finish parsing.") for pn, (txt, img) in enumerate(sections): @@ -161,7 +203,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, d["image"] = img d["page_num_int"] = [pn + 1] d["top_int"] = [0] - d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)] + d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, + img.size[1] if img else 0)] tokenize(d, txt, eng) res.append(d) return res