feat: improve presentation PdfParser

The old presentation PdfParser lost the table formatting after parsing.
This commit is contained in:
rommy2017 2025-12-01 17:50:40 +08:00 committed by GitHub
parent 81ae6cf78d
commit 956a36651e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -16,15 +16,16 @@
import copy import copy
import re import re
from collections import defaultdict
from io import BytesIO from io import BytesIO
from PIL import Image from PIL import Image
from rag.nlp import tokenize, is_english
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read from PyPDF2 import PdfReader as pdf2_read
from rag.app.naive import by_plaintext, PARSERS
from deepdoc.parser import PdfParser, PptParser, PlainParser
from rag.nlp import rag_tokenizer
from rag.nlp import tokenize, is_english
class Ppt(PptParser): class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None): def __call__(self, fnm, from_page, to_page, callback=None):
@ -51,35 +52,90 @@ class Ppt(PptParser):
self.is_english = is_english(txts) self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))] return [(txts[i], imgs[i]) for i in range(len(txts))]
class Pdf(PdfParser): class Pdf(PdfParser):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
def __garbage(self, txt):
    """Return True when *txt* is OCR noise not worth keeping.

    A fragment is considered garbage if, after normalization, it is
    nothing but digits/number punctuation (page numbers, percentages,
    dates) or is shorter than three characters.
    """
    cleaned = txt.lower().strip()
    # Pure runs of digits and numeric punctuation carry no content.
    if re.match(r"[0-9\.,%/-]+$", cleaned):
        return True
    # Very short fragments are almost always stray OCR artifacts.
    return len(cleaned) < 3
def __call__(self, filename, binary=None, from_page=0,
             to_page=100000, zoomin=3, callback=None):
    """Parse a PDF into per-page text while preserving table layout.

    Args:
        filename: path to the PDF; ignored for content when *binary* is given.
        binary: optional in-memory PDF bytes.
        from_page / to_page: inclusive-exclusive page window to parse.
        zoomin: rasterization zoom factor passed to the OCR/layout stages.
        callback: progress callable invoked with status messages.

    Returns:
        ``(pages, [])`` where *pages* is one ``(text, page_image)`` pair per
        rendered page. Tables and figures are merged back into the page text
        at their original positions so their layout survives chunking; the
        second element is always an empty list.
    """
    # 1. OCR: rasterize the requested page range and recognize text boxes.
    callback(msg="OCR started")
    self.__images__(filename if not binary else binary, zoomin, from_page,
                    to_page, callback)

    # 2. Layout analysis (paragraphs, titles, figures, ...).
    callback(msg="Layout Analysis")
    self._layouts_rec(zoomin)

    # 3. Table structure recognition.
    callback(msg="Table Analysis")
    self._table_transformer_job(zoomin)

    # 4. Merge fragmented text boxes.
    self._text_merge()

    # 5. Pull tables/figures out of the ordinary text flow.
    tbls = self._extract_table_figure(True, zoomin, True, True)

    # 6. Re-assemble page content: group every item by its page number.
    page_items = defaultdict(list)

    # 7. Plain text boxes within the requested page window.
    for b in self.boxes:
        if not (from_page < b["page_number"] <= to_page + from_page):
            continue
        page_items[b["page_number"]].append({
            "top": b["top"],
            "x0": b["x0"],
            "text": b["text"],
            "type": "text"
        })

    # 8. Tables and figures, re-inserted at their original coordinates.
    for (img, content), positions in tbls:
        if not positions:
            continue
        # Normalize content to a single string. Table rows may arrive as a
        # list whose items need not be str, so coerce each row explicitly —
        # a bare "\n".join(content) would raise TypeError on non-str rows.
        if isinstance(content, list):
            final_text = "\n".join(str(row) for row in content)
        elif isinstance(content, str):
            final_text = content
        else:
            final_text = str(content)
        try:
            # positions layout: [(pn, left, right, top, bottom), ...];
            # pn itself may be wrapped in a list depending on the extractor.
            pn_index = positions[0][0]
            if isinstance(pn_index, list):
                pn_index = pn_index[0]
            current_page_num = int(pn_index) + 1
        except Exception as e:
            # Best-effort: skip a table whose position cannot be decoded.
            print(f"Error parsing position: {e}")
            continue
        if not (from_page < current_page_num <= to_page + from_page):
            continue
        page_items[current_page_num].append({
            "top": positions[0][3],
            "x0": positions[0][1],
            "text": final_text,  # guaranteed to be a string at this point
            "type": "table_or_figure"
        })

    # 9. Emit one (text, image) pair per rendered page, restoring reading
    #    order by sorting items top-to-bottom, then left-to-right.
    res = []
    for i in range(len(self.page_images)):
        current_pn = from_page + i + 1
        items = page_items.get(current_pn, [])
        items.sort(key=lambda x: (x["top"], x["x0"]))
        full_page_text = "\n\n".join(item["text"] for item in items)
        res.append((full_page_text, self.page_images[i]))
    callback(0.9, "Parsing finished")
    return res, []
@ -125,33 +181,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.append(d) res.append(d)
return res return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") parser = Pdf()
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
sections, _, _ = parser( sections, _ = parser(
filename = filename, filename=filename,
binary = binary, binary=binary,
from_page = from_page, from_page=from_page,
to_page = to_page, to_page=to_page,
lang = lang, zoomin=3,
callback = callback, callback=callback
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
**kwargs
) )
if not sections:
return []
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
for pn, (txt, img) in enumerate(sections): for pn, (txt, img) in enumerate(sections):
@ -161,7 +203,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["image"] = img d["image"] = img
d["page_num_int"] = [pn + 1] d["page_num_int"] = [pn + 1]
d["top_int"] = [0] d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)] d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0,
img.size[1] if img else 0)]
tokenize(d, txt, eng) tokenize(d, txt, eng)
res.append(d) res.append(d)
return res return res