- Add ProgressTracker class for real-time progress parsing from mineru stdout - Implement intelligent stall-based timeout (5min no progress triggers warning) - Add vlm-direct backend mode to directly connect to vLLM server - Improve API mode with streaming response, HTTP session reuse, and auto-retry - Add heartbeat updates during long-running operations - Support dynamic timeout based on page count This addresses the deadlock issue where tasks hang indefinitely when using MinerU API backend, even after the API returns 200 OK.
965 lines
39 KiB
Python
965 lines
39 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import logging
|
|
import re
|
|
import os
|
|
from functools import reduce
|
|
from io import BytesIO
|
|
from timeit import default_timer as timer
|
|
from docx import Document
|
|
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
|
|
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
|
|
from docx.opc.oxml import parse_xml
|
|
from markdown import markdown
|
|
from PIL import Image
|
|
from common.token_utils import num_tokens_from_string
|
|
|
|
from common.constants import LLMType
|
|
from api.db.services.llm_service import LLMBundle
|
|
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
|
|
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
|
|
from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
|
|
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
|
from deepdoc.parser.mineru_parser import MinerUParser
|
|
from deepdoc.parser.docling_parser import DoclingParser
|
|
from deepdoc.parser.tcadp_parser import TCADPParser
|
|
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
|
|
|
|
|
|
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
|
callback = callback
|
|
binary = binary
|
|
pdf_parser = pdf_cls() if pdf_cls else Pdf()
|
|
sections, tables = pdf_parser(
|
|
filename if not binary else binary,
|
|
from_page=from_page,
|
|
to_page=to_page,
|
|
callback=callback
|
|
)
|
|
|
|
tables = vision_figure_parser_pdf_wrapper(tbls=tables,
|
|
callback=callback,
|
|
**kwargs)
|
|
return sections, tables, pdf_parser
|
|
|
|
|
|
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
|
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
|
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
|
mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
|
|
mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
|
|
parse_method = kwargs.get("parse_method", "raw")
|
|
|
|
# Initialize parser with all configuration
|
|
pdf_parser = MinerUParser(
|
|
mineru_path=mineru_executable,
|
|
mineru_api=mineru_api,
|
|
mineru_server_url=mineru_server_url
|
|
)
|
|
|
|
# Check installation with the specified backend
|
|
ok, reason = pdf_parser.check_installation(backend=mineru_backend, server_url=mineru_server_url)
|
|
if not ok:
|
|
callback(-1, f"MinerU not found or backend unavailable: {reason}")
|
|
return None, None, pdf_parser
|
|
|
|
sections, tables = pdf_parser.parse_pdf(
|
|
filepath=filename,
|
|
binary=binary,
|
|
callback=callback,
|
|
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
|
backend=mineru_backend,
|
|
server_url=mineru_server_url,
|
|
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
|
parse_method=parse_method
|
|
)
|
|
return sections, tables, pdf_parser
|
|
|
|
|
|
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
|
pdf_parser = DoclingParser()
|
|
parse_method = kwargs.get("parse_method", "raw")
|
|
|
|
if not pdf_parser.check_installation():
|
|
callback(-1, "Docling not found.")
|
|
return None, None, pdf_parser
|
|
|
|
sections, tables = pdf_parser.parse_pdf(
|
|
filepath=filename,
|
|
binary=binary,
|
|
callback=callback,
|
|
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
|
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
|
parse_method=parse_method
|
|
)
|
|
return sections, tables, pdf_parser
|
|
|
|
|
|
def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
|
tcadp_parser = TCADPParser()
|
|
|
|
if not tcadp_parser.check_installation():
|
|
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
|
|
return None, None, tcadp_parser
|
|
|
|
sections, tables = tcadp_parser.parse_pdf(
|
|
filepath=filename,
|
|
binary=binary,
|
|
callback=callback,
|
|
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
|
|
file_type="PDF"
|
|
)
|
|
return sections, tables, tcadp_parser
|
|
|
|
|
|
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
|
if kwargs.get("layout_recognizer", "") == "Plain Text":
|
|
pdf_parser = PlainParser()
|
|
else:
|
|
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
|
|
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
|
|
|
sections, tables = pdf_parser(
|
|
filename if not binary else binary,
|
|
from_page=from_page,
|
|
to_page=to_page,
|
|
callback=callback
|
|
)
|
|
return sections, tables, pdf_parser
|
|
|
|
|
|
PARSERS = {
|
|
"deepdoc": by_deepdoc,
|
|
"mineru": by_mineru,
|
|
"docling": by_docling,
|
|
"tcadp": by_tcadp,
|
|
"plaintext": by_plaintext, # default
|
|
}
|
|
|
|
|
|
class Docx(DocxParser):
|
|
def __init__(self):
|
|
pass
|
|
|
|
def get_picture(self, document, paragraph):
|
|
imgs = paragraph._element.xpath('.//pic:pic')
|
|
if not imgs:
|
|
return None
|
|
res_img = None
|
|
for img in imgs:
|
|
embed = img.xpath('.//a:blip/@r:embed')
|
|
if not embed:
|
|
continue
|
|
embed = embed[0]
|
|
try:
|
|
related_part = document.part.related_parts[embed]
|
|
image_blob = related_part.image.blob
|
|
except UnrecognizedImageError:
|
|
logging.info("Unrecognized image format. Skipping image.")
|
|
continue
|
|
except UnexpectedEndOfFileError:
|
|
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
|
|
continue
|
|
except InvalidImageStreamError:
|
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
|
continue
|
|
except UnicodeDecodeError:
|
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
|
continue
|
|
except Exception:
|
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
|
continue
|
|
try:
|
|
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
|
if res_img is None:
|
|
res_img = image
|
|
else:
|
|
res_img = concat_img(res_img, image)
|
|
except Exception:
|
|
continue
|
|
|
|
return res_img
|
|
|
|
def __clean(self, line):
|
|
line = re.sub(r"\u3000", " ", line).strip()
|
|
return line
|
|
|
|
def __get_nearest_title(self, table_index, filename):
|
|
"""Get the hierarchical title structure before the table"""
|
|
import re
|
|
from docx.text.paragraph import Paragraph
|
|
|
|
titles = []
|
|
blocks = []
|
|
|
|
# Get document name from filename parameter
|
|
doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
|
|
if not doc_name:
|
|
doc_name = "Untitled Document"
|
|
|
|
# Collect all document blocks while maintaining document order
|
|
try:
|
|
# Iterate through all paragraphs and tables in document order
|
|
for i, block in enumerate(self.doc._element.body):
|
|
if block.tag.endswith('p'): # Paragraph
|
|
p = Paragraph(block, self.doc)
|
|
blocks.append(('p', i, p))
|
|
elif block.tag.endswith('tbl'): # Table
|
|
blocks.append(('t', i, None)) # Table object will be retrieved later
|
|
except Exception as e:
|
|
logging.error(f"Error collecting blocks: {e}")
|
|
return ""
|
|
|
|
# Find the target table position
|
|
target_table_pos = -1
|
|
table_count = 0
|
|
for i, (block_type, pos, _) in enumerate(blocks):
|
|
if block_type == 't':
|
|
if table_count == table_index:
|
|
target_table_pos = pos
|
|
break
|
|
table_count += 1
|
|
|
|
if target_table_pos == -1:
|
|
return "" # Target table not found
|
|
|
|
# Find the nearest heading paragraph in reverse order
|
|
nearest_title = None
|
|
for i in range(len(blocks)-1, -1, -1):
|
|
block_type, pos, block = blocks[i]
|
|
if pos >= target_table_pos: # Skip blocks after the table
|
|
continue
|
|
|
|
if block_type != 'p':
|
|
continue
|
|
|
|
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
|
|
try:
|
|
level_match = re.search(r"(\d+)", block.style.name)
|
|
if level_match:
|
|
level = int(level_match.group(1))
|
|
if level <= 7: # Support up to 7 heading levels
|
|
title_text = block.text.strip()
|
|
if title_text: # Avoid empty titles
|
|
nearest_title = (level, title_text)
|
|
break
|
|
except Exception as e:
|
|
logging.error(f"Error parsing heading level: {e}")
|
|
|
|
if nearest_title:
|
|
# Add current title
|
|
titles.append(nearest_title)
|
|
current_level = nearest_title[0]
|
|
|
|
# Find all parent headings, allowing cross-level search
|
|
while current_level > 1:
|
|
found = False
|
|
for i in range(len(blocks)-1, -1, -1):
|
|
block_type, pos, block = blocks[i]
|
|
if pos >= target_table_pos: # Skip blocks after the table
|
|
continue
|
|
|
|
if block_type != 'p':
|
|
continue
|
|
|
|
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
|
|
try:
|
|
level_match = re.search(r"(\d+)", block.style.name)
|
|
if level_match:
|
|
level = int(level_match.group(1))
|
|
# Find any heading with a higher level
|
|
if level < current_level:
|
|
title_text = block.text.strip()
|
|
if title_text: # Avoid empty titles
|
|
titles.append((level, title_text))
|
|
current_level = level
|
|
found = True
|
|
break
|
|
except Exception as e:
|
|
logging.error(f"Error parsing parent heading: {e}")
|
|
|
|
if not found: # Break if no parent heading is found
|
|
break
|
|
|
|
# Sort by level (ascending, from highest to lowest)
|
|
titles.sort(key=lambda x: x[0])
|
|
# Organize titles (from highest to lowest)
|
|
hierarchy = [doc_name] + [t[1] for t in titles]
|
|
return " > ".join(hierarchy)
|
|
|
|
return ""
|
|
|
|
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
|
|
self.doc = Document(
|
|
filename) if not binary else Document(BytesIO(binary))
|
|
pn = 0
|
|
lines = []
|
|
last_image = None
|
|
for p in self.doc.paragraphs:
|
|
if pn > to_page:
|
|
break
|
|
if from_page <= pn < to_page:
|
|
if p.text.strip():
|
|
if p.style and p.style.name == 'Caption':
|
|
former_image = None
|
|
if lines and lines[-1][1] and lines[-1][2] != 'Caption':
|
|
former_image = lines[-1][1].pop()
|
|
elif last_image:
|
|
former_image = last_image
|
|
last_image = None
|
|
lines.append((self.__clean(p.text), [former_image], p.style.name))
|
|
else:
|
|
current_image = self.get_picture(self.doc, p)
|
|
image_list = [current_image]
|
|
if last_image:
|
|
image_list.insert(0, last_image)
|
|
last_image = None
|
|
lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
|
|
else:
|
|
if current_image := self.get_picture(self.doc, p):
|
|
if lines:
|
|
lines[-1][1].append(current_image)
|
|
else:
|
|
last_image = current_image
|
|
for run in p.runs:
|
|
if 'lastRenderedPageBreak' in run._element.xml:
|
|
pn += 1
|
|
continue
|
|
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
|
|
pn += 1
|
|
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
|
|
|
|
tbls = []
|
|
for i, tb in enumerate(self.doc.tables):
|
|
title = self.__get_nearest_title(i, filename)
|
|
html = "<table>"
|
|
if title:
|
|
html += f"<caption>Table Location: {title}</caption>"
|
|
for r in tb.rows:
|
|
html += "<tr>"
|
|
i = 0
|
|
try:
|
|
while i < len(r.cells):
|
|
span = 1
|
|
c = r.cells[i]
|
|
for j in range(i + 1, len(r.cells)):
|
|
if c.text == r.cells[j].text:
|
|
span += 1
|
|
i = j
|
|
else:
|
|
break
|
|
i += 1
|
|
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
|
|
except Exception as e:
|
|
logging.warning(f"Error parsing table, ignore: {e}")
|
|
html += "</tr>"
|
|
html += "</table>"
|
|
tbls.append(((None, html), ""))
|
|
return new_line, tbls
|
|
|
|
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
|
|
"""
|
|
This function uses mammoth, licensed under the BSD 2-Clause License.
|
|
"""
|
|
|
|
import base64
|
|
import uuid
|
|
|
|
import mammoth
|
|
from markdownify import markdownify
|
|
|
|
docx_file = BytesIO(binary) if binary else open(filename, "rb")
|
|
|
|
def _convert_image_to_base64(image):
|
|
try:
|
|
with image.open() as image_file:
|
|
image_bytes = image_file.read()
|
|
encoded = base64.b64encode(image_bytes).decode("utf-8")
|
|
base64_url = f"data:{image.content_type};base64,{encoded}"
|
|
|
|
alt_name = "image"
|
|
alt_name = f"img_{uuid.uuid4().hex[:8]}"
|
|
|
|
return {"src": base64_url, "alt": alt_name}
|
|
except Exception as e:
|
|
logging.warning(f"Failed to convert image to base64: {e}")
|
|
return {"src": "", "alt": "image"}
|
|
|
|
try:
|
|
if inline_images:
|
|
result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
|
|
else:
|
|
result = mammoth.convert_to_html(docx_file)
|
|
|
|
html = result.value
|
|
|
|
markdown_text = markdownify(html)
|
|
return markdown_text
|
|
|
|
finally:
|
|
if not binary:
|
|
docx_file.close()
|
|
|
|
|
|
class Pdf(PdfParser):
|
|
def __init__(self):
|
|
super().__init__()
|
|
|
|
def __call__(self, filename, binary=None, from_page=0,
|
|
to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
|
|
start = timer()
|
|
first_start = start
|
|
callback(msg="OCR started")
|
|
self.__images__(
|
|
filename if not binary else binary,
|
|
zoomin,
|
|
from_page,
|
|
to_page,
|
|
callback
|
|
)
|
|
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
|
|
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
|
|
|
|
start = timer()
|
|
self._layouts_rec(zoomin)
|
|
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
|
|
|
|
start = timer()
|
|
self._table_transformer_job(zoomin)
|
|
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
|
|
|
|
start = timer()
|
|
self._text_merge(zoomin=zoomin)
|
|
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
|
|
|
|
if separate_tables_figures:
|
|
tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
|
|
self._concat_downward()
|
|
logging.info("layouts cost: {}s".format(timer() - first_start))
|
|
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
|
|
else:
|
|
tbls = self._extract_table_figure(True, zoomin, True, True)
|
|
self._naive_vertical_merge()
|
|
self._concat_downward()
|
|
self._final_reading_order_merge()
|
|
# self._filter_forpages()
|
|
logging.info("layouts cost: {}s".format(timer() - first_start))
|
|
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
|
|
|
|
|
|
class Markdown(MarkdownParser):
|
|
def md_to_html(self, sections):
|
|
if not sections:
|
|
return []
|
|
if isinstance(sections, type("")):
|
|
text = sections
|
|
elif isinstance(sections[0], type("")):
|
|
text = sections[0]
|
|
else:
|
|
return []
|
|
|
|
from bs4 import BeautifulSoup
|
|
html_content = markdown(text)
|
|
soup = BeautifulSoup(html_content, 'html.parser')
|
|
return soup
|
|
|
|
def get_hyperlink_urls(self, soup):
|
|
if soup:
|
|
return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
|
|
return []
|
|
|
|
def extract_image_urls_with_lines(self, text):
|
|
md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
|
|
html_img_re = re.compile(r'src=["\\\']([^"\\\'>\\s]+)', re.IGNORECASE)
|
|
urls = []
|
|
seen = set()
|
|
lines = text.splitlines()
|
|
for idx, line in enumerate(lines):
|
|
for url in md_img_re.findall(line):
|
|
if (url, idx) not in seen:
|
|
urls.append({"url": url, "line": idx})
|
|
seen.add((url, idx))
|
|
for url in html_img_re.findall(line):
|
|
if (url, idx) not in seen:
|
|
urls.append({"url": url, "line": idx})
|
|
seen.add((url, idx))
|
|
|
|
# cross-line
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
|
|
soup = BeautifulSoup(text, 'html.parser')
|
|
newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
|
|
for img_tag in soup.find_all('img'):
|
|
src = img_tag.get('src')
|
|
if not src:
|
|
continue
|
|
|
|
tag_str = str(img_tag)
|
|
pos = text.find(tag_str)
|
|
if pos == -1:
|
|
# fallback
|
|
pos = max(text.find(src), 0)
|
|
line_no = 0
|
|
for i, off in enumerate(newline_offsets):
|
|
if pos <= off:
|
|
line_no = i
|
|
break
|
|
if (src, line_no) not in seen:
|
|
urls.append({"url": src, "line": line_no})
|
|
seen.add((src, line_no))
|
|
except Exception:
|
|
pass
|
|
|
|
return urls
|
|
|
|
def load_images_from_urls(self, urls, cache=None):
|
|
import requests
|
|
from pathlib import Path
|
|
|
|
cache = cache or {}
|
|
images = []
|
|
for url in urls:
|
|
if url in cache:
|
|
if cache[url]:
|
|
images.append(cache[url])
|
|
continue
|
|
img_obj = None
|
|
try:
|
|
if url.startswith(('http://', 'https://')):
|
|
response = requests.get(url, stream=True, timeout=30)
|
|
if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
|
|
img_obj = Image.open(BytesIO(response.content)).convert('RGB')
|
|
else:
|
|
local_path = Path(url)
|
|
if local_path.exists():
|
|
img_obj = Image.open(url).convert('RGB')
|
|
else:
|
|
logging.warning(f"Local image file not found: {url}")
|
|
except Exception as e:
|
|
logging.error(f"Failed to download/open image from {url}: {e}")
|
|
cache[url] = img_obj
|
|
if img_obj:
|
|
images.append(img_obj)
|
|
return images, cache
|
|
|
|
def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
|
|
if binary:
|
|
encoding = find_codec(binary)
|
|
txt = binary.decode(encoding, errors="ignore")
|
|
else:
|
|
with open(filename, "r") as f:
|
|
txt = f.read()
|
|
|
|
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
|
|
# To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
|
|
# extractor = MarkdownElementExtractor(remainder)
|
|
extractor = MarkdownElementExtractor(txt)
|
|
image_refs = self.extract_image_urls_with_lines(txt)
|
|
element_sections = extractor.extract_elements(delimiter, include_meta=True)
|
|
|
|
sections = []
|
|
section_images = []
|
|
image_cache = {}
|
|
for element in element_sections:
|
|
content = element["content"]
|
|
start_line = element["start_line"]
|
|
end_line = element["end_line"]
|
|
urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
|
|
imgs = []
|
|
if urls_in_section:
|
|
imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
|
|
combined_image = None
|
|
if imgs:
|
|
combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
|
|
sections.append((content, ""))
|
|
section_images.append(combined_image)
|
|
|
|
tbls = []
|
|
for table in tables:
|
|
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
|
|
if return_section_images:
|
|
return sections, tbls, section_images
|
|
return sections, tbls
|
|
|
|
def load_from_xml_v2(baseURI, rels_item_xml):
|
|
"""
|
|
Return |_SerializedRelationships| instance loaded with the
|
|
relationships contained in *rels_item_xml*. Returns an empty
|
|
collection if *rels_item_xml* is |None|.
|
|
"""
|
|
srels = _SerializedRelationships()
|
|
if rels_item_xml is not None:
|
|
rels_elm = parse_xml(rels_item_xml)
|
|
for rel_elm in rels_elm.Relationship_lst:
|
|
if rel_elm.target_ref in ('../NULL', 'NULL'):
|
|
continue
|
|
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
|
|
return srels
|
|
|
|
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
|
"""
|
|
Supported file formats are docx, pdf, excel, txt.
|
|
This method apply the naive ways to chunk files.
|
|
Successive text will be sliced into pieces using 'delimiter'.
|
|
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
|
|
"""
|
|
urls = set()
|
|
url_res = []
|
|
|
|
is_english = lang.lower() == "english" # is_english(cks)
|
|
parser_config = kwargs.get(
|
|
"parser_config", {
|
|
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
|
|
|
|
child_deli = re.findall(r"`([^`]+)`", parser_config.get("children_delimiter", ""))
|
|
child_deli = sorted(set(child_deli), key=lambda x: -len(x))
|
|
child_deli = "|".join(re.escape(t) for t in child_deli if t)
|
|
is_markdown = False
|
|
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
|
|
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
|
|
|
|
doc = {
|
|
"docnm_kwd": filename,
|
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
|
}
|
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
|
res = []
|
|
pdf_parser = None
|
|
section_images = None
|
|
|
|
is_root = kwargs.get("is_root", True)
|
|
embed_res = []
|
|
if is_root:
|
|
# Only extract embedded files at the root call
|
|
embeds = []
|
|
if binary is not None:
|
|
embeds = extract_embed_file(binary)
|
|
else:
|
|
raise Exception("Embedding extraction from file path is not supported.")
|
|
|
|
# Recursively chunk each embedded file and collect results
|
|
for embed_filename, embed_bytes in embeds:
|
|
try:
|
|
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
|
|
embed_res.extend(sub_res)
|
|
except Exception as e:
|
|
if callback:
|
|
callback(0.05, f"Failed to chunk embed {embed_filename}: {e}")
|
|
continue
|
|
|
|
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
|
callback(0.1, "Start to parse.")
|
|
if parser_config.get("analyze_hyperlink", False) and is_root:
|
|
urls = extract_links_from_docx(binary)
|
|
for index, url in enumerate(urls):
|
|
html_bytes, metadata = extract_html(url)
|
|
if not html_bytes:
|
|
continue
|
|
try:
|
|
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
|
|
except Exception as e:
|
|
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
|
|
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
|
|
url_res.extend(sub_url_res)
|
|
|
|
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
|
|
_SerializedRelationships.load_from_xml = load_from_xml_v2
|
|
sections, tables = Docx()(filename, binary)
|
|
|
|
tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
|
|
|
|
res = tokenize_table(tables, doc, is_english)
|
|
callback(0.8, "Finish parsing.")
|
|
|
|
st = timer()
|
|
|
|
chunks, images = naive_merge_docx(
|
|
sections, int(parser_config.get(
|
|
"chunk_token_num", 128)), parser_config.get(
|
|
"delimiter", "\n!?。;!?"))
|
|
|
|
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
|
|
logging.info("naive_merge({}): {}".format(filename, timer() - st))
|
|
res.extend(embed_res)
|
|
res.extend(url_res)
|
|
if table_context_size or image_context_size:
|
|
attach_media_context(res, table_context_size, image_context_size)
|
|
return res
|
|
|
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
|
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
|
if parser_config.get("analyze_hyperlink", False) and is_root:
|
|
urls = extract_links_from_pdf(binary)
|
|
|
|
if isinstance(layout_recognizer, bool):
|
|
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
|
|
|
name = layout_recognizer.strip().lower()
|
|
parser = PARSERS.get(name, by_plaintext)
|
|
callback(0.1, "Start to parse.")
|
|
|
|
sections, tables, pdf_parser = parser(
|
|
filename = filename,
|
|
binary = binary,
|
|
from_page = from_page,
|
|
to_page = to_page,
|
|
lang = lang,
|
|
callback = callback,
|
|
layout_recognizer = layout_recognizer,
|
|
**kwargs
|
|
)
|
|
|
|
if not sections and not tables:
|
|
return []
|
|
|
|
if name in ["tcadp", "docling", "mineru"]:
|
|
parser_config["chunk_token_num"] = 0
|
|
|
|
res = tokenize_table(tables, doc, is_english)
|
|
callback(0.8, "Finish parsing.")
|
|
|
|
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
|
|
callback(0.1, "Start to parse.")
|
|
|
|
# Check if tcadp_parser is selected for spreadsheet files
|
|
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
|
if layout_recognizer == "TCADP Parser":
|
|
table_result_type = parser_config.get("table_result_type", "1")
|
|
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
|
|
tcadp_parser = TCADPParser(
|
|
table_result_type=table_result_type,
|
|
markdown_image_response_type=markdown_image_response_type
|
|
)
|
|
if not tcadp_parser.check_installation():
|
|
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
|
|
return res
|
|
|
|
# Determine file type based on extension
|
|
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
|
|
|
|
sections, tables = tcadp_parser.parse_pdf(
|
|
filepath=filename,
|
|
binary=binary,
|
|
callback=callback,
|
|
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
|
|
file_type=file_type
|
|
)
|
|
parser_config["chunk_token_num"] = 0
|
|
res = tokenize_table(tables, doc, is_english)
|
|
callback(0.8, "Finish parsing.")
|
|
else:
|
|
# Default DeepDOC parser
|
|
excel_parser = ExcelParser()
|
|
if parser_config.get("html4excel"):
|
|
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
|
parser_config["chunk_token_num"] = 0
|
|
else:
|
|
sections = [(_, "") for _ in excel_parser(binary) if _]
|
|
|
|
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
|
callback(0.1, "Start to parse.")
|
|
sections = TxtParser()(filename, binary,
|
|
parser_config.get("chunk_token_num", 128),
|
|
parser_config.get("delimiter", "\n!?;。;!?"))
|
|
callback(0.8, "Finish parsing.")
|
|
|
|
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
|
callback(0.1, "Start to parse.")
|
|
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
|
|
sections, tables, section_images = markdown_parser(
|
|
filename,
|
|
binary,
|
|
separate_tables=False,
|
|
delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
|
|
return_section_images=True,
|
|
)
|
|
|
|
is_markdown = True
|
|
|
|
try:
|
|
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
|
|
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
|
|
except Exception:
|
|
vision_model = None
|
|
|
|
if vision_model:
|
|
# Process images for each section
|
|
for idx, (section_text, _) in enumerate(sections):
|
|
images = []
|
|
if section_images and len(section_images) > idx and section_images[idx] is not None:
|
|
images.append(section_images[idx])
|
|
|
|
if images and len(images) > 0:
|
|
# If multiple images found, combine them using concat_img
|
|
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
|
|
if section_images:
|
|
section_images[idx] = combined_image
|
|
else:
|
|
section_images = [None] * len(sections)
|
|
section_images[idx] = combined_image
|
|
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
|
|
boosted_figures = markdown_vision_parser(callback=callback)
|
|
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
|
|
|
|
else:
|
|
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
|
|
|
|
if parser_config.get("hyperlink_urls", False) and is_root:
|
|
for idx, (section_text, _) in enumerate(sections):
|
|
soup = markdown_parser.md_to_html(section_text)
|
|
hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
|
|
urls.update(hyperlink_urls)
|
|
res = tokenize_table(tables, doc, is_english)
|
|
callback(0.8, "Finish parsing.")
|
|
|
|
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
|
|
callback(0.1, "Start to parse.")
|
|
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
|
|
sections = HtmlParser()(filename, binary, chunk_token_num)
|
|
sections = [(_, "") for _ in sections if _]
|
|
callback(0.8, "Finish parsing.")
|
|
|
|
elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
|
|
callback(0.1, "Start to parse.")
|
|
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
|
|
sections = JsonParser(chunk_token_num)(binary)
|
|
sections = [(_, "") for _ in sections if _]
|
|
callback(0.8, "Finish parsing.")
|
|
|
|
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
|
callback(0.1, "Start to parse.")
|
|
|
|
try:
|
|
from tika import parser as tika_parser
|
|
except Exception as e:
|
|
callback(0.8, f"tika not available: {e}. Unsupported .doc parsing.")
|
|
logging.warning(f"tika not available: {e}. Unsupported .doc parsing for {filename}.")
|
|
return []
|
|
|
|
binary = BytesIO(binary)
|
|
doc_parsed = tika_parser.from_buffer(binary)
|
|
if doc_parsed.get('content', None) is not None:
|
|
sections = doc_parsed['content'].split('\n')
|
|
sections = [(_, "") for _ in sections if _]
|
|
callback(0.8, "Finish parsing.")
|
|
else:
|
|
callback(0.8, f"tika.parser got empty content from {filename}.")
|
|
logging.warning(f"tika.parser got empty content from {filename}.")
|
|
return []
|
|
else:
|
|
raise NotImplementedError(
|
|
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
|
|
|
st = timer()
|
|
if is_markdown:
|
|
merged_chunks = []
|
|
merged_images = []
|
|
chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
|
|
overlapped_percent = int(parser_config.get("overlapped_percent", 0))
|
|
overlapped_percent = max(0, min(overlapped_percent, 90))
|
|
|
|
current_text = ""
|
|
current_tokens = 0
|
|
current_image = None
|
|
|
|
for idx, sec in enumerate(sections):
|
|
text = sec[0] if isinstance(sec, tuple) else sec
|
|
sec_tokens = num_tokens_from_string(text)
|
|
sec_image = section_images[idx] if section_images and idx < len(section_images) else None
|
|
|
|
if current_text and current_tokens + sec_tokens > chunk_limit:
|
|
merged_chunks.append(current_text)
|
|
merged_images.append(current_image)
|
|
overlap_part = ""
|
|
if overlapped_percent > 0:
|
|
overlap_len = int(len(current_text) * overlapped_percent / 100)
|
|
if overlap_len > 0:
|
|
overlap_part = current_text[-overlap_len:]
|
|
current_text = overlap_part
|
|
current_tokens = num_tokens_from_string(current_text)
|
|
current_image = current_image if overlap_part else None
|
|
|
|
if current_text:
|
|
current_text += "\n" + text
|
|
else:
|
|
current_text = text
|
|
current_tokens += sec_tokens
|
|
|
|
if sec_image:
|
|
current_image = concat_img(current_image, sec_image) if current_image else sec_image
|
|
|
|
if current_text:
|
|
merged_chunks.append(current_text)
|
|
merged_images.append(current_image)
|
|
|
|
chunks = merged_chunks
|
|
has_images = merged_images and any(img is not None for img in merged_images)
|
|
|
|
if has_images:
|
|
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
|
|
else:
|
|
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
|
|
else:
|
|
if section_images:
|
|
if all(image is None for image in section_images):
|
|
section_images = None
|
|
|
|
if section_images:
|
|
chunks, images = naive_merge_with_images(sections, section_images,
|
|
int(parser_config.get(
|
|
"chunk_token_num", 128)), parser_config.get(
|
|
"delimiter", "\n!?。;!?"))
|
|
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
|
|
else:
|
|
chunks = naive_merge(
|
|
sections, int(parser_config.get(
|
|
"chunk_token_num", 128)), parser_config.get(
|
|
"delimiter", "\n!?。;!?"))
|
|
|
|
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
|
|
|
|
if urls and parser_config.get("analyze_hyperlink", False) and is_root:
|
|
for index, url in enumerate(urls):
|
|
html_bytes, metadata = extract_html(url)
|
|
if not html_bytes:
|
|
continue
|
|
try:
|
|
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
|
|
except Exception as e:
|
|
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
|
|
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
|
|
url_res.extend(sub_url_res)
|
|
|
|
logging.info("naive_merge({}): {}".format(filename, timer() - st))
|
|
|
|
if embed_res:
|
|
res.extend(embed_res)
|
|
if url_res:
|
|
res.extend(url_res)
|
|
if table_context_size or image_context_size:
|
|
attach_media_context(res, table_context_size, image_context_size)
|
|
return res
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
|
|
def dummy(prog=None, msg=""):
|
|
pass
|
|
|
|
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|