From 10432a1be7b5e546c828aea98d40ad4cb04f9d85 Mon Sep 17 00:00:00 2001
From: zhudongwork <32665466+zhudongwork@users.noreply.github.com>
Date: Tue, 22 Apr 2025 10:16:24 +0800
Subject: [PATCH] Refa: Optimize pptx shape extraction to reduce content loss (#6703)

### What problem does this PR solve?

When parsing pptx files, some shapes do not contain the `shape_type` attribute, which causes the original code to throw an exception during extraction, leading to failure in content extraction. This optimization introduces handling logic for such anomalous shapes, providing a safer and more robust processing mechanism.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [x] Performance Improvement
- [ ] Other (please describe):
---
 deepdoc/parser/ppt_parser.py | 70 ++++++++++++++++++++++++-----------
 1 file changed, 49 insertions(+), 21 deletions(-)

diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
index 8757e3090..83c275309 100644
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -31,29 +31,57 @@ class RAGFlowPptParser:
         return paragraph.text
 
     def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(
-                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
+        """Extract the text content of a single pptx shape.
+
+        Text frames are read first. Otherwise tables (shape_type 19) are
+        rendered row by row as "header: cell" pairs and group shapes
+        (shape_type 6) are walked recursively, children ordered by
+        (top // 10, left). If reading shape_type raises NotImplementedError,
+        shape.text is used when present. Any other exception is logged and
+        an empty string is returned.
+        """
+        try:
+            # First try to get text content
+            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
+                text_frame = shape.text_frame
+                texts = []
+                for paragraph in text_frame.paragraphs:
+                    if paragraph.text.strip():
+                        texts.append(self.__get_bulleted_text(paragraph))
+                return "\n".join(texts)
 
-        if shape.has_text_frame:
-            text_frame = shape.text_frame
-            texts = []
-            for paragraph in text_frame.paragraphs:
-                if paragraph.text.strip():
-                    texts.append(self.__get_bulleted_text(paragraph))
-            return "\n".join(texts)
+            # Safely get shape_type
+            try:
+                shape_type = shape.shape_type
+            except NotImplementedError:
+                # If shape_type is not available, try to get text content
+                if hasattr(shape, 'text'):
+                    return shape.text.strip()
+                return ""
 
-        if shape.shape_type == 6:
-            texts = []
-            for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
-                t = self.__extract(p)
-                if t:
-                    texts.append(t)
-            return "\n".join(texts)
+            # Handle table
+            if shape_type == 19:
+                tb = shape.table
+                rows = []
+                for i in range(1, len(tb.rows)):
+                    rows.append("; ".join([tb.cell(
+                        0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+                return "\n".join(rows)
+
+            # Handle group shape
+            if shape_type == 6:
+                texts = []
+                for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
+                    t = self.__extract(p)
+                    if t:
+                        texts.append(t)
+                return "\n".join(texts)
+
+            return ""
+
+        except Exception as e:
+            logging.error(f"Error processing shape: {str(e)}")
+            return ""
 
     def __call__(self, fnm, from_page, to_page, callback=None):
         ppt = Presentation(fnm) if isinstance(