From 10432a1be7b5e546c828aea98d40ad4cb04f9d85 Mon Sep 17 00:00:00 2001
From: zhudongwork <32665466+zhudongwork@users.noreply.github.com>
Date: Tue, 22 Apr 2025 10:16:24 +0800
Subject: [PATCH] Refa: Optimize pptx shape extraction to reduce content loss
 (#6703)

### What problem does this PR solve?

When parsing pptx files, accessing the `shape_type` attribute of some
shapes raises `NotImplementedError`, which caused the original code to
throw an exception during extraction, so content extraction failed. This
change adds handling logic for such anomalous shapes, providing a safer
and more robust processing mechanism.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [x] Performance Improvement
- [ ] Other (please describe):
---
 deepdoc/parser/ppt_parser.py | 61 +++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 21 deletions(-)

diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
index 8757e3090..83c275309 100644
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -31,29 +31,48 @@ class RAGFlowPptParser:
             return paragraph.text
 
     def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(
-                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
+        try:
+            # Text-bearing shapes first: this path returns before shape_type is read
+            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
+                text_frame = shape.text_frame
+                texts = []
+                for paragraph in text_frame.paragraphs:
+                    if paragraph.text.strip():
+                        texts.append(self.__get_bulleted_text(paragraph))
+                return "\n".join(texts)
 
-        if shape.has_text_frame:
-            text_frame = shape.text_frame
-            texts = []
-            for paragraph in text_frame.paragraphs:
-                if paragraph.text.strip():
-                    texts.append(self.__get_bulleted_text(paragraph))
-            return "\n".join(texts)
+            # Reading shape_type can raise NotImplementedError, so guard the access
+            try:
+                shape_type = shape.shape_type
+            except NotImplementedError:
+                # No usable shape_type: fall back to the shape's plain text, if any
+                if hasattr(shape, 'text'):
+                    return shape.text.strip()
+                return ""
 
-        if shape.shape_type == 6:
-            texts = []
-            for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
-                t = self.__extract(p)
-                if t:
-                    texts.append(t)
-            return "\n".join(texts)
+            # Table (shape_type 19): one "header: cell; ..." line per row after row 0
+            if shape_type == 19:
+                tb = shape.table
+                rows = []
+                for i in range(1, len(tb.rows)):
+                    rows.append("; ".join([tb.cell(
+                        0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+                return "\n".join(rows)
+
+            # Group (shape_type 6): recurse into children ordered by (top // 10, left)
+            if shape_type == 6:
+                texts = []
+                for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
+                    t = self.__extract(p)
+                    if t:
+                        texts.append(t)
+                return "\n".join(texts)
+
+            return ""
+
+        except Exception as e:
+            logging.error(f"Error processing shape: {str(e)}")
+            return ""
 
     def __call__(self, fnm, from_page, to_page, callback=None):
         ppt = Presentation(fnm) if isinstance(