From aef92861816a68c4714dae9dd404f08728b40cdf Mon Sep 17 00:00:00 2001
From: Shivam Johri <shivamjohri@Shivams-MacBook-Air.local>
Date: Thu, 11 Dec 2025 21:23:34 +0530
Subject: [PATCH 1/8] feat: Add Excel export support and fix variable reference
 regex

Changes:
- Add Excel export output format option to Message component
- Apply nest_asyncio patch to handle nested event loops
- Fix async generator iteration in canvas_app.py debug endpoint
- Add underscore support in variable reference regex pattern
---
 agent/component/base.py                |  2 +-
 agent/component/message.py             | 44 +++++++++++++++++++++++++-
 api/apps/canvas_app.py                 | 10 ++++--
 web/src/pages/agent/constant/index.tsx |  1 +
 4 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/agent/component/base.py b/agent/component/base.py
index 81d3fac56..321907dbe 100644
--- a/agent/component/base.py
+++ b/agent/component/base.py
@@ -393,7 +393,7 @@ class ComponentParamBase(ABC):
 class ComponentBase(ABC):
     component_name: str
     thread_limiter = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT_CHATS", 10)))
-    variable_ref_patt = r"\{* *\{([a-zA-Z:0-9]+@[A-Za-z0-9_.]+|sys\.[A-Za-z0-9_.]+|env\.[A-Za-z0-9_.]+)\} *\}*"
+    variable_ref_patt = r"\{* *\{([a-zA-Z_:0-9]+@[A-Za-z0-9_.]+|sys\.[A-Za-z0-9_.]+|env\.[A-Za-z0-9_.]+)\} *\}*"
 
     def __str__(self):
         """
diff --git a/agent/component/message.py b/agent/component/message.py
index b4e2985e0..e2428a084 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -14,6 +14,8 @@
 #  limitations under the License.
 #
 import asyncio
+import nest_asyncio
+nest_asyncio.apply()
 import inspect
 import json
 import os
@@ -207,7 +209,7 @@ class Message(ComponentBase):
         import pypandoc
         doc_id = get_uuid()
 
-        if self._param.output_format.lower() not in {"markdown", "html", "pdf", "docx"}:
+        if self._param.output_format.lower() not in {"markdown", "html", "pdf", "docx", "xlsx"}:
             self._param.output_format = "markdown"
 
         try:
@@ -227,6 +229,46 @@ class Message(ComponentBase):
 
                 binary_content = converted.encode("utf-8")
 
+            elif self._param.output_format == "xlsx":
+                import pandas as pd
+                from io import BytesIO
+
+                if isinstance(content, str):
+                    try:
+                        # Convert markdown to HTML tables to help pandas parse it
+                        html_content = pypandoc.convert_text(content, to="html", format="markdown")
+                        dfs = pd.read_html(html_content)
+                    except Exception as e:
+                        dfs = []
+                    
+                    if not dfs:
+                        df = pd.DataFrame({"Content": [content]})
+                        dfs = [df]
+                else:
+                    # Should not accept file path for Excel generation from agent response usually, 
+                    # but if it does, read it as text
+                    with open(content, "r") as f:
+                        txt_content = f.read()
+                    try:
+                         html_content = pypandoc.convert_text(txt_content, to="html", format="markdown")
+                         dfs = pd.read_html(html_content)
+                    except Exception:
+                        dfs = []
+                    
+                    if not dfs:
+                        df = pd.DataFrame({"Content": [txt_content]})
+                        dfs = [df]
+
+                # Write to Excel
+                excel_io = BytesIO()
+                with pd.ExcelWriter(excel_io, engine='openpyxl') as writer:
+                    for i, df in enumerate(dfs):
+                        sheet_name = f"Sheet{i+1}"
+                        df.to_excel(writer, sheet_name=sheet_name, index=False)
+                
+                excel_io.seek(0)
+                binary_content = excel_io.read()
+
             else:  # pdf, docx
                 with tempfile.NamedTemporaryFile(suffix=f".{self._param.output_format}", delete=False) as tmp:
                     tmp_name = tmp.name
diff --git a/api/apps/canvas_app.py b/api/apps/canvas_app.py
index ed8c8c7a0..64b0d0f55 100644
--- a/api/apps/canvas_app.py
+++ b/api/apps/canvas_app.py
@@ -14,6 +14,7 @@
 #  limitations under the License.
 #
 import asyncio
+import inspect
 import json
 import logging
 from functools import partial
@@ -299,8 +300,13 @@ async def debug():
         for k in outputs.keys():
             if isinstance(outputs[k], partial):
                 txt = ""
-                for c in outputs[k]():
-                    txt += c
+                iter_obj = outputs[k]()
+                if inspect.isasyncgen(iter_obj):
+                    async for c in iter_obj:
+                        txt += c
+                else:
+                    for c in iter_obj:
+                        txt += c
                 outputs[k] = txt
         return get_json_result(data=outputs)
     except Exception as e:
diff --git a/web/src/pages/agent/constant/index.tsx b/web/src/pages/agent/constant/index.tsx
index 5c25b7fe0..8775c6288 100644
--- a/web/src/pages/agent/constant/index.tsx
+++ b/web/src/pages/agent/constant/index.tsx
@@ -832,6 +832,7 @@ export enum ExportFileType {
   HTML = 'html',
   Markdown = 'md',
   DOCX = 'docx',
+  Excel = 'xlsx',
 }
 
 export enum TypesWithArray {

From fb74c2eb94e52e01fb9062146a4118a6658989ac Mon Sep 17 00:00:00 2001
From: shivamjohri247 <shivamjohri.theking@gmail.com>
Date: Thu, 11 Dec 2025 21:54:13 +0530
Subject: [PATCH 2/8] feat: refine Excel export to capture conversation
 history.

---
 agent/component/message.py | 54 +++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/agent/component/message.py b/agent/component/message.py
index e2428a084..e900c9516 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -232,39 +232,39 @@ class Message(ComponentBase):
             elif self._param.output_format == "xlsx":
                 import pandas as pd
                 from io import BytesIO
+                from datetime import datetime
 
-                if isinstance(content, str):
-                    try:
-                        # Convert markdown to HTML tables to help pandas parse it
-                        html_content = pypandoc.convert_text(content, to="html", format="markdown")
-                        dfs = pd.read_html(html_content)
-                    except Exception as e:
-                        dfs = []
-                    
-                    if not dfs:
-                        df = pd.DataFrame({"Content": [content]})
-                        dfs = [df]
+                # Get all conversation history from canvas
+                history = getattr(self._canvas, 'history', [])
+                
+                # Build rows from conversation history
+                rows = []
+                for role, msg in history:
+                    if isinstance(msg, dict):
+                        msg_content = msg.get("content", str(msg))
+                    else:
+                        msg_content = str(msg) if msg else ""
+                    rows.append({
+                        "Role": role,
+                        "Content": msg_content,
+                    })
+                
+                # Add current message if not already in history
+                if content and (not rows or rows[-1].get("Content") != content):
+                    rows.append({
+                        "Role": "assistant",
+                        "Content": content,
+                    })
+                
+                if rows:
+                    df = pd.DataFrame(rows)
                 else:
-                    # Should not accept file path for Excel generation from agent response usually, 
-                    # but if it does, read it as text
-                    with open(content, "r") as f:
-                        txt_content = f.read()
-                    try:
-                         html_content = pypandoc.convert_text(txt_content, to="html", format="markdown")
-                         dfs = pd.read_html(html_content)
-                    except Exception:
-                        dfs = []
-                    
-                    if not dfs:
-                        df = pd.DataFrame({"Content": [txt_content]})
-                        dfs = [df]
+                    df = pd.DataFrame({"Role": ["assistant"], "Content": [content if content else ""]})
 
                 # Write to Excel
                 excel_io = BytesIO()
                 with pd.ExcelWriter(excel_io, engine='openpyxl') as writer:
-                    for i, df in enumerate(dfs):
-                        sheet_name = f"Sheet{i+1}"
-                        df.to_excel(writer, sheet_name=sheet_name, index=False)
+                    df.to_excel(writer, sheet_name="Conversation", index=False)
                 
                 excel_io.seek(0)
                 binary_content = excel_io.read()

From 4cc984928cbf94ff625aab927dfd649fc2e3908b Mon Sep 17 00:00:00 2001
From: shivamjohri247 <shivamjohri.theking@gmail.com>
Date: Thu, 11 Dec 2025 22:19:18 +0530
Subject: [PATCH 3/8] feat: Add ExcelProcessor component for Excel file
 processing

- New ExcelProcessor component with read/merge/transform/output operations
- Add to frontend Operator enum and initial values
- Supports multiple Excel file inputs, data transformation, and output generation
---
 agent/component/excel_processor.py     | 404 +++++++++++++++++++++++++
 web/src/constants/agent.tsx            |   1 +
 web/src/pages/agent/constant/index.tsx |  25 ++
 3 files changed, 430 insertions(+)
 create mode 100644 agent/component/excel_processor.py

diff --git a/agent/component/excel_processor.py b/agent/component/excel_processor.py
new file mode 100644
index 000000000..256b92780
--- /dev/null
+++ b/agent/component/excel_processor.py
@@ -0,0 +1,404 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+ExcelProcessor Component
+
+A component for reading, processing, and generating Excel files in RAGFlow agents.
+Supports multiple Excel file inputs, data transformation, and Excel output generation.
+"""
+
+import json
+import logging
+import os
+from abc import ABC
+from functools import partial
+from io import BytesIO
+from typing import Any
+
+import pandas as pd
+
+from agent.component.base import ComponentBase, ComponentParamBase
+from api.db.services.file_service import FileService
+from api.utils.api_utils import timeout
+from common import settings
+from common.misc_utils import get_uuid
+
+
+class ExcelProcessorParam(ComponentParamBase):
+    """
+    Define the ExcelProcessor component parameters.
+    """
+    def __init__(self):
+        super().__init__()
+        # Input configuration
+        self.input_files = []  # Variable references to uploaded files
+        self.operation = "read"  # read, merge, transform, output
+        
+        # Processing options
+        self.sheet_selection = "all"  # all, first, or comma-separated sheet names
+        self.merge_strategy = "concat"  # concat, join
+        self.join_on = ""  # Column name for join operations
+        
+        # Transform options (for LLM-guided transformations)
+        self.transform_instructions = ""
+        self.transform_data = ""  # Variable reference to transformation data
+        
+        # Output options
+        self.output_format = "xlsx"  # xlsx, csv
+        self.output_filename = "output"
+        
+        # Component outputs
+        self.outputs = {
+            "data": {
+                "type": "object",
+                "value": {}
+            },
+            "summary": {
+                "type": "str",
+                "value": ""
+            },
+            "markdown": {
+                "type": "str",
+                "value": ""
+            }
+        }
+    
+    def check(self):
+        self.check_valid_value(
+            self.operation, 
+            "[ExcelProcessor] Operation", 
+            ["read", "merge", "transform", "output"]
+        )
+        self.check_valid_value(
+            self.output_format,
+            "[ExcelProcessor] Output format",
+            ["xlsx", "csv"]
+        )
+        return True
+
+
+class ExcelProcessor(ComponentBase, ABC):
+    """
+    Excel processing component for RAGFlow agents.
+    
+    Operations:
+    - read: Parse Excel files into structured data
+    - merge: Combine multiple Excel files
+    - transform: Apply data transformations based on instructions
+    - output: Generate Excel file output
+    """
+    component_name = "ExcelProcessor"
+
+    def get_input_form(self) -> dict[str, dict]:
+        """Define input form for the component."""
+        res = {}
+        for ref in (self._param.input_files or []):
+            for k, o in self.get_input_elements_from_text(ref).items():
+                res[k] = {"name": o.get("name", ""), "type": "file"}
+        if self._param.transform_data:
+            for k, o in self.get_input_elements_from_text(self._param.transform_data).items():
+                res[k] = {"name": o.get("name", ""), "type": "object"}
+        return res
+
+    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60)))
+    def _invoke(self, **kwargs):
+        if self.check_if_canceled("ExcelProcessor processing"):
+            return
+
+        operation = self._param.operation.lower()
+        
+        if operation == "read":
+            self._read_excels()
+        elif operation == "merge":
+            self._merge_excels()
+        elif operation == "transform":
+            self._transform_data()
+        elif operation == "output":
+            self._output_excel()
+        else:
+            self.set_output("summary", f"Unknown operation: {operation}")
+
+    def _get_file_content(self, file_ref: str) -> tuple[bytes, str]:
+        """
+        Get file content from a variable reference.
+        Returns (content_bytes, filename).
+        """
+        value = self._canvas.get_variable_value(file_ref)
+        if value is None:
+            return None, None
+            
+        # Handle different value formats
+        if isinstance(value, dict):
+            # File reference from Begin/UserFillUp component
+            file_id = value.get("id") or value.get("file_id")
+            created_by = value.get("created_by") or self._canvas.get_tenant_id()
+            filename = value.get("name") or value.get("filename", "unknown.xlsx")
+            if file_id:
+                content = FileService.get_blob(created_by, file_id)
+                return content, filename
+        elif isinstance(value, list) and len(value) > 0:
+            # List of file references - return first
+            return self._get_file_content_from_list(value[0])
+        elif isinstance(value, str):
+            # Could be base64 encoded or a path
+            if value.startswith("data:"):
+                import base64
+                # Extract base64 content
+                _, encoded = value.split(",", 1)
+                return base64.b64decode(encoded), "uploaded.xlsx"
+                
+        return None, None
+    
+    def _get_file_content_from_list(self, item) -> tuple[bytes, str]:
+        """Extract file content from a list item."""
+        if isinstance(item, dict):
+            return self._get_file_content(item)
+        return None, None
+
+    def _parse_excel_to_dataframes(self, content: bytes, filename: str) -> dict[str, pd.DataFrame]:
+        """Parse Excel content into a dictionary of DataFrames (one per sheet)."""
+        try:
+            excel_file = BytesIO(content)
+            
+            if filename.lower().endswith(".csv"):
+                df = pd.read_csv(excel_file)
+                return {"Sheet1": df}
+            else:
+                # Read all sheets
+                xlsx = pd.ExcelFile(excel_file, engine='openpyxl')
+                sheet_selection = self._param.sheet_selection
+                
+                if sheet_selection == "all":
+                    sheets_to_read = xlsx.sheet_names
+                elif sheet_selection == "first":
+                    sheets_to_read = [xlsx.sheet_names[0]] if xlsx.sheet_names else []
+                else:
+                    # Comma-separated sheet names
+                    requested = [s.strip() for s in sheet_selection.split(",")]
+                    sheets_to_read = [s for s in requested if s in xlsx.sheet_names]
+                
+                dfs = {}
+                for sheet in sheets_to_read:
+                    dfs[sheet] = pd.read_excel(xlsx, sheet_name=sheet)
+                return dfs
+                
+        except Exception as e:
+            logging.error(f"Error parsing Excel file {filename}: {e}")
+            return {}
+
+    def _read_excels(self):
+        """Read and parse Excel files into structured data."""
+        all_data = {}
+        summaries = []
+        markdown_parts = []
+        
+        for file_ref in (self._param.input_files or []):
+            if self.check_if_canceled("ExcelProcessor reading"):
+                return
+                
+            # Get variable value
+            value = self._canvas.get_variable_value(file_ref)
+            self.set_input_value(file_ref, str(value)[:200] if value else "")
+            
+            if value is None:
+                continue
+            
+            # Handle file content
+            content, filename = self._get_file_content(file_ref)
+            if content is None:
+                continue
+                
+            # Parse Excel
+            dfs = self._parse_excel_to_dataframes(content, filename)
+            
+            for sheet_name, df in dfs.items():
+                key = f"{filename}_{sheet_name}" if len(dfs) > 1 else filename
+                all_data[key] = df.to_dict(orient="records")
+                
+                # Build summary
+                summaries.append(f"**{key}**: {len(df)} rows, {len(df.columns)} columns ({', '.join(df.columns.tolist()[:5])}{'...' if len(df.columns) > 5 else ''})")
+                
+                # Build markdown table
+                markdown_parts.append(f"### {key}\n\n{df.head(10).to_markdown(index=False)}\n")
+        
+        # Set outputs
+        self.set_output("data", all_data)
+        self.set_output("summary", "\n".join(summaries) if summaries else "No Excel files found")
+        self.set_output("markdown", "\n\n".join(markdown_parts) if markdown_parts else "No data")
+
+    def _merge_excels(self):
+        """Merge multiple Excel files/sheets into one."""
+        all_dfs = []
+        
+        for file_ref in (self._param.input_files or []):
+            if self.check_if_canceled("ExcelProcessor merging"):
+                return
+                
+            value = self._canvas.get_variable_value(file_ref)
+            self.set_input_value(file_ref, str(value)[:200] if value else "")
+            
+            if value is None:
+                continue
+                
+            content, filename = self._get_file_content(file_ref)
+            if content is None:
+                continue
+                
+            dfs = self._parse_excel_to_dataframes(content, filename)
+            all_dfs.extend(dfs.values())
+        
+        if not all_dfs:
+            self.set_output("data", {})
+            self.set_output("summary", "No data to merge")
+            return
+        
+        # Merge strategy
+        if self._param.merge_strategy == "concat":
+            merged_df = pd.concat(all_dfs, ignore_index=True)
+        elif self._param.merge_strategy == "join" and self._param.join_on:
+            # Join on specified column
+            merged_df = all_dfs[0]
+            for df in all_dfs[1:]:
+                merged_df = merged_df.merge(df, on=self._param.join_on, how="outer")
+        else:
+            merged_df = pd.concat(all_dfs, ignore_index=True)
+        
+        self.set_output("data", {"merged": merged_df.to_dict(orient="records")})
+        self.set_output("summary", f"Merged {len(all_dfs)} sources into {len(merged_df)} rows, {len(merged_df.columns)} columns")
+        self.set_output("markdown", merged_df.head(20).to_markdown(index=False))
+
+    def _transform_data(self):
+        """Apply transformations to data based on instructions or input data."""
+        # Get the data to transform
+        transform_ref = self._param.transform_data
+        if not transform_ref:
+            self.set_output("summary", "No transform data reference provided")
+            return
+            
+        data = self._canvas.get_variable_value(transform_ref)
+        self.set_input_value(transform_ref, str(data)[:300] if data else "")
+        
+        if data is None:
+            self.set_output("summary", "Transform data is empty")
+            return
+        
+        # Convert to DataFrame
+        if isinstance(data, dict):
+            # Could be {"sheet": [rows]} format
+            if all(isinstance(v, list) for v in data.values()):
+                # Multiple sheets
+                all_markdown = []
+                for sheet_name, rows in data.items():
+                    df = pd.DataFrame(rows)
+                    all_markdown.append(f"### {sheet_name}\n\n{df.to_markdown(index=False)}")
+                self.set_output("data", data)
+                self.set_output("markdown", "\n\n".join(all_markdown))
+            else:
+                df = pd.DataFrame([data])
+                self.set_output("data", df.to_dict(orient="records"))
+                self.set_output("markdown", df.to_markdown(index=False))
+        elif isinstance(data, list):
+            df = pd.DataFrame(data)
+            self.set_output("data", df.to_dict(orient="records"))
+            self.set_output("markdown", df.to_markdown(index=False))
+        else:
+            self.set_output("data", {"raw": str(data)})
+            self.set_output("markdown", str(data))
+        
+        self.set_output("summary", f"Transformed data ready for processing")
+
+    def _output_excel(self):
+        """Generate Excel file output from data."""
+        # Get data from transform_data reference
+        transform_ref = self._param.transform_data
+        if not transform_ref:
+            self.set_output("summary", "No data reference for output")
+            return
+            
+        data = self._canvas.get_variable_value(transform_ref)
+        self.set_input_value(transform_ref, str(data)[:300] if data else "")
+        
+        if data is None:
+            self.set_output("summary", "No data to output")
+            return
+        
+        try:
+            # Prepare DataFrames
+            if isinstance(data, dict):
+                if all(isinstance(v, list) for v in data.values()):
+                    # Multi-sheet format
+                    dfs = {k: pd.DataFrame(v) for k, v in data.items()}
+                else:
+                    dfs = {"Sheet1": pd.DataFrame([data])}
+            elif isinstance(data, list):
+                dfs = {"Sheet1": pd.DataFrame(data)}
+            else:
+                self.set_output("summary", "Invalid data format for Excel output")
+                return
+            
+            # Generate output
+            doc_id = get_uuid()
+            
+            if self._param.output_format == "csv":
+                # For CSV, only output first sheet
+                first_df = list(dfs.values())[0]
+                binary_content = first_df.to_csv(index=False).encode("utf-8")
+                filename = f"{self._param.output_filename}.csv"
+            else:
+                # Excel output
+                excel_io = BytesIO()
+                with pd.ExcelWriter(excel_io, engine='openpyxl') as writer:
+                    for sheet_name, df in dfs.items():
+                        # Sanitize sheet name (max 31 chars, no special chars)
+                        safe_name = sheet_name[:31].replace("/", "_").replace("\\", "_")
+                        df.to_excel(writer, sheet_name=safe_name, index=False)
+                excel_io.seek(0)
+                binary_content = excel_io.read()
+                filename = f"{self._param.output_filename}.xlsx"
+            
+            # Store file
+            settings.STORAGE_IMPL.put(self._canvas._tenant_id, doc_id, binary_content)
+            
+            # Set attachment output
+            self.set_output("attachment", {
+                "doc_id": doc_id,
+                "format": self._param.output_format,
+                "file_name": filename
+            })
+            
+            total_rows = sum(len(df) for df in dfs.values())
+            self.set_output("summary", f"Generated {filename} with {len(dfs)} sheet(s), {total_rows} total rows")
+            self.set_output("data", {k: v.to_dict(orient="records") for k, v in dfs.items()})
+            
+            logging.info(f"ExcelProcessor: Generated {filename} as {doc_id}")
+            
+        except Exception as e:
+            logging.error(f"ExcelProcessor output error: {e}")
+            self.set_output("summary", f"Error generating output: {str(e)}")
+
+    def thoughts(self) -> str:
+        """Return component thoughts for UI display."""
+        op = self._param.operation
+        if op == "read":
+            return "Reading Excel files..."
+        elif op == "merge":
+            return "Merging Excel data..."
+        elif op == "transform":
+            return "Transforming data..."
+        elif op == "output":
+            return "Generating Excel output..."
+        return "Processing Excel..."
diff --git a/web/src/constants/agent.tsx b/web/src/constants/agent.tsx
index 2f51e24f6..7d1079d7a 100644
--- a/web/src/constants/agent.tsx
+++ b/web/src/constants/agent.tsx
@@ -115,6 +115,7 @@ export enum Operator {
   Loop = 'Loop',
   LoopStart = 'LoopItem',
   ExitLoop = 'ExitLoop',
+  ExcelProcessor = 'ExcelProcessor',
 }
 
 export enum ComparisonOperator {
diff --git a/web/src/pages/agent/constant/index.tsx b/web/src/pages/agent/constant/index.tsx
index 8775c6288..d22b2532e 100644
--- a/web/src/pages/agent/constant/index.tsx
+++ b/web/src/pages/agent/constant/index.tsx
@@ -135,6 +135,31 @@ export const initialMessageValues = {
   content: [''],
 };
 
+export const initialExcelProcessorValues = {
+  input_files: [],
+  operation: 'read',
+  sheet_selection: 'all',
+  merge_strategy: 'concat',
+  join_on: '',
+  transform_data: '',
+  output_format: 'xlsx',
+  output_filename: 'output',
+  outputs: {
+    data: {
+      type: 'object',
+      value: {},
+    },
+    summary: {
+      type: 'string',
+      value: '',
+    },
+    markdown: {
+      type: 'string',
+      value: '',
+    },
+  },
+};
+
 export const initialDuckValues = {
   top_n: 10,
   channel: Channel.Text,

From 9ab745e1e97a5600f503fd1d04ef519adb9b52ee Mon Sep 17 00:00:00 2001
From: shivamjohri247 <shivamjohri.theking@gmail.com>
Date: Fri, 12 Dec 2025 00:09:39 +0530
Subject: [PATCH 4/8] feat: Add markdown table parsing for XLSX output.

---
 agent/component/message.py | 79 +++++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 26 deletions(-)

diff --git a/agent/component/message.py b/agent/component/message.py
index e900c9516..49161681d 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -232,39 +232,66 @@ class Message(ComponentBase):
             elif self._param.output_format == "xlsx":
                 import pandas as pd
                 from io import BytesIO
-                from datetime import datetime
+                import re
 
-                # Get all conversation history from canvas
-                history = getattr(self._canvas, 'history', [])
+                # Try to parse markdown table from the content
+                df = None
                 
-                # Build rows from conversation history
-                rows = []
-                for role, msg in history:
-                    if isinstance(msg, dict):
-                        msg_content = msg.get("content", str(msg))
-                    else:
-                        msg_content = str(msg) if msg else ""
-                    rows.append({
-                        "Role": role,
-                        "Content": msg_content,
-                    })
+                if isinstance(content, str):
+                    # Extract markdown table from content
+                    # Pattern: lines starting with | and containing |
+                    lines = content.strip().split('\n')
+                    table_lines = []
+                    in_table = False
+                    
+                    for line in lines:
+                        line = line.strip()
+                        if line.startswith('|') and '|' in line[1:]:
+                            in_table = True
+                            # Skip separator line (|---|---| or |:---:|:---:| etc.)
+                            # Check if line only contains |, -, :, and whitespace
+                            cleaned = line.replace(' ', '').replace('|', '').replace('-', '').replace(':', '')
+                            if cleaned == '':
+                                continue  # Skip separator line
+                            table_lines.append(line)
+                        elif in_table and not line.startswith('|'):
+                            # End of table
+                            break
+                    
+                    if table_lines:
+                        # Parse the markdown table
+                        rows = []
+                        headers = None
+                        
+                        for line in table_lines:
+                            # Split by | and clean up
+                            cells = [cell.strip() for cell in line.split('|')]
+                            # Remove empty first and last elements from split
+                            cells = [c for c in cells if c]
+                            
+                            if headers is None:
+                                headers = cells
+                            else:
+                                rows.append(cells)
+                        
+                        if headers and rows:
+                            # Ensure all rows have same number of columns as headers
+                            normalized_rows = []
+                            for row in rows:
+                                while len(row) < len(headers):
+                                    row.append('')
+                                normalized_rows.append(row[:len(headers)])
+                            
+                            df = pd.DataFrame(normalized_rows, columns=headers)
                 
-                # Add current message if not already in history
-                if content and (not rows or rows[-1].get("Content") != content):
-                    rows.append({
-                        "Role": "assistant",
-                        "Content": content,
-                    })
-                
-                if rows:
-                    df = pd.DataFrame(rows)
-                else:
-                    df = pd.DataFrame({"Role": ["assistant"], "Content": [content if content else ""]})
+                # Fallback: if no table found, create single column with content
+                if df is None or df.empty:
+                    df = pd.DataFrame({"Content": [content if content else ""]})
 
                 # Write to Excel
                 excel_io = BytesIO()
                 with pd.ExcelWriter(excel_io, engine='openpyxl') as writer:
-                    df.to_excel(writer, sheet_name="Conversation", index=False)
+                    df.to_excel(writer, sheet_name="Data", index=False)
                 
                 excel_io.seek(0)
                 binary_content = excel_io.read()

From 7f3daf86ce9c2233fac4b12d58893e5be7b560f5 Mon Sep 17 00:00:00 2001
From: shivamjohri247 <shivamjohri.theking@gmail.com>
Date: Fri, 12 Dec 2025 11:58:04 +0530
Subject: [PATCH 5/8] Fix lint errors in ExcelProcessor and Message components

---
 agent/component/excel_processor.py | 5 +----
 agent/component/message.py         | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/agent/component/excel_processor.py b/agent/component/excel_processor.py
index 256b92780..65b3a9bd2 100644
--- a/agent/component/excel_processor.py
+++ b/agent/component/excel_processor.py
@@ -21,13 +21,10 @@ A component for reading, processing, and generating Excel files in RAGFlow agent
 Supports multiple Excel file inputs, data transformation, and Excel output generation.
 """
 
-import json
 import logging
 import os
 from abc import ABC
-from functools import partial
 from io import BytesIO
-from typing import Any
 
 import pandas as pd
 
@@ -319,7 +316,7 @@ class ExcelProcessor(ComponentBase, ABC):
             self.set_output("data", {"raw": str(data)})
             self.set_output("markdown", str(data))
         
-        self.set_output("summary", f"Transformed data ready for processing")
+        self.set_output("summary", "Transformed data ready for processing")
 
     def _output_excel(self):
         """Generate Excel file output from data."""
diff --git a/agent/component/message.py b/agent/component/message.py
index 49161681d..848c7921e 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -232,7 +232,7 @@ class Message(ComponentBase):
             elif self._param.output_format == "xlsx":
                 import pandas as pd
                 from io import BytesIO
-                import re
+
 
                 # Try to parse markdown table from the content
                 df = None

From 41cdf6ad0aadf4c642433910d9354a2c45173685 Mon Sep 17 00:00:00 2001
From: shivamjohri247 <shivamjohri.theking@gmail.com>
Date: Mon, 15 Dec 2025 17:05:47 +0530
Subject: [PATCH 6/8] feat: support multiple tables in Excel export

- Extract ALL markdown tables from LLM response, not just the first one
- Write each table to a separate sheet in the XLSX file
- Auto-generate sheet names from table titles (e.g., 'Table 1: Military Power...')
- Sanitize sheet names for Excel compatibility (max 31 chars, remove special chars)
- Handle duplicate sheet names with numbered suffixes
- Add debug logging for troubleshooting table parsing
---
 agent/component/message.py | 178 +++++++++++++++++++++++++++----------
 1 file changed, 133 insertions(+), 45 deletions(-)

diff --git a/agent/component/message.py b/agent/component/message.py
index 848c7921e..e1bb01aac 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -202,6 +202,48 @@ class Message(ComponentBase):
     def thoughts(self) -> str:
         return ""
 
+    def _parse_markdown_table_lines(self, table_lines: list) -> "pd.DataFrame":
+        """
+        Parse a list of markdown table lines into a pandas DataFrame.
+        
+        Args:
+            table_lines: List of strings, each representing a row in the markdown table
+                        (excluding separator lines like |---|---|)
+        
+        Returns:
+            pandas DataFrame with the table data, or None if parsing fails
+        """
+        import pandas as pd
+        
+        if not table_lines:
+            return None
+        
+        rows = []
+        headers = None
+        
+        for line in table_lines:
+            # Split by | and clean up
+            cells = [cell.strip() for cell in line.split('|')]
+            # Remove empty first and last elements from split (caused by leading/trailing |)
+            cells = [c for c in cells if c]
+            
+            if headers is None:
+                headers = cells
+            else:
+                rows.append(cells)
+        
+        if headers and rows:
+            # Ensure all rows have same number of columns as headers
+            normalized_rows = []
+            for row in rows:
+                while len(row) < len(headers):
+                    row.append('')
+                normalized_rows.append(row[:len(headers)])
+            
+            return pd.DataFrame(normalized_rows, columns=headers)
+        
+        return None
+
     def _convert_content(self, content):
         if not self._param.output_format:
             return
@@ -233,68 +275,114 @@ class Message(ComponentBase):
                 import pandas as pd
                 from io import BytesIO
 
-
-                # Try to parse markdown table from the content
-                df = None
+                # Debug: log the content being parsed
+                logging.info(f"XLSX Parser: Content length={len(content) if content else 0}, first 500 chars: {content[:500] if content else 'None'}")
+                
+                # Try to parse ALL markdown tables from the content
+                # Each table will be written to a separate sheet
+                tables = []  # List of (sheet_name, dataframe)
                 
                 if isinstance(content, str):
-                    # Extract markdown table from content
-                    # Pattern: lines starting with | and containing |
                     lines = content.strip().split('\n')
-                    table_lines = []
+                    logging.info(f"XLSX Parser: Total lines={len(lines)}, lines starting with '|': {sum(1 for l in lines if l.strip().startswith('|'))}")
+                    current_table_lines = []
+                    current_table_title = None
+                    pending_title = None
                     in_table = False
+                    table_count = 0
                     
-                    for line in lines:
-                        line = line.strip()
-                        if line.startswith('|') and '|' in line[1:]:
-                            in_table = True
-                            # Skip separator line (|---|---| or |:---:|:---:| etc.)
-                            # Check if line only contains |, -, :, and whitespace
-                            cleaned = line.replace(' ', '').replace('|', '').replace('-', '').replace(':', '')
+                    for i, line in enumerate(lines):
+                        stripped = line.strip()
+                        
+                        # Check for potential table title (lines before a table)
+                        # Look for patterns like "Table 1:", "## Table", or markdown headers
+                        if not in_table and stripped and not stripped.startswith('|'):
+                            # Check if this could be a table title
+                            lower_stripped = stripped.lower()
+                            if (lower_stripped.startswith('table') or 
+                                stripped.startswith('#') or
+                                ':' in stripped):
+                                pending_title = stripped.lstrip('#').strip()
+                        
+                        if stripped.startswith('|') and '|' in stripped[1:]:
+                            # Check if this is a separator line (|---|---|)
+                            cleaned = stripped.replace(' ', '').replace('|', '').replace('-', '').replace(':', '')
                             if cleaned == '':
                                 continue  # Skip separator line
-                            table_lines.append(line)
-                        elif in_table and not line.startswith('|'):
-                            # End of table
-                            break
+                            
+                            if not in_table:
+                                # Starting a new table
+                                in_table = True
+                                current_table_lines = []
+                                current_table_title = pending_title
+                                pending_title = None
+                            
+                            current_table_lines.append(stripped)
+                        
+                        elif in_table and not stripped.startswith('|'):
+                            # End of current table - save it
+                            if current_table_lines:
+                                df = self._parse_markdown_table_lines(current_table_lines)
+                                if df is not None and not df.empty:
+                                    table_count += 1
+                                    # Generate sheet name
+                                    if current_table_title:
+                                        # Clean and truncate title for sheet name
+                                        sheet_name = current_table_title[:31]
+                                        sheet_name = sheet_name.replace('/', '_').replace('\\', '_').replace('*', '').replace('?', '').replace('[', '').replace(']', '')
+                                    else:
+                                        sheet_name = f"Table_{table_count}"
+                                    tables.append((sheet_name, df))
+                            
+                            # Reset for next table
+                            in_table = False
+                            current_table_lines = []
+                            current_table_title = None
+                            
+                            # Check if this line could be a title for the next table
+                            if stripped:
+                                lower_stripped = stripped.lower()
+                                if (lower_stripped.startswith('table') or 
+                                    stripped.startswith('#') or
+                                    ':' in stripped):
+                                    pending_title = stripped.lstrip('#').strip()
                     
-                    if table_lines:
-                        # Parse the markdown table
-                        rows = []
-                        headers = None
-                        
-                        for line in table_lines:
-                            # Split by | and clean up
-                            cells = [cell.strip() for cell in line.split('|')]
-                            # Remove empty first and last elements from split
-                            cells = [c for c in cells if c]
-                            
-                            if headers is None:
-                                headers = cells
+                    # Don't forget the last table if content ends with a table
+                    if in_table and current_table_lines:
+                        df = self._parse_markdown_table_lines(current_table_lines)
+                        if df is not None and not df.empty:
+                            table_count += 1
+                            if current_table_title:
+                                sheet_name = current_table_title[:31]
+                                sheet_name = sheet_name.replace('/', '_').replace('\\', '_').replace('*', '').replace('?', '').replace('[', '').replace(']', '')
                             else:
-                                rows.append(cells)
-                        
-                        if headers and rows:
-                            # Ensure all rows have same number of columns as headers
-                            normalized_rows = []
-                            for row in rows:
-                                while len(row) < len(headers):
-                                    row.append('')
-                                normalized_rows.append(row[:len(headers)])
-                            
-                            df = pd.DataFrame(normalized_rows, columns=headers)
+                                sheet_name = f"Table_{table_count}"
+                            tables.append((sheet_name, df))
                 
-                # Fallback: if no table found, create single column with content
-                if df is None or df.empty:
+                # Fallback: if no tables found, create single sheet with content
+                if not tables:
                     df = pd.DataFrame({"Content": [content if content else ""]})
+                    tables = [("Data", df)]
 
-                # Write to Excel
+                # Write all tables to Excel, each in a separate sheet
                 excel_io = BytesIO()
                 with pd.ExcelWriter(excel_io, engine='openpyxl') as writer:
-                    df.to_excel(writer, sheet_name="Data", index=False)
+                    used_names = set()
+                    for sheet_name, df in tables:
+                        # Ensure unique sheet names
+                        original_name = sheet_name
+                        counter = 1
+                        while sheet_name in used_names:
+                            suffix = f"_{counter}"
+                            sheet_name = original_name[:31-len(suffix)] + suffix
+                            counter += 1
+                        used_names.add(sheet_name)
+                        df.to_excel(writer, sheet_name=sheet_name, index=False)
                 
                 excel_io.seek(0)
                 binary_content = excel_io.read()
+                
+                logging.info(f"Generated Excel with {len(tables)} sheet(s): {[t[0] for t in tables]}")
 
             else:  # pdf, docx
                 with tempfile.NamedTemporaryFile(suffix=f".{self._param.output_format}", delete=False) as tmp:

From 989244ce2aa861b413b242c9fb36e2cae9a29130 Mon Sep 17 00:00:00 2001
From: shivamjohri247 <shivamjohri.theking@gmail.com>
Date: Mon, 15 Dec 2025 17:11:15 +0530
Subject: [PATCH 7/8] fix: resolve ruff lint errors

- Remove pd.DataFrame type hint (pd not imported at module level)
- Rename ambiguous variable 'l' to 'line'
---
 agent/component/message.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/agent/component/message.py b/agent/component/message.py
index e1bb01aac..5f046e388 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -202,7 +202,7 @@ class Message(ComponentBase):
     def thoughts(self) -> str:
         return ""
 
-    def _parse_markdown_table_lines(self, table_lines: list) -> "pd.DataFrame":
+    def _parse_markdown_table_lines(self, table_lines: list):
         """
         Parse a list of markdown table lines into a pandas DataFrame.
         
@@ -284,7 +284,7 @@ class Message(ComponentBase):
                 
                 if isinstance(content, str):
                     lines = content.strip().split('\n')
-                    logging.info(f"XLSX Parser: Total lines={len(lines)}, lines starting with '|': {sum(1 for l in lines if l.strip().startswith('|'))}")
+                    logging.info(f"XLSX Parser: Total lines={len(lines)}, lines starting with '|': {sum(1 for line in lines if line.strip().startswith('|'))}")
                     current_table_lines = []
                     current_table_title = None
                     pending_title = None

From 264e9894c37ed2262343cd06d3e446a4ba80433e Mon Sep 17 00:00:00 2001
From: shivamjohri247 <shivamjohri.theking@gmail.com>
Date: Mon, 15 Dec 2025 17:55:34 +0530
Subject: [PATCH 8/8] fix: remove colon from Excel sheet names

Excel doesn't allow colons (:) in sheet names, causing export to fail with
'Invalid character : found in sheet title' error
---
 agent/component/message.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/agent/component/message.py b/agent/component/message.py
index 5f046e388..164716575 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -329,7 +329,7 @@ class Message(ComponentBase):
                                     if current_table_title:
                                         # Clean and truncate title for sheet name
                                         sheet_name = current_table_title[:31]
-                                        sheet_name = sheet_name.replace('/', '_').replace('\\', '_').replace('*', '').replace('?', '').replace('[', '').replace(']', '')
+                                        sheet_name = sheet_name.replace('/', '_').replace('\\', '_').replace('*', '').replace('?', '').replace('[', '').replace(']', '').replace(':', '')
                                     else:
                                         sheet_name = f"Table_{table_count}"
                                     tables.append((sheet_name, df))
@@ -354,7 +354,7 @@ class Message(ComponentBase):
                             table_count += 1
                             if current_table_title:
                                 sheet_name = current_table_title[:31]
-                                sheet_name = sheet_name.replace('/', '_').replace('\\', '_').replace('*', '').replace('?', '').replace('[', '').replace(']', '')
+                                sheet_name = sheet_name.replace('/', '_').replace('\\', '_').replace('*', '').replace('?', '').replace('[', '').replace(']', '').replace(':', '')
                             else:
                                 sheet_name = f"Table_{table_count}"
                             tables.append((sheet_name, df))