From ef659a1e09ed0f5b5e650728b182ec49f93c4c38 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 03:34:22 +0800
Subject: [PATCH] Preserve column alignment in XLSX extraction with two-pass
 processing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Two-pass approach for consistent width
• Maintain tabular structure integrity
• Determine max columns first pass
• Extract with alignment second pass
• Prevent column misalignment issues
---
 lightrag/api/routers/document_routes.py | 42 ++++++++++++++++---------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 1e770520..14e03f5f 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1061,9 +1061,9 @@ def _extract_xlsx(file_bytes: bytes) -> str:
     Features:
     - Each sheet is wrapped with '====================' separators for visual distinction
     - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
-    - Trailing empty columns are trimmed per row to reduce token waste
+    - Column alignment is preserved across all rows to maintain tabular structure
     - Empty rows are preserved as blank lines to maintain row structure
-    - Single-pass optimization for better performance on large spreadsheets
+    - Two-pass processing: determines max column width, then extracts with consistent alignment
 
     Args:
         file_bytes: XLSX file content as bytes
@@ -1133,25 +1133,39 @@ def _extract_xlsx(file_bytes: bytes) -> str:
         safe_title = escape_sheet_title(sheet.title)
         content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
 
-        # Single-pass optimization: escape and trim in one iteration
-        for row in sheet.iter_rows(values_only=True):
-            row_parts = []
-            last_nonempty_idx = -1
+        # Two-pass approach to preserve column alignment:
+        # Pass 1: Determine the maximum column width for this sheet
+        max_columns = 0
+        all_rows = list(sheet.iter_rows(values_only=True))
 
-            # Build escaped row while tracking last non-empty cell position
+        for row in all_rows:
+            last_nonempty_idx = -1
             for idx, cell in enumerate(row):
-                escaped = escape_cell(cell)
-                row_parts.append(escaped)
-                if escaped != "":
+                # Check if cell has meaningful content (not None or empty string)
+                if cell is not None and str(cell).strip():
                     last_nonempty_idx = idx
 
-            # Handle completely empty rows vs rows with data
-            if last_nonempty_idx == -1:
+            if last_nonempty_idx >= 0:
+                max_columns = max(max_columns, last_nonempty_idx + 1)
+
+        # Pass 2: Extract rows with consistent width to preserve column alignment
+        for row in all_rows:
+            row_parts = []
+
+            # Build row up to max_columns width
+            for idx in range(max_columns):
+                if idx < len(row):
+                    row_parts.append(escape_cell(row[idx]))
+                else:
+                    row_parts.append("")  # Pad short rows
+
+            # Check if row is completely empty
+            if all(part == "" for part in row_parts):
                 # Preserve empty rows as blank lines (maintains row structure)
                 content_parts.append("")
             else:
-                # Only join up to last non-empty cell (trim trailing empties)
-                content_parts.append("\t".join(row_parts[: last_nonempty_idx + 1]))
+                # Join all columns to maintain consistent column count
+                content_parts.append("\t".join(row_parts))
 
     # Final separator for symmetry (makes parsing easier)
     content_parts.append(sheet_separator)