From ef659a1e09ed0f5b5e650728b182ec49f93c4c38 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 03:34:22 +0800 Subject: [PATCH] Preserve column alignment in XLSX extraction with two-pass processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Two-pass approach for consistent column count • Maintain tabular structure integrity • Determine max columns in first pass • Extract with alignment in second pass • Prevent column misalignment issues --- lightrag/api/routers/document_routes.py | 42 ++++++++++++++++--------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 1e770520..14e03f5f 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1061,9 +1061,9 @@ def _extract_xlsx(file_bytes: bytes) -> str: Features: - Each sheet is wrapped with '====================' separators for visual distinction - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption - - Trailing empty columns are trimmed per row to reduce token waste + - Column alignment is preserved across all rows to maintain tabular structure - Empty rows are preserved as blank lines to maintain row structure - - Single-pass optimization for better performance on large spreadsheets + - Two-pass processing: determines max column count, then extracts with consistent alignment Args: file_bytes: XLSX file content as bytes @@ -1133,25 +1133,39 @@ def _extract_xlsx(file_bytes: bytes) -> str: safe_title = escape_sheet_title(sheet.title) content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}") - # Single-pass optimization: escape and trim in one iteration - for row in sheet.iter_rows(values_only=True): - row_parts = [] - last_nonempty_idx = -1 + # Two-pass approach to preserve column alignment: + # Pass 1: Determine the maximum column count for this sheet + max_columns = 0 
+ all_rows = list(sheet.iter_rows(values_only=True)) - # Build escaped row while tracking last non-empty cell position + for row in all_rows: + last_nonempty_idx = -1 for idx, cell in enumerate(row): - escaped = escape_cell(cell) - row_parts.append(escaped) - if escaped != "": + # Check if cell has meaningful content (not None or empty string) + if cell is not None and str(cell).strip(): last_nonempty_idx = idx - # Handle completely empty rows vs rows with data - if last_nonempty_idx == -1: + if last_nonempty_idx >= 0: + max_columns = max(max_columns, last_nonempty_idx + 1) + + # Pass 2: Extract rows with consistent width to preserve column alignment + for row in all_rows: + row_parts = [] + + # Build row up to max_columns width + for idx in range(max_columns): + if idx < len(row): + row_parts.append(escape_cell(row[idx])) + else: + row_parts.append("") # Pad short rows + + # Check if row is completely empty + if all(part == "" for part in row_parts): # Preserve empty rows as blank lines (maintains row structure) content_parts.append("") else: - # Only join up to last non-empty cell (trim trailing empties) - content_parts.append("\t".join(row_parts[: last_nonempty_idx + 1])) + # Join all columns to maintain consistent column count + content_parts.append("\t".join(row_parts)) # Final separator for symmetry (makes parsing easier) content_parts.append(sheet_separator)