From a3fb2446312792f16e98a3293644ad10a083c676 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20MANSUY?= Date: Thu, 4 Dec 2025 19:15:05 +0800 Subject: [PATCH] cherry-pick 2b160163 --- lightrag/api/routers/document_routes.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 5775c4da..a4efcacd 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1133,10 +1133,20 @@ def _extract_xlsx(file_bytes: bytes) -> str: safe_title = escape_sheet_title(sheet.title) content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}") - # Use sheet.max_column to get the maximum column width directly - max_columns = sheet.max_column if sheet.max_column else 0 + # Two-pass approach to preserve column alignment without storing rows in memory: + # Pass 1: Scan to determine the maximum column width (memory-efficient) + max_columns = 0 + for row in sheet.iter_rows(values_only=True): + last_nonempty_idx = -1 + for idx, cell in enumerate(row): + # Check if cell has meaningful content (not None or empty string) + if cell is not None and str(cell).strip(): + last_nonempty_idx = idx - # Extract rows with consistent width to preserve column alignment + if last_nonempty_idx >= 0: + max_columns = max(max_columns, last_nonempty_idx + 1) + + # Pass 2: Extract rows with consistent width to preserve column alignment for row in sheet.iter_rows(values_only=True): row_parts = []