From 0244699d81a551cf5c5ba3a936709eb54fbd1968 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 04:02:39 +0800 Subject: [PATCH] Optimize XLSX extraction by using sheet.max_column instead of two-pass scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Remove two-pass row scanning approach • Use built-in sheet.max_column property • Simplify column width detection logic • Improve memory efficiency • Preserve column alignment --- lightrag/api/routers/document_routes.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index a4efcacd..5775c4da 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1133,20 +1133,10 @@ def _extract_xlsx(file_bytes: bytes) -> str: safe_title = escape_sheet_title(sheet.title) content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}") - # Two-pass approach to preserve column alignment without storing rows in memory: - # Pass 1: Scan to determine the maximum column width (memory-efficient) - max_columns = 0 - for row in sheet.iter_rows(values_only=True): - last_nonempty_idx = -1 - for idx, cell in enumerate(row): - # Check if cell has meaningful content (not None or empty string) - if cell is not None and str(cell).strip(): - last_nonempty_idx = idx + # Use sheet.max_column to get the maximum column width directly + max_columns = sheet.max_column if sheet.max_column else 0 - if last_nonempty_idx >= 0: - max_columns = max(max_columns, last_nonempty_idx + 1) - - # Pass 2: Extract rows with consistent width to preserve column alignment + # Extract rows with consistent width to preserve column alignment for row in sheet.iter_rows(values_only=True): row_parts = []