From 2b160163120bc8310babbb84887d07c8089ac7c7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 03:48:36 +0800 Subject: [PATCH] Optimize XLSX extraction to avoid storing all rows in memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Remove intermediate row storage • Use iterator twice instead of list() • Preserve column alignment logic • Reduce memory footprint • Maintain same output format --- lightrag/api/routers/document_routes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 14e03f5f..a4efcacd 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1133,12 +1133,10 @@ def _extract_xlsx(file_bytes: bytes) -> str: safe_title = escape_sheet_title(sheet.title) content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}") - # Two-pass approach to preserve column alignment: - # Pass 1: Determine the maximum column width for this sheet + # Two-pass approach to preserve column alignment without storing rows in memory: + # Pass 1: Scan to determine the maximum column width (memory-efficient) max_columns = 0 - all_rows = list(sheet.iter_rows(values_only=True)) - - for row in all_rows: + for row in sheet.iter_rows(values_only=True): last_nonempty_idx = -1 for idx, cell in enumerate(row): # Check if cell has meaningful content (not None or empty string) @@ -1149,7 +1147,7 @@ def _extract_xlsx(file_bytes: bytes) -> str: max_columns = max(max_columns, last_nonempty_idx + 1) # Pass 2: Extract rows with consistent width to preserve column alignment - for row in all_rows: + for row in sheet.iter_rows(values_only=True): row_parts = [] # Build row up to max_columns width