From 2b160163120bc8310babbb84887d07c8089ac7c7 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 03:48:36 +0800
Subject: [PATCH] Optimize XLSX extraction to avoid storing all rows in memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Remove the intermediate all_rows list that held every row of a sheet
• Call sheet.iter_rows() once per pass instead of materializing rows with list()
• Preserve column alignment logic
• Reduce memory footprint
• Maintain same output format
---
 lightrag/api/routers/document_routes.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 14e03f5f..a4efcacd 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1133,12 +1133,10 @@ def _extract_xlsx(file_bytes: bytes) -> str:
         safe_title = escape_sheet_title(sheet.title)
         content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
 
-        # Two-pass approach to preserve column alignment:
-        # Pass 1: Determine the maximum column width for this sheet
+        # Two-pass approach to preserve column alignment without storing rows in memory:
+        # Pass 1: Scan to determine the maximum column width (memory-efficient)
         max_columns = 0
-        all_rows = list(sheet.iter_rows(values_only=True))
-
-        for row in all_rows:
+        for row in sheet.iter_rows(values_only=True):
             last_nonempty_idx = -1
             for idx, cell in enumerate(row):
                 # Check if cell has meaningful content (not None or empty string)
@@ -1149,7 +1147,7 @@ def _extract_xlsx(file_bytes: bytes) -> str:
                 max_columns = max(max_columns, last_nonempty_idx + 1)
 
         # Pass 2: Extract rows with consistent width to preserve column alignment
-        for row in all_rows:
+        for row in sheet.iter_rows(values_only=True):
             row_parts = []
 
             # Build row up to max_columns width