Optimize XLSX extraction to avoid storing all rows in memory

• Remove intermediate row storage
• Use iterator twice instead of list()
• Preserve column alignment logic
• Reduce memory footprint
• Maintain same output format
2025-11-19 03:48:36 +08:00 · 2025-11-19 03:48:36 +08:00 · 2b16016312
commit 2b16016312
parent ef659a1e09
1 changed file with 4 additions and 6 deletions
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1133,12 +1133,10 @@ def _extract_xlsx(file_bytes: bytes) -> str:
        safe_title = escape_sheet_title(sheet.title)
        content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")

-        # Two-pass approach to preserve column alignment:
-        # Pass 1: Determine the maximum column width for this sheet
+        # Two-pass approach to preserve column alignment without storing rows in memory:
+        # Pass 1: Scan to determine the maximum column width (memory-efficient)
        max_columns = 0
-        all_rows = list(sheet.iter_rows(values_only=True))
-
-        for row in all_rows:
+        for row in sheet.iter_rows(values_only=True):
            last_nonempty_idx = -1
            for idx, cell in enumerate(row):
                # Check if cell has meaningful content (not None or empty string)
@@ -1149,7 +1147,7 @@ def _extract_xlsx(file_bytes: bytes) -> str:
                max_columns = max(max_columns, last_nonempty_idx + 1)

        # Pass 2: Extract rows with consistent width to preserve column alignment
-        for row in all_rows:
+        for row in sheet.iter_rows(values_only=True):
            row_parts = []

            # Build row up to max_columns width