This commit is contained in:
Raphaël MANSUY 2025-12-04 19:15:05 +08:00
parent 196033bf75
commit f9f4555b48

View file

@ -1063,7 +1063,7 @@ def _extract_xlsx(file_bytes: bytes) -> str:
- Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
- Column alignment is preserved across all rows to maintain tabular structure
- Empty rows are preserved as blank lines to maintain row structure
- Uses sheet.max_column to determine column width efficiently
- Two-pass processing: determines max column width, then extracts with consistent alignment
Args:
file_bytes: XLSX file content as bytes
@ -1133,11 +1133,23 @@ def _extract_xlsx(file_bytes: bytes) -> str:
safe_title = escape_sheet_title(sheet.title)
content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
# Use sheet.max_column to get the maximum column width directly
max_columns = sheet.max_column if sheet.max_column else 0
# Two-pass approach to preserve column alignment:
# Pass 1: Determine the maximum column width for this sheet
max_columns = 0
all_rows = list(sheet.iter_rows(values_only=True))
# Extract rows with consistent width to preserve column alignment
for row in sheet.iter_rows(values_only=True):
for row in all_rows:
last_nonempty_idx = -1
for idx, cell in enumerate(row):
# Check if cell has meaningful content (not None or empty string)
if cell is not None and str(cell).strip():
last_nonempty_idx = idx
if last_nonempty_idx >= 0:
max_columns = max(max_columns, last_nonempty_idx + 1)
# Pass 2: Extract rows with consistent width to preserve column alignment
for row in all_rows:
row_parts = []
# Build row up to max_columns width