Preserve column alignment in XLSX extraction with two-pass processing
• Two-pass approach for consistent width • Maintain tabular structure integrity • Determine max columns first pass • Extract with alignment second pass • Prevent column misalignment issues
This commit is contained in:
parent
3efb1716b4
commit
ef659a1e09
1 changed files with 28 additions and 14 deletions
|
|
@ -1061,9 +1061,9 @@ def _extract_xlsx(file_bytes: bytes) -> str:
|
||||||
Features:
|
Features:
|
||||||
- Each sheet is wrapped with '====================' separators for visual distinction
|
- Each sheet is wrapped with '====================' separators for visual distinction
|
||||||
- Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
|
- Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
|
||||||
- Trailing empty columns are trimmed per row to reduce token waste
|
- Column alignment is preserved across all rows to maintain tabular structure
|
||||||
- Empty rows are preserved as blank lines to maintain row structure
|
- Empty rows are preserved as blank lines to maintain row structure
|
||||||
- Single-pass optimization for better performance on large spreadsheets
|
- Two-pass processing: determines max column width, then extracts with consistent alignment
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_bytes: XLSX file content as bytes
|
file_bytes: XLSX file content as bytes
|
||||||
|
|
@ -1133,25 +1133,39 @@ def _extract_xlsx(file_bytes: bytes) -> str:
|
||||||
safe_title = escape_sheet_title(sheet.title)
|
safe_title = escape_sheet_title(sheet.title)
|
||||||
content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
|
content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
|
||||||
|
|
||||||
# Single-pass optimization: escape and trim in one iteration
|
# Two-pass approach to preserve column alignment:
|
||||||
for row in sheet.iter_rows(values_only=True):
|
# Pass 1: Determine the maximum column width for this sheet
|
||||||
row_parts = []
|
max_columns = 0
|
||||||
last_nonempty_idx = -1
|
all_rows = list(sheet.iter_rows(values_only=True))
|
||||||
|
|
||||||
# Build escaped row while tracking last non-empty cell position
|
for row in all_rows:
|
||||||
|
last_nonempty_idx = -1
|
||||||
for idx, cell in enumerate(row):
|
for idx, cell in enumerate(row):
|
||||||
escaped = escape_cell(cell)
|
# Check if cell has meaningful content (not None or empty string)
|
||||||
row_parts.append(escaped)
|
if cell is not None and str(cell).strip():
|
||||||
if escaped != "":
|
|
||||||
last_nonempty_idx = idx
|
last_nonempty_idx = idx
|
||||||
|
|
||||||
# Handle completely empty rows vs rows with data
|
if last_nonempty_idx >= 0:
|
||||||
if last_nonempty_idx == -1:
|
max_columns = max(max_columns, last_nonempty_idx + 1)
|
||||||
|
|
||||||
|
# Pass 2: Extract rows with consistent width to preserve column alignment
|
||||||
|
for row in all_rows:
|
||||||
|
row_parts = []
|
||||||
|
|
||||||
|
# Build row up to max_columns width
|
||||||
|
for idx in range(max_columns):
|
||||||
|
if idx < len(row):
|
||||||
|
row_parts.append(escape_cell(row[idx]))
|
||||||
|
else:
|
||||||
|
row_parts.append("") # Pad short rows
|
||||||
|
|
||||||
|
# Check if row is completely empty
|
||||||
|
if all(part == "" for part in row_parts):
|
||||||
# Preserve empty rows as blank lines (maintains row structure)
|
# Preserve empty rows as blank lines (maintains row structure)
|
||||||
content_parts.append("")
|
content_parts.append("")
|
||||||
else:
|
else:
|
||||||
# Only join up to last non-empty cell (trim trailing empties)
|
# Join all columns to maintain consistent column count
|
||||||
content_parts.append("\t".join(row_parts[: last_nonempty_idx + 1]))
|
content_parts.append("\t".join(row_parts))
|
||||||
|
|
||||||
# Final separator for symmetry (makes parsing easier)
|
# Final separator for symmetry (makes parsing easier)
|
||||||
content_parts.append(sheet_separator)
|
content_parts.append(sheet_separator)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue