Enhance XLSX extraction with structured tab-delimited format and escaping
- Add clear sheet separators
- Escape special characters
- Trim trailing empty columns
- Preserve row structure
- Single-pass optimization
This commit is contained in:
parent
efbbaaf7f9
commit
3efb1716b4
1 changed files with 93 additions and 10 deletions
|
|
@ -1052,27 +1052,110 @@ def _extract_pptx(file_bytes: bytes) -> str:
|
|||
|
||||
|
||||
def _extract_xlsx(file_bytes: bytes) -> str:
|
||||
"""Extract XLSX content (synchronous).
|
||||
"""Extract XLSX content in tab-delimited format with clear sheet separation.
|
||||
|
||||
This function processes Excel workbooks and converts them to a structured text format
|
||||
suitable for LLM prompts and RAG systems. Each sheet is clearly delimited with
|
||||
separator lines, and special characters are escaped to preserve the tab-delimited structure.
|
||||
|
||||
Features:
|
||||
- Each sheet is wrapped with '====================' separators for visual distinction
|
||||
- Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
|
||||
- Trailing empty columns are trimmed per row to reduce token waste
|
||||
- Empty rows are preserved as blank lines to maintain row structure
|
||||
- Single-pass optimization for better performance on large spreadsheets
|
||||
|
||||
Args:
|
||||
file_bytes: XLSX file content as bytes
|
||||
|
||||
Returns:
|
||||
str: Extracted text content
|
||||
str: Extracted text content with all sheets in tab-delimited format.
|
||||
Format: Sheet separators, sheet name, then tab-delimited rows.
|
||||
|
||||
Example output:
|
||||
==================== Sheet: Data ====================
|
||||
Name\tAge\tCity
|
||||
Alice\t30\tNew York
|
||||
Bob\t25\tLondon
|
||||
|
||||
==================== Sheet: Summary ====================
|
||||
Total\t2
|
||||
====================
|
||||
"""
|
||||
from openpyxl import load_workbook # type: ignore
|
||||
|
||||
xlsx_file = BytesIO(file_bytes)
|
||||
wb = load_workbook(xlsx_file)
|
||||
content = ""
|
||||
for sheet in wb:
|
||||
content += f"Sheet: {sheet.title}\n"
|
||||
|
||||
# Single-character escapes applied in one pass by ``str.translate``. Because
# translate never rescans its own output, the result does not depend on the
# order of the entries (unlike a chain of ``str.replace`` calls, where the
# backslash replacement has to run first).
_CELL_ESCAPE_TABLE = str.maketrans(
    {
        "\\": "\\\\",  # backslash -> \\ so escaped output stays unambiguous
        "\t": "\\t",  # a raw tab would read as a column separator
        "\r": "\\n",  # lone carriage return (old Mac line ending)
        "\n": "\\n",  # a raw newline would read as a row separator
    }
)


def escape_cell(cell_value: object) -> str:
    """Return a cell value as text that is safe inside a tab-delimited row.

    Backslashes are doubled, tabs become the two characters ``\\t``, and every
    line break (CRLF, CR or LF) becomes the two characters ``\\n``, so a cell
    can never introduce an extra column or row in the output.

    Args:
        cell_value: The raw cell value. ``None`` means an empty cell; any
            other value (str, int, float, bool, date/time, ...) is converted
            with ``str()`` before escaping.

    Returns:
        str: The escaped text, or ``""`` when ``cell_value`` is ``None``.
    """
    if cell_value is None:
        return ""
    # Collapse CRLF first so a Windows line break yields one "\n" marker
    # instead of two; everything else is a single-character substitution.
    return str(cell_value).replace("\r\n", "\n").translate(_CELL_ESCAPE_TABLE)
|
||||
|
||||
def escape_sheet_title(title: str) -> str:
    """Flatten a sheet title onto one line for use in a separator header.

    Tabs and line breaks are replaced with spaces so the title cannot split
    the ``==== Sheet: <title> ====`` header across lines or columns. A CRLF
    pair counts as one line break and therefore yields a single space.

    Args:
        title: Original sheet title; it is passed through ``str()`` first.

    Returns:
        str: The title with every tab, CRLF, LF and CR replaced by a space.
    """
    # CRLF must be handled before the lone "\n" / "\r" replacements, otherwise
    # a Windows line break would turn into two spaces.
    return (
        str(title)
        .replace("\r\n", " ")
        .replace("\n", " ")
        .replace("\t", " ")
        .replace("\r", " ")
    )
|
||||
|
||||
content_parts: list[str] = []
|
||||
sheet_separator = "=" * 20
|
||||
|
||||
for idx, sheet in enumerate(wb):
|
||||
if idx > 0:
|
||||
content_parts.append("") # Blank line between sheets for readability
|
||||
|
||||
# Escape sheet title to handle edge cases with special characters
|
||||
safe_title = escape_sheet_title(sheet.title)
|
||||
content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
|
||||
|
||||
# Single-pass optimization: escape and trim in one iteration
|
||||
for row in sheet.iter_rows(values_only=True):
|
||||
content += (
|
||||
"\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
|
||||
)
|
||||
content += "\n"
|
||||
return content
|
||||
# Escape every cell of the current row and remember the position of the
# last cell that still has text after escaping.
row_parts: list[str] = []
last_nonempty_idx = -1

# Use a dedicated column index: reusing ``idx`` here would rebind the
# sheet index of the enclosing ``for idx, sheet in enumerate(wb)`` loop.
for col_idx, cell in enumerate(row):
    escaped = escape_cell(cell)
    row_parts.append(escaped)
    if escaped:
        last_nonempty_idx = col_idx

# Keep cells up to the last non-empty one (trailing empties are trimmed).
# For a completely empty row the slice is empty and the join yields "",
# so the row is still emitted as a blank line and row structure is kept.
content_parts.append("\t".join(row_parts[: last_nonempty_idx + 1]))
|
||||
|
||||
# Final separator for symmetry (makes parsing easier)
|
||||
content_parts.append(sheet_separator)
|
||||
return "\n".join(content_parts)
|
||||
|
||||
|
||||
async def pipeline_enqueue_file(
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue