Merge pull request #2386 from danielaskdd/excel-optimization
Feat: Enhance XLSX Extraction by Adding Separators and Escape Special Characters
This commit is contained in:
commit
af4d2a3dcc
1 changed files with 121 additions and 12 deletions
|
|
@ -992,6 +992,30 @@ def _extract_docx(file_bytes: bytes) -> str:
|
||||||
docx_file = BytesIO(file_bytes)
|
docx_file = BytesIO(file_bytes)
|
||||||
doc = Document(docx_file)
|
doc = Document(docx_file)
|
||||||
|
|
||||||
|
def escape_cell(cell_value: str | None) -> str:
|
||||||
|
"""Escape characters that would break tab-delimited layout.
|
||||||
|
|
||||||
|
Escape order is critical: backslashes first, then tabs/newlines.
|
||||||
|
This prevents double-escaping issues.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cell_value: The cell value to escape (can be None or str)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Escaped cell value safe for tab-delimited format
|
||||||
|
"""
|
||||||
|
if cell_value is None:
|
||||||
|
return ""
|
||||||
|
text = str(cell_value)
|
||||||
|
# CRITICAL: Escape backslash first to avoid double-escaping
|
||||||
|
return (
|
||||||
|
text.replace("\\", "\\\\") # Must be first: \ -> \\
|
||||||
|
.replace("\t", "\\t") # Tab -> \t (visible)
|
||||||
|
.replace("\r\n", "\\n") # Windows newline -> \n
|
||||||
|
.replace("\r", "\\n") # Mac newline -> \n
|
||||||
|
.replace("\n", "\\n") # Unix newline -> \n
|
||||||
|
)
|
||||||
|
|
||||||
content_parts = []
|
content_parts = []
|
||||||
in_table = False # Track if we're currently processing a table
|
in_table = False # Track if we're currently processing a table
|
||||||
|
|
||||||
|
|
@ -1021,8 +1045,8 @@ def _extract_docx(file_bytes: bytes) -> str:
|
||||||
row_text = []
|
row_text = []
|
||||||
for cell in row.cells:
|
for cell in row.cells:
|
||||||
cell_text = cell.text
|
cell_text = cell.text
|
||||||
# Always append cell text to preserve column structure
|
# Escape special characters to preserve tab-delimited structure
|
||||||
row_text.append(cell_text)
|
row_text.append(escape_cell(cell_text))
|
||||||
# Only add row if at least one cell has content
|
# Only add row if at least one cell has content
|
||||||
if any(cell for cell in row_text):
|
if any(cell for cell in row_text):
|
||||||
content_parts.append("\t".join(row_text))
|
content_parts.append("\t".join(row_text))
|
||||||
|
|
@ -1052,27 +1076,112 @@ def _extract_pptx(file_bytes: bytes) -> str:
|
||||||
|
|
||||||
|
|
||||||
def _extract_xlsx(file_bytes: bytes) -> str:
|
def _extract_xlsx(file_bytes: bytes) -> str:
|
||||||
"""Extract XLSX content (synchronous).
|
"""Extract XLSX content in tab-delimited format with clear sheet separation.
|
||||||
|
|
||||||
|
This function processes Excel workbooks and converts them to a structured text format
|
||||||
|
suitable for LLM prompts and RAG systems. Each sheet is clearly delimited with
|
||||||
|
separator lines, and special characters are escaped to preserve the tab-delimited structure.
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Each sheet is wrapped with '====================' separators for visual distinction
|
||||||
|
- Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
|
||||||
|
- Column alignment is preserved across all rows to maintain tabular structure
|
||||||
|
- Empty rows are preserved as blank lines to maintain row structure
|
||||||
|
- Uses sheet.max_column to determine column width efficiently
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_bytes: XLSX file content as bytes
|
file_bytes: XLSX file content as bytes
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: Extracted text content
|
str: Extracted text content with all sheets in tab-delimited format.
|
||||||
|
Format: Sheet separators, sheet name, then tab-delimited rows.
|
||||||
|
|
||||||
|
Example output:
|
||||||
|
==================== Sheet: Data ====================
|
||||||
|
Name\tAge\tCity
|
||||||
|
Alice\t30\tNew York
|
||||||
|
Bob\t25\tLondon
|
||||||
|
|
||||||
|
==================== Sheet: Summary ====================
|
||||||
|
Total\t2
|
||||||
|
====================
|
||||||
"""
|
"""
|
||||||
from openpyxl import load_workbook # type: ignore
|
from openpyxl import load_workbook # type: ignore
|
||||||
|
|
||||||
xlsx_file = BytesIO(file_bytes)
|
xlsx_file = BytesIO(file_bytes)
|
||||||
wb = load_workbook(xlsx_file)
|
wb = load_workbook(xlsx_file)
|
||||||
content = ""
|
|
||||||
for sheet in wb:
|
def escape_cell(cell_value: str | int | float | None) -> str:
|
||||||
content += f"Sheet: {sheet.title}\n"
|
"""Escape characters that would break tab-delimited layout.
|
||||||
|
|
||||||
|
Escape order is critical: backslashes first, then tabs/newlines.
|
||||||
|
This prevents double-escaping issues.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cell_value: The cell value to escape (can be None, str, int, or float)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Escaped cell value safe for tab-delimited format
|
||||||
|
"""
|
||||||
|
if cell_value is None:
|
||||||
|
return ""
|
||||||
|
text = str(cell_value)
|
||||||
|
# CRITICAL: Escape backslash first to avoid double-escaping
|
||||||
|
return (
|
||||||
|
text.replace("\\", "\\\\") # Must be first: \ -> \\
|
||||||
|
.replace("\t", "\\t") # Tab -> \t (visible)
|
||||||
|
.replace("\r\n", "\\n") # Windows newline -> \n
|
||||||
|
.replace("\r", "\\n") # Mac newline -> \n
|
||||||
|
.replace("\n", "\\n") # Unix newline -> \n
|
||||||
|
)
|
||||||
|
|
||||||
|
def escape_sheet_title(title: str) -> str:
|
||||||
|
"""Escape sheet title to prevent formatting issues in separators.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
title: Original sheet title
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Sanitized sheet title with tabs/newlines replaced
|
||||||
|
"""
|
||||||
|
return str(title).replace("\n", " ").replace("\t", " ").replace("\r", " ")
|
||||||
|
|
||||||
|
content_parts: list[str] = []
|
||||||
|
sheet_separator = "=" * 20
|
||||||
|
|
||||||
|
for idx, sheet in enumerate(wb):
|
||||||
|
if idx > 0:
|
||||||
|
content_parts.append("") # Blank line between sheets for readability
|
||||||
|
|
||||||
|
# Escape sheet title to handle edge cases with special characters
|
||||||
|
safe_title = escape_sheet_title(sheet.title)
|
||||||
|
content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
|
||||||
|
|
||||||
|
# Use sheet.max_column to get the maximum column width directly
|
||||||
|
max_columns = sheet.max_column if sheet.max_column else 0
|
||||||
|
|
||||||
|
# Extract rows with consistent width to preserve column alignment
|
||||||
for row in sheet.iter_rows(values_only=True):
|
for row in sheet.iter_rows(values_only=True):
|
||||||
content += (
|
row_parts = []
|
||||||
"\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
|
|
||||||
)
|
# Build row up to max_columns width
|
||||||
content += "\n"
|
for idx in range(max_columns):
|
||||||
return content
|
if idx < len(row):
|
||||||
|
row_parts.append(escape_cell(row[idx]))
|
||||||
|
else:
|
||||||
|
row_parts.append("") # Pad short rows
|
||||||
|
|
||||||
|
# Check if row is completely empty
|
||||||
|
if all(part == "" for part in row_parts):
|
||||||
|
# Preserve empty rows as blank lines (maintains row structure)
|
||||||
|
content_parts.append("")
|
||||||
|
else:
|
||||||
|
# Join all columns to maintain consistent column count
|
||||||
|
content_parts.append("\t".join(row_parts))
|
||||||
|
|
||||||
|
# Final separator for symmetry (makes parsing easier)
|
||||||
|
content_parts.append(sheet_separator)
|
||||||
|
return "\n".join(content_parts)
|
||||||
|
|
||||||
|
|
||||||
async def pipeline_enqueue_file(
|
async def pipeline_enqueue_file(
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue