Fix table column structure preservation in DOCX extraction

• Always append cell text to maintain columns
• Preserve empty cells in table structure
• Check for any content before adding rows
• Use tab separation for proper alignment
• Improve table formatting consistency
This commit is contained in:
yangdx 2025-11-19 01:52:02 +08:00
parent 4438ba41a3
commit fa887d811b

View file

@@ -1021,9 +1021,10 @@ def _extract_docx(file_bytes: bytes) -> str:
 row_text = []
 for cell in row.cells:
 cell_text = cell.text.strip()
-if cell_text:
-row_text.append(cell_text)
-if row_text:
+# Always append cell text to preserve column structure
+row_text.append(cell_text)
+# Only add row if at least one cell has content
+if any(cell for cell in row_text):
 content_parts.append("\t".join(row_text))
 return "\n".join(content_parts)