This commit is contained in:
Raphaël MANSUY 2025-12-04 19:15:04 +08:00
parent d3d59b0dca
commit 5a9677396b

View file

@ -1006,8 +1006,8 @@ def _extract_docx(file_bytes: bytes) -> str:
paragraph = Paragraph(element, doc) paragraph = Paragraph(element, doc)
text = paragraph.text.strip() text = paragraph.text.strip()
# Always append to preserve document spacing (including blank paragraphs) if text:
content_parts.append(text) content_parts.append(text)
# Check if element is a table # Check if element is a table
elif element.tag.endswith("tbl"): elif element.tag.endswith("tbl"):
@ -1021,10 +1021,9 @@ def _extract_docx(file_bytes: bytes) -> str:
row_text = [] row_text = []
for cell in row.cells: for cell in row.cells:
cell_text = cell.text.strip() cell_text = cell.text.strip()
# Always append cell text to preserve column structure if cell_text:
row_text.append(cell_text) row_text.append(cell_text)
# Only add row if at least one cell has content if row_text:
if any(cell for cell in row_text):
content_parts.append("\t".join(row_text)) content_parts.append("\t".join(row_text))
return "\n".join(content_parts) return "\n".join(content_parts)