Fix table column structure preservation in DOCX extraction
• Always append cell text to maintain columns
• Preserve empty cells in table structure
• Check for any content before adding rows
• Use tab separation for proper alignment
• Improve table formatting consistency
This commit is contained in:
parent
4438ba41a3
commit
fa887d811b
1 changed file with 4 additions and 3 deletions
|
|
@@ -1021,9 +1021,10 @@ def _extract_docx(file_bytes: bytes) -> str:
|
||||||
row_text = []
|
row_text = []
|
||||||
for cell in row.cells:
|
for cell in row.cells:
|
||||||
cell_text = cell.text.strip()
|
cell_text = cell.text.strip()
|
||||||
if cell_text:
|
# Always append cell text to preserve column structure
|
||||||
row_text.append(cell_text)
|
row_text.append(cell_text)
|
||||||
if row_text:
|
# Only add row if at least one cell has content
|
||||||
|
if any(cell for cell in row_text):
|
||||||
content_parts.append("\t".join(row_text))
|
content_parts.append("\t".join(row_text))
|
||||||
|
|
||||||
return "\n".join(content_parts)
|
return "\n".join(content_parts)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue