From fa887d811b180ff1b21e879225f859535cf563f3 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 01:52:02 +0800 Subject: [PATCH] Fix table column structure preservation in DOCX extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Always append cell text to maintain columns • Preserve empty cells in table structure • Check for any content before adding rows • Use tab separation for proper alignment • Improve table formatting consistency --- lightrag/api/routers/document_routes.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index d2e4176b..15bf2508 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1021,9 +1021,10 @@ def _extract_docx(file_bytes: bytes) -> str: row_text = [] for cell in row.cells: cell_text = cell.text.strip() - if cell_text: - row_text.append(cell_text) - if row_text: + # Always append cell text to preserve column structure + row_text.append(cell_text) + # Only add row if at least one cell has content + if any(cell for cell in row_text): content_parts.append("\t".join(row_text)) return "\n".join(content_parts)