From e7d2803a65fe3a4329487258acc2cf029138717b Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 02:12:27 +0800 Subject: [PATCH] Remove text stripping in DOCX extraction to preserve whitespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Keep original paragraph spacing • Preserve cell whitespace in tables • Maintain document formatting • Don't strip leading/trailing spaces --- lightrag/api/routers/document_routes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 1726e197..dd6d7fd8 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1005,7 +1005,7 @@ def _extract_docx(file_bytes: bytes) -> str: in_table = False paragraph = Paragraph(element, doc) - text = paragraph.text.strip() + text = paragraph.text # Always append to preserve document spacing (including blank paragraphs) content_parts.append(text) @@ -1020,7 +1020,7 @@ def _extract_docx(file_bytes: bytes) -> str: for row in table.rows: row_text = [] for cell in row.cells: - cell_text = cell.text.strip() + cell_text = cell.text # Always append cell text to preserve column structure row_text.append(cell_text) # Only add row if at least one cell has content