From e7d2803a65fe3a4329487258acc2cf029138717b Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 02:12:27 +0800
Subject: [PATCH] Remove text stripping in DOCX extraction to preserve
 whitespace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Keep original paragraph spacing
• Preserve cell whitespace in tables
• Maintain the document's whitespace layout in extracted text
• Don't strip leading/trailing whitespace
---
 lightrag/api/routers/document_routes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 1726e197..dd6d7fd8 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1005,7 +1005,7 @@ def _extract_docx(file_bytes: bytes) -> str:
                 in_table = False
 
             paragraph = Paragraph(element, doc)
-            text = paragraph.text.strip()
+            text = paragraph.text
             # Always append to preserve document spacing (including blank paragraphs)
             content_parts.append(text)
 
@@ -1020,7 +1020,7 @@ def _extract_docx(file_bytes: bytes) -> str:
             for row in table.rows:
                 row_text = []
                 for cell in row.cells:
-                    cell_text = cell.text.strip()
+                    cell_text = cell.text
                     # Always append cell text to preserve column structure
                     row_text.append(cell_text)
                 # Only add row if at least one cell has content