From 450174084909117ad342ba260685a07d55ef1514 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20MANSUY?= Date: Thu, 4 Dec 2025 19:15:04 +0800 Subject: [PATCH] cherry-pick fa887d81 --- lightrag/api/routers/document_routes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index dd6d7fd8..15bf2508 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1005,9 +1005,9 @@ def _extract_docx(file_bytes: bytes) -> str: in_table = False paragraph = Paragraph(element, doc) - text = paragraph.text - # Always append to preserve document spacing (including blank paragraphs) - content_parts.append(text) + text = paragraph.text.strip() + if text: + content_parts.append(text) # Check if element is a table elif element.tag.endswith("tbl"): @@ -1020,7 +1020,7 @@ def _extract_docx(file_bytes: bytes) -> str: for row in table.rows: row_text = [] for cell in row.cells: - cell_text = cell.text + cell_text = cell.text.strip() # Always append cell text to preserve column structure row_text.append(cell_text) # Only add row if at least one cell has content