From 5a9677396bcba6a1d4dd4c74562b156d9b203f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20MANSUY?= Date: Thu, 4 Dec 2025 19:15:04 +0800 Subject: [PATCH] cherry-pick 4438ba41 --- lightrag/api/routers/document_routes.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 1726e197..d2e4176b 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1006,8 +1006,8 @@ def _extract_docx(file_bytes: bytes) -> str: paragraph = Paragraph(element, doc) text = paragraph.text.strip() - # Always append to preserve document spacing (including blank paragraphs) - content_parts.append(text) + if text: + content_parts.append(text) # Check if element is a table elif element.tag.endswith("tbl"): @@ -1021,10 +1021,9 @@ def _extract_docx(file_bytes: bytes) -> str: row_text = [] for cell in row.cells: cell_text = cell.text.strip() - # Always append cell text to preserve column structure - row_text.append(cell_text) - # Only add row if at least one cell has content - if any(cell for cell in row_text): + if cell_text: + row_text.append(cell_text) + if row_text: content_parts.append("\t".join(row_text)) return "\n".join(content_parts)