From 4438ba41a374f36982af043d0385f38bffed132d Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 01:31:33 +0800 Subject: [PATCH 1/4] Enhance DOCX extraction to preserve document order with tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Include tables in extracted content • Maintain original document order • Add spacing around tables • Use tabs to separate table cells • Process all body elements sequentially --- lightrag/api/routers/document_routes.py | 44 +++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index a0c2f0dd..d2e4176b 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -976,19 +976,57 @@ def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str: def _extract_docx(file_bytes: bytes) -> str: - """Extract DOCX content (synchronous). + """Extract DOCX content including tables in document order (synchronous). Args: file_bytes: DOCX file content as bytes Returns: - str: Extracted text content + str: Extracted text content with tables in their original positions. + Tables are separated from paragraphs with blank lines for clarity. """ from docx import Document # type: ignore + from docx.table import Table # type: ignore + from docx.text.paragraph import Paragraph # type: ignore docx_file = BytesIO(file_bytes) doc = Document(docx_file) - return "\n".join([paragraph.text for paragraph in doc.paragraphs]) + + content_parts = [] + in_table = False # Track if we're currently processing a table + + # Iterate through all body elements in document order + for element in doc.element.body: + # Check if element is a paragraph + if element.tag.endswith("p"): + # If coming out of a table, add blank line after table + if in_table: + content_parts.append("") # Blank line after table + in_table = False + + paragraph = Paragraph(element, doc) + text = paragraph.text.strip() + if text: + content_parts.append(text) + + # Check if element is a table + elif element.tag.endswith("tbl"): + # Add blank line before table (if content exists) + if content_parts and not in_table: + content_parts.append("") # Blank line before table + + in_table = True + table = Table(element, doc) + for row in table.rows: + row_text = [] + for cell in row.cells: + cell_text = cell.text.strip() + if cell_text: + row_text.append(cell_text) + if row_text: + content_parts.append("\t".join(row_text)) + + return "\n".join(content_parts) def _extract_pptx(file_bytes: bytes) -> str: From fa887d811b180ff1b21e879225f859535cf563f3 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 01:52:02 +0800 Subject: [PATCH 2/4] Fix table column structure preservation in DOCX extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Always append cell text to maintain columns • Preserve empty cells in table structure • Check for any content before adding rows • Use tab separation for proper alignment • Improve table formatting consistency --- lightrag/api/routers/document_routes.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index d2e4176b..15bf2508 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1021,9 +1021,10 @@ def _extract_docx(file_bytes: bytes) -> str: row_text = [] for cell in row.cells: cell_text = cell.text.strip() - if cell_text: - row_text.append(cell_text) - if row_text: + # Always append cell text to preserve column structure + row_text.append(cell_text) + # Only add row if at least one cell has content + if any(cell for cell in row_text): content_parts.append("\t".join(row_text)) return "\n".join(content_parts) From 186c8f0e16fa08f239527d982a8d68d21482defe Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 02:03:10 +0800 Subject: [PATCH 3/4] Preserve blank paragraphs in DOCX extraction to maintain spacing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Remove text emptiness check • Always append paragraph text • Maintain document formatting • Preserve original spacing --- lightrag/api/routers/document_routes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 15bf2508..1726e197 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1006,8 +1006,8 @@ def _extract_docx(file_bytes: bytes) -> str: paragraph = Paragraph(element, doc) text = paragraph.text.strip() - if text: - content_parts.append(text) + # Always append to preserve document spacing (including blank paragraphs) + content_parts.append(text) # Check if element is a table elif element.tag.endswith("tbl"): From e7d2803a65fe3a4329487258acc2cf029138717b Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 02:12:27 +0800 Subject: [PATCH 4/4] Remove text stripping in DOCX extraction to preserve whitespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Keep original paragraph spacing • Preserve cell whitespace in tables • Maintain document formatting • Don't strip leading/trailing spaces --- lightrag/api/routers/document_routes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 1726e197..dd6d7fd8 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1005,7 +1005,7 @@ def _extract_docx(file_bytes: bytes) -> str: in_table = False paragraph = Paragraph(element, doc) - text = paragraph.text.strip() + text = paragraph.text # Always append to preserve document spacing (including blank paragraphs) content_parts.append(text) @@ -1020,7 +1020,7 @@ def _extract_docx(file_bytes: bytes) -> str: for row in table.rows: row_text = [] for cell in row.cells: - cell_text = cell.text.strip() + cell_text = cell.text # Always append cell text to preserve column structure row_text.append(cell_text) # Only add row if at least one cell has content