From 4438ba41a374f36982af043d0385f38bffed132d Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 01:31:33 +0800
Subject: [PATCH 1/4] Enhance DOCX extraction to preserve document order with
 tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Include tables in extracted content
• Maintain original document order
• Add spacing around tables
• Use tabs to separate table cells
• Process all body elements sequentially
---
 lightrag/api/routers/document_routes.py | 44 +++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index a0c2f0dd..d2e4176b 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -976,19 +976,57 @@ def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
 
 
 def _extract_docx(file_bytes: bytes) -> str:
-    """Extract DOCX content (synchronous).
+    """Extract DOCX content including tables in document order (synchronous).
 
     Args:
         file_bytes: DOCX file content as bytes
 
     Returns:
-        str: Extracted text content
+        str: Extracted text content with tables in their original positions.
+             Tables are separated from paragraphs with blank lines for clarity.
     """
     from docx import Document  # type: ignore
+    from docx.table import Table  # type: ignore
+    from docx.text.paragraph import Paragraph  # type: ignore
 
     docx_file = BytesIO(file_bytes)
     doc = Document(docx_file)
-    return "\n".join([paragraph.text for paragraph in doc.paragraphs])
+
+    content_parts = []
+    in_table = False  # Track if we're currently processing a table
+
+    # Iterate through all body elements in document order
+    for element in doc.element.body:
+        # Check if element is a paragraph
+        if element.tag.endswith("p"):
+            # If coming out of a table, add blank line after table
+            if in_table:
+                content_parts.append("")  # Blank line after table
+                in_table = False
+
+            paragraph = Paragraph(element, doc)
+            text = paragraph.text.strip()
+            if text:
+                content_parts.append(text)
+
+        # Check if element is a table
+        elif element.tag.endswith("tbl"):
+            # Add blank line before table (if content exists)
+            if content_parts and not in_table:
+                content_parts.append("")  # Blank line before table
+
+            in_table = True
+            table = Table(element, doc)
+            for row in table.rows:
+                row_text = []
+                for cell in row.cells:
+                    cell_text = cell.text.strip()
+                    if cell_text:
+                        row_text.append(cell_text)
+                if row_text:
+                    content_parts.append("\t".join(row_text))
+
+    return "\n".join(content_parts)
 
 
 def _extract_pptx(file_bytes: bytes) -> str:

From fa887d811b180ff1b21e879225f859535cf563f3 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 01:52:02 +0800
Subject: [PATCH 2/4] Fix table column structure preservation in DOCX
 extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Always append cell text to maintain columns
• Preserve empty cells in table structure
• Check for any content before adding rows
• Keep tab-separated cells aligned with their original columns
• Improve table formatting consistency
---
 lightrag/api/routers/document_routes.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index d2e4176b..15bf2508 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1021,9 +1021,10 @@ def _extract_docx(file_bytes: bytes) -> str:
                 row_text = []
                 for cell in row.cells:
                     cell_text = cell.text.strip()
-                    if cell_text:
-                        row_text.append(cell_text)
-                if row_text:
+                    # Always append cell text to preserve column structure
+                    row_text.append(cell_text)
+                # Only add row if at least one cell has content
+                if any(cell for cell in row_text):
                     content_parts.append("\t".join(row_text))
 
     return "\n".join(content_parts)

From 186c8f0e16fa08f239527d982a8d68d21482defe Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 02:03:10 +0800
Subject: [PATCH 3/4] Preserve blank paragraphs in DOCX extraction to maintain
 spacing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Remove text emptiness check
• Always append paragraph text
• Maintain document formatting
• Preserve original spacing
---
 lightrag/api/routers/document_routes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 15bf2508..1726e197 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1006,8 +1006,8 @@ def _extract_docx(file_bytes: bytes) -> str:
 
             paragraph = Paragraph(element, doc)
             text = paragraph.text.strip()
-            if text:
-                content_parts.append(text)
+            # Always append to preserve document spacing (including blank paragraphs)
+            content_parts.append(text)
 
         # Check if element is a table
         elif element.tag.endswith("tbl"):

From e7d2803a65fe3a4329487258acc2cf029138717b Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 02:12:27 +0800
Subject: [PATCH 4/4] Remove text stripping in DOCX extraction to preserve
 whitespace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Keep original paragraph spacing
• Preserve cell whitespace in tables (whitespace-only cells now count as row content)
• Maintain document formatting
• Don't strip leading/trailing spaces
---
 lightrag/api/routers/document_routes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 1726e197..dd6d7fd8 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1005,7 +1005,7 @@ def _extract_docx(file_bytes: bytes) -> str:
                 in_table = False
 
             paragraph = Paragraph(element, doc)
-            text = paragraph.text.strip()
+            text = paragraph.text
             # Always append to preserve document spacing (including blank paragraphs)
             content_parts.append(text)
 
@@ -1020,7 +1020,7 @@ def _extract_docx(file_bytes: bytes) -> str:
             for row in table.rows:
                 row_text = []
                 for cell in row.cells:
-                    cell_text = cell.text.strip()
+                    cell_text = cell.text
                     # Always append cell text to preserve column structure
                     row_text.append(cell_text)
                 # Only add row if at least one cell has content