Enhance DOCX extraction to preserve document order with tables
• Include tables in extracted content
• Maintain original document order
• Add spacing around tables
• Use tabs to separate table cells
• Process all body elements sequentially
This commit is contained in:
parent
d16c7840ab
commit
4438ba41a3
1 changed file with 41 additions and 3 deletions
|
|
@@ -976,19 +976,57 @@ def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
|
||||||
|
|
||||||
|
|
||||||
def _extract_docx(file_bytes: bytes) -> str:
|
def _extract_docx(file_bytes: bytes) -> str:
|
||||||
"""Extract DOCX content (synchronous).
|
"""Extract DOCX content including tables in document order (synchronous).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_bytes: DOCX file content as bytes
|
file_bytes: DOCX file content as bytes
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: Extracted text content
|
str: Extracted text content with tables in their original positions.
|
||||||
|
Tables are separated from paragraphs with blank lines for clarity.
|
||||||
"""
|
"""
|
||||||
from docx import Document # type: ignore
|
from docx import Document # type: ignore
|
||||||
|
from docx.table import Table # type: ignore
|
||||||
|
from docx.text.paragraph import Paragraph # type: ignore
|
||||||
|
|
||||||
docx_file = BytesIO(file_bytes)
|
docx_file = BytesIO(file_bytes)
|
||||||
doc = Document(docx_file)
|
doc = Document(docx_file)
|
||||||
return "\n".join([paragraph.text for paragraph in doc.paragraphs])
|
|
||||||
|
content_parts = []
|
||||||
|
in_table = False # Track if we're currently processing a table
|
||||||
|
|
||||||
|
# Iterate through all body elements in document order
|
||||||
|
for element in doc.element.body:
|
||||||
|
# Check if element is a paragraph
|
||||||
|
if element.tag.endswith("p"):
|
||||||
|
# If coming out of a table, add blank line after table
|
||||||
|
if in_table:
|
||||||
|
content_parts.append("") # Blank line after table
|
||||||
|
in_table = False
|
||||||
|
|
||||||
|
paragraph = Paragraph(element, doc)
|
||||||
|
text = paragraph.text.strip()
|
||||||
|
if text:
|
||||||
|
content_parts.append(text)
|
||||||
|
|
||||||
|
# Check if element is a table
|
||||||
|
elif element.tag.endswith("tbl"):
|
||||||
|
# Add blank line before table (if content exists)
|
||||||
|
if content_parts and not in_table:
|
||||||
|
content_parts.append("") # Blank line before table
|
||||||
|
|
||||||
|
in_table = True
|
||||||
|
table = Table(element, doc)
|
||||||
|
for row in table.rows:
|
||||||
|
row_text = []
|
||||||
|
for cell in row.cells:
|
||||||
|
cell_text = cell.text.strip()
|
||||||
|
if cell_text:
|
||||||
|
row_text.append(cell_text)
|
||||||
|
if row_text:
|
||||||
|
content_parts.append("\t".join(row_text))
|
||||||
|
|
||||||
|
return "\n".join(content_parts)
|
||||||
|
|
||||||
|
|
||||||
def _extract_pptx(file_bytes: bytes) -> str:
|
def _extract_pptx(file_bytes: bytes) -> str:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue