cherry-pick cf2a024e

2025-12-04 19:18:14 +08:00 · 2025-12-04 19:18:14 +08:00 · 201084e05a
commit 201084e05a
parent 86a6159b3f
5 changed files with 199 additions and 574 deletions
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@ -3,15 +3,14 @@ This module contains all document-related routes for the LightRAG API.
 """
 import asyncio
 from functools import lru_cache
 from lightrag.utils import logger, get_pinyin_sort_key
 import aiofiles
 import shutil
 import traceback
 import pipmaster as pm
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Literal
 from io import BytesIO
 from fastapi import (
    APIRouter,
    BackgroundTasks,
@ -29,24 +28,6 @@ from lightrag.api.utils_api import get_combined_auth_dependency
 from ..config import global_args
@lru_cache(maxsize=1)
 def _is_docling_available() -> bool:
    """Check if docling is available (cached check).
    This function uses lru_cache to avoid repeated import attempts.
    The result is cached after the first call.
    Returns:
        bool: True if docling is available, False otherwise
    """
    try:
        import docling  # noqa: F401  # type: ignore[import-not-found]
        return True
    except ImportError:
        return False
 # Function to format datetime to ISO format string with timezone information
 def format_datetime(dt: Any) -> Optional[str]:
    """Format datetime to ISO format string with timezone information
@ -180,28 +161,6 @@ class ReprocessResponse(BaseModel):
        }
 class CancelPipelineResponse(BaseModel):
    """Response model for pipeline cancellation operation
    Attributes:
        status: Status of the cancellation request
        message: Message describing the operation result
    """
    status: Literal["cancellation_requested", "not_busy"] = Field(
        description="Status of the cancellation request"
    )
    message: str = Field(description="Human-readable message describing the operation")
    class Config:
        json_schema_extra = {
            "example": {
                "status": "cancellation_requested",
                "message": "Pipeline cancellation has been requested. Documents will be marked as FAILED.",
            }
        }
 class InsertTextRequest(BaseModel):
    """Request model for inserting a single text document
@ -377,10 +336,6 @@ class DeleteDocRequest(BaseModel):
        default=False,
        description="Whether to delete the corresponding file in the upload directory.",
    )
    delete_llm_cache: bool = Field(
        default=False,
        description="Whether to delete cached LLM extraction results for the documents.",
    )
    @field_validator("doc_ids", mode="after")
    @classmethod
@ -451,7 +406,7 @@ class DocStatusResponse(BaseModel):
                "id": "doc_123456",
                "content_summary": "Research paper on machine learning",
                "content_length": 15240,
-                "status": "processed",
+                "status": "PROCESSED",
                "created_at": "2025-03-31T12:34:56",
                "updated_at": "2025-03-31T12:35:30",
                "track_id": "upload_20250729_170612_abc123",
@ -484,7 +439,7 @@ class DocsStatusesResponse(BaseModel):
                            "id": "doc_123",
                            "content_summary": "Pending document",
                            "content_length": 5000,
-                            "status": "pending",
+                            "status": "PENDING",
                            "created_at": "2025-03-31T10:00:00",
                            "updated_at": "2025-03-31T10:00:00",
                            "track_id": "upload_20250331_100000_abc123",
@ -494,27 +449,12 @@ class DocsStatusesResponse(BaseModel):
                            "file_path": "pending_doc.pdf",
                        }
                    ],
                    "PREPROCESSED": [
                        {
                            "id": "doc_789",
                            "content_summary": "Document pending final indexing",
                            "content_length": 7200,
                            "status": "preprocessed",
                            "created_at": "2025-03-31T09:30:00",
                            "updated_at": "2025-03-31T09:35:00",
                            "track_id": "upload_20250331_093000_xyz789",
                            "chunks_count": 10,
                            "error": None,
                            "metadata": None,
                            "file_path": "preprocessed_doc.pdf",
                        }
                    ],
                    "PROCESSED": [
                        {
                            "id": "doc_456",
                            "content_summary": "Processed document",
                            "content_length": 8000,
-                            "status": "processed",
+                            "status": "PROCESSED",
                            "created_at": "2025-03-31T09:00:00",
                            "updated_at": "2025-03-31T09:05:00",
                            "track_id": "insert_20250331_090000_def456",
@ -686,7 +626,6 @@ class PaginatedDocsResponse(BaseModel):
                "status_counts": {
                    "PENDING": 10,
                    "PROCESSING": 5,
                    "PREPROCESSED": 5,
                    "PROCESSED": 130,
                    "FAILED": 5,
                },
@ -709,7 +648,6 @@ class StatusCountsResponse(BaseModel):
                "status_counts": {
                    "PENDING": 10,
                    "PROCESSING": 5,
                    "PREPROCESSED": 5,
                    "PROCESSED": 130,
                    "FAILED": 5,
                }
@ -898,6 +836,7 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
    Returns:
        str: Unique filename (may have numeric suffix added)
    """
    from pathlib import Path
    import time
    original_path = Path(original_name)
@ -920,258 +859,6 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
    return f"{base_name}_{timestamp}{extension}"
 # Document processing helper functions (synchronous)
 # These functions run in thread pool via asyncio.to_thread() to avoid blocking the event loop
 def _convert_with_docling(file_path: Path) -> str:
    """Convert document using docling (synchronous).
    Args:
        file_path: Path to the document file
    Returns:
        str: Extracted markdown content
    """
    from docling.document_converter import DocumentConverter  # type: ignore
    converter = DocumentConverter()
    result = converter.convert(file_path)
    return result.document.export_to_markdown()
 def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
    """Extract PDF content using pypdf (synchronous).
    Args:
        file_bytes: PDF file content as bytes
        password: Optional password for encrypted PDFs
    Returns:
        str: Extracted text content
    Raises:
        Exception: If PDF is encrypted and password is incorrect or missing
    """
    from pypdf import PdfReader  # type: ignore
    pdf_file = BytesIO(file_bytes)
    reader = PdfReader(pdf_file)
    # Check if PDF is encrypted
    if reader.is_encrypted:
        if not password:
            raise Exception("PDF is encrypted but no password provided")
        decrypt_result = reader.decrypt(password)
        if decrypt_result == 0:
            raise Exception("Incorrect PDF password")
    # Extract text from all pages
    content = ""
    for page in reader.pages:
        content += page.extract_text() + "\n"
    return content
 def _extract_docx(file_bytes: bytes) -> str:
    """Extract DOCX content including tables in document order (synchronous).
    Args:
        file_bytes: DOCX file content as bytes
    Returns:
        str: Extracted text content with tables in their original positions.
             Tables are separated from paragraphs with blank lines for clarity.
    """
    from docx import Document  # type: ignore
    from docx.table import Table  # type: ignore
    from docx.text.paragraph import Paragraph  # type: ignore
    docx_file = BytesIO(file_bytes)
    doc = Document(docx_file)
    content_parts = []
    in_table = False  # Track if we're currently processing a table
    # Iterate through all body elements in document order
    for element in doc.element.body:
        # Check if element is a paragraph
        if element.tag.endswith("p"):
            # If coming out of a table, add blank line after table
            if in_table:
                content_parts.append("")  # Blank line after table
                in_table = False
            paragraph = Paragraph(element, doc)
            text = paragraph.text
            # Always append to preserve document spacing (including blank paragraphs)
            content_parts.append(text)
        # Check if element is a table
        elif element.tag.endswith("tbl"):
            # Add blank line before table (if content exists)
            if content_parts and not in_table:
                content_parts.append("")  # Blank line before table
            in_table = True
            table = Table(element, doc)
            for row in table.rows:
                row_text = []
                for cell in row.cells:
                    cell_text = cell.text
                    # Always append cell text to preserve column structure
                    row_text.append(cell_text)
                # Only add row if at least one cell has content
                if any(cell for cell in row_text):
                    content_parts.append("\t".join(row_text))
    return "\n".join(content_parts)
 def _extract_pptx(file_bytes: bytes) -> str:
    """Extract PPTX content (synchronous).
    Args:
        file_bytes: PPTX file content as bytes
    Returns:
        str: Extracted text content
    """
    from pptx import Presentation  # type: ignore
    pptx_file = BytesIO(file_bytes)
    prs = Presentation(pptx_file)
    content = ""
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                content += shape.text + "\n"
    return content
 def _extract_xlsx(file_bytes: bytes) -> str:
    """Extract XLSX content in tab-delimited format with clear sheet separation.
    This function processes Excel workbooks and converts them to a structured text format
    suitable for LLM prompts and RAG systems. Each sheet is clearly delimited with
    separator lines, and special characters are escaped to preserve the tab-delimited structure.
    Features:
    - Each sheet is wrapped with '====================' separators for visual distinction
    - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
    - Column alignment is preserved across all rows to maintain tabular structure
    - Empty rows are preserved as blank lines to maintain row structure
    - Two-pass processing: determines max column width, then extracts with consistent alignment
    Args:
        file_bytes: XLSX file content as bytes
    Returns:
        str: Extracted text content with all sheets in tab-delimited format.
             Format: Sheet separators, sheet name, then tab-delimited rows.
    Example output:
        ==================== Sheet: Data ====================
        Name\tAge\tCity
        Alice\t30\tNew York
        Bob\t25\tLondon
        ==================== Sheet: Summary ====================
        Total\t2
        ====================
    """
    from openpyxl import load_workbook  # type: ignore
    xlsx_file = BytesIO(file_bytes)
    wb = load_workbook(xlsx_file)
    def escape_cell(cell_value: str | int | float | None) -> str:
        """Escape characters that would break tab-delimited layout.
        Escape order is critical: backslashes first, then tabs/newlines.
        This prevents double-escaping issues.
        Args:
            cell_value: The cell value to escape (can be None, str, int, or float)
        Returns:
            str: Escaped cell value safe for tab-delimited format
        """
        if cell_value is None:
            return ""
        text = str(cell_value)
        # CRITICAL: Escape backslash first to avoid double-escaping
        return (
            text.replace("\\", "\\\\")  # Must be first: \ -> \\
            .replace("\t", "\\t")  # Tab -> \t (visible)
            .replace("\r\n", "\\n")  # Windows newline -> \n
            .replace("\r", "\\n")  # Mac newline -> \n
            .replace("\n", "\\n")  # Unix newline -> \n
        )
    def escape_sheet_title(title: str) -> str:
        """Escape sheet title to prevent formatting issues in separators.
        Args:
            title: Original sheet title
        Returns:
            str: Sanitized sheet title with tabs/newlines replaced
        """
        return str(title).replace("\n", " ").replace("\t", " ").replace("\r", " ")
    content_parts: list[str] = []
    sheet_separator = "=" * 20
    for idx, sheet in enumerate(wb):
        if idx > 0:
            content_parts.append("")  # Blank line between sheets for readability
        # Escape sheet title to handle edge cases with special characters
        safe_title = escape_sheet_title(sheet.title)
        content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
        # Two-pass approach to preserve column alignment:
        # Pass 1: Determine the maximum column width for this sheet
        max_columns = 0
        all_rows = list(sheet.iter_rows(values_only=True))
        for row in all_rows:
            last_nonempty_idx = -1
            for idx, cell in enumerate(row):
                # Check if cell has meaningful content (not None or empty string)
                if cell is not None and str(cell).strip():
                    last_nonempty_idx = idx
            if last_nonempty_idx >= 0:
                max_columns = max(max_columns, last_nonempty_idx + 1)
        # Pass 2: Extract rows with consistent width to preserve column alignment
        for row in all_rows:
            row_parts = []
            # Build row up to max_columns width
            for idx in range(max_columns):
                if idx < len(row):
                    row_parts.append(escape_cell(row[idx]))
                else:
                    row_parts.append("")  # Pad short rows
            # Check if row is completely empty
            if all(part == "" for part in row_parts):
                # Preserve empty rows as blank lines (maintains row structure)
                content_parts.append("")
            else:
                # Join all columns to maintain consistent column count
                content_parts.append("\t".join(row_parts))
    # Final separator for symmetry (makes parsing easier)
    content_parts.append(sheet_separator)
    return "\n".join(content_parts)
 async def pipeline_enqueue_file(
    rag: LightRAG, file_path: Path, track_id: str = None
 ) -> tuple[bool, str]:
@ -1342,28 +1029,24 @@ async def pipeline_enqueue_file(
                case ".pdf":
                    try:
-                        # Try DOCLING first if configured and available
+                        if global_args.document_loading_engine == "DOCLING":
-                        if (
+                            if not pm.is_installed("docling"):  # type: ignore
-                            global_args.document_loading_engine == "DOCLING"
+                                pm.install("docling")
-                            and _is_docling_available()
+                            from docling.document_converter import DocumentConverter  # type: ignore
-                        ):
+
-                            content = await asyncio.to_thread(
+                            converter = DocumentConverter()
-                                _convert_with_docling, file_path
+                            result = converter.convert(file_path)
-                            )
+                            content = result.document.export_to_markdown()
                        else:
-                            if (
+                            if not pm.is_installed("pypdf2"):  # type: ignore
-                                global_args.document_loading_engine == "DOCLING"
+                                pm.install("pypdf2")
-                                and not _is_docling_available()
+                            from PyPDF2 import PdfReader  # type: ignore
-                            ):
+                            from io import BytesIO
-                                logger.warning(
+
-                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to pypdf."
+                            pdf_file = BytesIO(file)
-                                )
+                            reader = PdfReader(pdf_file)
-                            # Use pypdf (non-blocking via to_thread)
+                            for page in reader.pages:
-                            content = await asyncio.to_thread(
+                                content += page.extract_text() + "\n"
                                _extract_pdf_pypdf,
                                file,
                                global_args.pdf_decrypt_password,
                            )
                    except Exception as e:
                        error_files = [
                            {
@ -1383,24 +1066,28 @@ async def pipeline_enqueue_file(
                case ".docx":
                    try:
-                        # Try DOCLING first if configured and available
+                        if global_args.document_loading_engine == "DOCLING":
-                        if (
+                            if not pm.is_installed("docling"):  # type: ignore
-                            global_args.document_loading_engine == "DOCLING"
+                                pm.install("docling")
-                            and _is_docling_available()
+                            from docling.document_converter import DocumentConverter  # type: ignore
-                        ):
+
-                            content = await asyncio.to_thread(
+                            converter = DocumentConverter()
-                                _convert_with_docling, file_path
+                            result = converter.convert(file_path)
-                            )
+                            content = result.document.export_to_markdown()
                        else:
-                            if (
+                            if not pm.is_installed("python-docx"):  # type: ignore
-                                global_args.document_loading_engine == "DOCLING"
+                                try:
-                                and not _is_docling_available()
+                                    pm.install("python-docx")
-                            ):
+                                except Exception:
-                                logger.warning(
+                                    pm.install("docx")
-                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-docx."
+                            from docx import Document  # type: ignore
-                                )
+                            from io import BytesIO
-                            # Use python-docx (non-blocking via to_thread)
+
-                            content = await asyncio.to_thread(_extract_docx, file)
+                            docx_file = BytesIO(file)
                            doc = Document(docx_file)
                            content = "\n".join(
                                [paragraph.text for paragraph in doc.paragraphs]
                            )
                    except Exception as e:
                        error_files = [
                            {
@ -1420,24 +1107,26 @@ async def pipeline_enqueue_file(
                case ".pptx":
                    try:
-                        # Try DOCLING first if configured and available
+                        if global_args.document_loading_engine == "DOCLING":
-                        if (
+                            if not pm.is_installed("docling"):  # type: ignore
-                            global_args.document_loading_engine == "DOCLING"
+                                pm.install("docling")
-                            and _is_docling_available()
+                            from docling.document_converter import DocumentConverter  # type: ignore
-                        ):
+
-                            content = await asyncio.to_thread(
+                            converter = DocumentConverter()
-                                _convert_with_docling, file_path
+                            result = converter.convert(file_path)
-                            )
+                            content = result.document.export_to_markdown()
                        else:
-                            if (
+                            if not pm.is_installed("python-pptx"):  # type: ignore
-                                global_args.document_loading_engine == "DOCLING"
+                                pm.install("pptx")
-                                and not _is_docling_available()
+                            from pptx import Presentation  # type: ignore
-                            ):
+                            from io import BytesIO
-                                logger.warning(
+
-                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-pptx."
+                            pptx_file = BytesIO(file)
-                                )
+                            prs = Presentation(pptx_file)
-                            # Use python-pptx (non-blocking via to_thread)
+                            for slide in prs.slides:
-                            content = await asyncio.to_thread(_extract_pptx, file)
+                                for shape in slide.shapes:
                                    if hasattr(shape, "text"):
                                        content += shape.text + "\n"
                    except Exception as e:
                        error_files = [
                            {
@ -1457,24 +1146,33 @@ async def pipeline_enqueue_file(
                case ".xlsx":
                    try:
-                        # Try DOCLING first if configured and available
+                        if global_args.document_loading_engine == "DOCLING":
-                        if (
+                            if not pm.is_installed("docling"):  # type: ignore
-                            global_args.document_loading_engine == "DOCLING"
+                                pm.install("docling")
-                            and _is_docling_available()
+                            from docling.document_converter import DocumentConverter  # type: ignore
-                        ):
+
-                            content = await asyncio.to_thread(
+                            converter = DocumentConverter()
-                                _convert_with_docling, file_path
+                            result = converter.convert(file_path)
-                            )
+                            content = result.document.export_to_markdown()
                        else:
-                            if (
+                            if not pm.is_installed("openpyxl"):  # type: ignore
-                                global_args.document_loading_engine == "DOCLING"
+                                pm.install("openpyxl")
-                                and not _is_docling_available()
+                            from openpyxl import load_workbook  # type: ignore
-                            ):
+                            from io import BytesIO
-                                logger.warning(
+
-                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to openpyxl."
+                            xlsx_file = BytesIO(file)
-                                )
+                            wb = load_workbook(xlsx_file)
-                            # Use openpyxl (non-blocking via to_thread)
+                            for sheet in wb:
-                            content = await asyncio.to_thread(_extract_xlsx, file)
+                                content += f"Sheet: {sheet.title}\n"
                                for row in sheet.iter_rows(values_only=True):
                                    content += (
                                        "\t".join(
                                            str(cell) if cell is not None else ""
                                            for cell in row
                                        )
                                        + "\n"
                                    )
                                content += "\n"
                    except Exception as e:
                        error_files = [
                            {
@ -1772,20 +1470,15 @@ async def background_delete_documents(
    doc_manager: DocumentManager,
    doc_ids: List[str],
    delete_file: bool = False,
    delete_llm_cache: bool = False,
 ):
    """Background task to delete multiple documents"""
    from lightrag.kg.shared_storage import (
        get_namespace_data,
-        get_namespace_lock,
+        get_pipeline_status_lock,
    )
-    pipeline_status = await get_namespace_data(
+    pipeline_status = await get_namespace_data("pipeline_status")
-        "pipeline_status", workspace=rag.workspace
+    pipeline_status_lock = get_pipeline_status_lock()
    )
    pipeline_status_lock = get_namespace_lock(
        "pipeline_status", workspace=rag.workspace
    )
    total_docs = len(doc_ids)
    successful_deletions = []
@ -1801,7 +1494,6 @@ async def background_delete_documents(
        pipeline_status.update(
            {
                "busy": True,
                # Job name can not be changed, it's verified in adelete_by_doc_id()
                "job_name": f"Deleting {total_docs} Documents",
                "job_start": datetime.now().isoformat(),
                "docs": total_docs,
@ -1812,27 +1504,11 @@ async def background_delete_documents(
        )
        # Use slice assignment to clear the list in place
        pipeline_status["history_messages"][:] = ["Starting document deletion process"]
        if delete_llm_cache:
            pipeline_status["history_messages"].append(
                "LLM cache cleanup requested for this deletion job"
            )
    try:
        # Loop through each document ID and delete them one by one
        for i, doc_id in enumerate(doc_ids, 1):
            # Check for cancellation at the start of each document deletion
            async with pipeline_status_lock:
                if pipeline_status.get("cancellation_requested", False):
                    cancel_msg = f"Deletion cancelled by user at document {i}/{total_docs}. {len(successful_deletions)} deleted, {total_docs - i + 1} remaining."
                    logger.info(cancel_msg)
                    pipeline_status["latest_message"] = cancel_msg
                    pipeline_status["history_messages"].append(cancel_msg)
                    # Add remaining documents to failed list with cancellation reason
                    failed_deletions.extend(
                        doc_ids[i - 1 :]
                    )  # i-1 because enumerate starts at 1
                    break  # Exit the loop, remaining documents unchanged
                start_msg = f"Deleting document {i}/{total_docs}: {doc_id}"
                logger.info(start_msg)
                pipeline_status["cur_batch"] = i
@ -1841,9 +1517,7 @@ async def background_delete_documents(
            file_path = "#"
            try:
-                result = await rag.adelete_by_doc_id(
+                result = await rag.adelete_by_doc_id(doc_id)
                    doc_id, delete_llm_cache=delete_llm_cache
                )
                file_path = (
                    getattr(result, "file_path", "-") if "result" in locals() else "-"
                )
@ -1995,10 +1669,6 @@ async def background_delete_documents(
        # Final summary and check for pending requests
        async with pipeline_status_lock:
            pipeline_status["busy"] = False
            pipeline_status["pending_requests"] = False  # Reset pending requests flag
            pipeline_status["cancellation_requested"] = (
                False  # Always reset cancellation flag
            )
            completion_msg = f"Deletion completed: {len(successful_deletions)} successful, {len(failed_deletions)} failed"
            pipeline_status["latest_message"] = completion_msg
            pipeline_status["history_messages"].append(completion_msg)
@ -2275,16 +1945,12 @@ def create_document_routes(
        """
        from lightrag.kg.shared_storage import (
            get_namespace_data,
-            get_namespace_lock,
+            get_pipeline_status_lock,
        )
        # Get pipeline status and lock
-        pipeline_status = await get_namespace_data(
+        pipeline_status = await get_namespace_data("pipeline_status")
-            "pipeline_status", workspace=rag.workspace
+        pipeline_status_lock = get_pipeline_status_lock()
        )
        pipeline_status_lock = get_namespace_lock(
            "pipeline_status", workspace=rag.workspace
        )
        # Check and set status with lock
        async with pipeline_status_lock:
@ -2320,8 +1986,6 @@ def create_document_routes(
                rag.full_docs,
                rag.full_entities,
                rag.full_relations,
                rag.entity_chunks,
                rag.relation_chunks,
                rag.entities_vdb,
                rag.relationships_vdb,
                rag.chunks_vdb,
@ -2475,19 +2139,13 @@ def create_document_routes(
        try:
            from lightrag.kg.shared_storage import (
                get_namespace_data,
                get_namespace_lock,
                get_all_update_flags_status,
            )
-            pipeline_status = await get_namespace_data(
+            pipeline_status = await get_namespace_data("pipeline_status")
                "pipeline_status", workspace=rag.workspace
            )
            pipeline_status_lock = get_namespace_lock(
                "pipeline_status", workspace=rag.workspace
            )
            # Get update flags status for all namespaces
-            update_status = await get_all_update_flags_status(workspace=rag.workspace)
+            update_status = await get_all_update_flags_status()
            # Convert MutableBoolean objects to regular boolean values
            processed_update_status = {}
@ -2501,9 +2159,8 @@ def create_document_routes(
                        processed_flags.append(bool(flag))
                processed_update_status[namespace] = processed_flags
-            async with pipeline_status_lock:
+            # Convert to regular dict if it's a Manager.dict
-                # Convert to regular dict if it's a Manager.dict
+            status_dict = dict(pipeline_status)
                status_dict = dict(pipeline_status)
            # Add processed update_status to the status dictionary
            status_dict["update_status"] = processed_update_status
@ -2543,7 +2200,7 @@ def create_document_routes(
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e))
-    # TODO: Deprecated, use /documents/paginated instead
+    # TODO: Deprecated
    @router.get(
        "", response_model=DocsStatusesResponse, dependencies=[Depends(combined_auth)]
    )
@ -2553,7 +2210,7 @@ def create_document_routes(
        To prevent excessive resource consumption, a maximum of 1,000 records is returned.
        This endpoint retrieves the current status of all documents, grouped by their
-        processing status (PENDING, PROCESSING, PREPROCESSED, PROCESSED, FAILED). The results are
+        processing status (PENDING, PROCESSING, PROCESSED, FAILED). The results are
        limited to 1000 total documents with fair distribution across all statuses.
        Returns:
@ -2569,7 +2226,6 @@ def create_document_routes(
            statuses = (
                DocStatus.PENDING,
                DocStatus.PROCESSING,
                DocStatus.PREPROCESSED,
                DocStatus.PROCESSED,
                DocStatus.FAILED,
            )
@ -2668,20 +2324,21 @@ def create_document_routes(
        Delete documents and all their associated data by their IDs using background processing.
        Deletes specific documents and all their associated data, including their status,
-        text chunks, vector embeddings, and any related graph data. When requested,
+        text chunks, vector embeddings, and any related graph data.
        cached LLM extraction responses are removed after graph deletion/rebuild completes.
        The deletion process runs in the background to avoid blocking the client connection.
        It is disabled when llm cache for entity extraction is disabled.
        This operation is irreversible and will interact with the pipeline status.
        Args:
-            delete_request (DeleteDocRequest): The request containing the document IDs and deletion options.
+            delete_request (DeleteDocRequest): The request containing the document IDs and delete_file options.
            background_tasks: FastAPI BackgroundTasks for async processing
        Returns:
            DeleteDocByIdResponse: The result of the deletion operation.
                - status="deletion_started": The document deletion has been initiated in the background.
                - status="busy": The pipeline is busy with another operation.
                - status="not_allowed": Operation not allowed when LLM cache for entity extraction is disabled.
        Raises:
            HTTPException:
@ -2689,27 +2346,27 @@ def create_document_routes(
        """
        doc_ids = delete_request.doc_ids
        # The rag object is initialized from the server startup args,
        # so we can access its properties here.
        if not rag.enable_llm_cache_for_entity_extract:
            return DeleteDocByIdResponse(
                status="not_allowed",
                message="Operation not allowed when LLM cache for entity extraction is disabled.",
                doc_id=", ".join(delete_request.doc_ids),
            )
        try:
-            from lightrag.kg.shared_storage import (
+            from lightrag.kg.shared_storage import get_namespace_data
                get_namespace_data,
                get_namespace_lock,
            )
-            pipeline_status = await get_namespace_data(
+            pipeline_status = await get_namespace_data("pipeline_status")
                "pipeline_status", workspace=rag.workspace
            )
            pipeline_status_lock = get_namespace_lock(
                "pipeline_status", workspace=rag.workspace
            )
-            # Check if pipeline is busy with proper lock
+            # Check if pipeline is busy
-            async with pipeline_status_lock:
+            if pipeline_status.get("busy", False):
-                if pipeline_status.get("busy", False):
+                return DeleteDocByIdResponse(
-                    return DeleteDocByIdResponse(
+                    status="busy",
-                        status="busy",
+                    message="Cannot delete documents while pipeline is busy",
-                        message="Cannot delete documents while pipeline is busy",
+                    doc_id=", ".join(doc_ids),
-                        doc_id=", ".join(doc_ids),
+                )
                    )
            # Add deletion task to background tasks
            background_tasks.add_task(
@ -2718,7 +2375,6 @@ def create_document_routes(
                doc_manager,
                doc_ids,
                delete_request.delete_file,
                delete_request.delete_llm_cache,
            )
            return DeleteDocByIdResponse(
@ -3076,67 +2732,4 @@ def create_document_routes(
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e))
    @router.post(
        "/cancel_pipeline",
        response_model=CancelPipelineResponse,
        dependencies=[Depends(combined_auth)],
    )
    async def cancel_pipeline():
        """
        Request cancellation of the currently running pipeline.
        This endpoint sets a cancellation flag in the pipeline status. The pipeline will:
        1. Check this flag at key processing points
        2. Stop processing new documents
        3. Cancel all running document processing tasks
        4. Mark all PROCESSING documents as FAILED with reason "User cancelled"
        The cancellation is graceful and ensures data consistency. Documents that have
        completed processing will remain in PROCESSED status.
        Returns:
            CancelPipelineResponse: Response with status and message
                - status="cancellation_requested": Cancellation flag has been set
                - status="not_busy": Pipeline is not currently running
        Raises:
            HTTPException: If an error occurs while setting cancellation flag (500).
        """
        try:
            from lightrag.kg.shared_storage import (
                get_namespace_data,
                get_namespace_lock,
            )
            pipeline_status = await get_namespace_data(
                "pipeline_status", workspace=rag.workspace
            )
            pipeline_status_lock = get_namespace_lock(
                "pipeline_status", workspace=rag.workspace
            )
            async with pipeline_status_lock:
                if not pipeline_status.get("busy", False):
                    return CancelPipelineResponse(
                        status="not_busy",
                        message="Pipeline is not currently running. No cancellation needed.",
                    )
                # Set cancellation flag
                pipeline_status["cancellation_requested"] = True
                cancel_msg = "Pipeline cancellation requested by user"
                logger.info(cancel_msg)
                pipeline_status["latest_message"] = cancel_msg
                pipeline_status["history_messages"].append(cancel_msg)
            return CancelPipelineResponse(
                status="cancellation_requested",
                message="Pipeline cancellation has been requested. Documents will be marked as FAILED.",
            )
        except Exception as e:
            logger.error(f"Error requesting pipeline cancellation: {str(e)}")
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e))
    return router
--- a/lightrag_webui/src/api/lightrag.ts
+++ b/lightrag_webui/src/api/lightrag.ts
@ -155,6 +155,12 @@ export type ScanResponse = {
  track_id: string
 }
 export type ReprocessFailedResponse = {
  status: 'reprocessing_started'
  message: string
  track_id: string
 }
 export type DeleteDocResponse = {
  status: 'deletion_started' | 'busy' | 'not_allowed'
  message: string
@ -305,6 +311,11 @@ export const scanNewDocuments = async (): Promise<ScanResponse> => {
  return response.data
 }
 export const reprocessFailedDocuments = async (): Promise<ReprocessFailedResponse> => {
  const response = await axiosInstance.post('/documents/reprocess_failed')
  return response.data
 }
 export const getDocumentsScanProgress = async (): Promise<LightragDocumentsScanProgress> => {
  const response = await axiosInstance.get('/documents/scan-progress')
  return response.data
--- a/lightrag_webui/src/features/DocumentManager.tsx
+++ b/lightrag_webui/src/features/DocumentManager.tsx
@ -22,6 +22,7 @@ import PaginationControls from '@/components/ui/PaginationControls'
 import {
  scanNewDocuments,
  reprocessFailedDocuments,
  getDocumentsPaginated,
  DocsStatusesResponse,
  DocStatus,
@ -913,6 +914,42 @@ export default function DocumentManager() {
    }
  }, [t, startPollingInterval, currentTab, health, statusCounts])
  const retryFailedDocuments = useCallback(async () => {
    try {
      // Check if component is still mounted before starting the request
      if (!isMountedRef.current) return;
      const { status, message, track_id: _track_id } = await reprocessFailedDocuments(); // eslint-disable-line @typescript-eslint/no-unused-vars
      // Check again if component is still mounted after the request completes
      if (!isMountedRef.current) return;
      // Note: _track_id is available for future use (e.g., progress tracking)
      toast.message(message || status);
      // Reset health check timer with 1 second delay to avoid race condition
      useBackendState.getState().resetHealthCheckTimerDelayed(1000);
      // Start fast refresh with 2-second interval immediately after retry
      startPollingInterval(2000);
      // Set recovery timer to restore normal polling interval after 15 seconds
      setTimeout(() => {
        if (isMountedRef.current && currentTab === 'documents' && health) {
          // Restore intelligent polling interval based on document status
          const hasActiveDocuments = (statusCounts.processing || 0) > 0 || (statusCounts.pending || 0) > 0;
          const normalInterval = hasActiveDocuments ? 5000 : 30000;
          startPollingInterval(normalInterval);
        }
      }, 15000); // Restore after 15 seconds
    } catch (err) {
      // Only show error if component is still mounted
      if (isMountedRef.current) {
        toast.error(errorMessage(err));
      }
    }
  }, [startPollingInterval, currentTab, health, statusCounts])
  // Handle page size change - update state and save to store
  const handlePageSizeChange = useCallback((newPageSize: number) => {
    if (newPageSize === pagination.page_size) return;
@ -1289,6 +1326,16 @@ export default function DocumentManager() {
            >
              <RefreshCwIcon /> {t('documentPanel.documentManager.scanButton')}
            </Button>
            <Button
              variant="outline"
              onClick={retryFailedDocuments}
              side="bottom"
              tooltip={t('documentPanel.documentManager.retryFailedTooltip')}
              size="sm"
              disabled={pipelineBusy}
            >
              <RotateCcwIcon /> {t('documentPanel.documentManager.retryFailedButton')}
            </Button>
            <Button
              variant="outline"
              onClick={() => setShowPipelineStatus(true)}
--- a/lightrag_webui/src/locales/en.json
+++ b/lightrag_webui/src/locales/en.json
@ -70,7 +70,6 @@
      "confirmButton": "YES",
      "deleteFileOption": "Also delete uploaded files",
      "deleteFileTooltip": "Check this option to also delete the corresponding uploaded files on the server",
      "deleteLLMCacheOption": "Also delete extracted LLM cache",
      "success": "Document deletion pipeline started successfully",
      "failed": "Delete Documents Failed:\n{{message}}",
      "error": "Delete Documents Failed:\n{{error}}",
@ -115,12 +114,12 @@
    "documentManager": {
      "title": "Document Management",
      "scanButton": "Scan",
-      "scanTooltip": "Scan and process documents in input folder, and also reprocess all failed documents",
+      "scanTooltip": "Scan documents in input folder",
-      "retryFailedButton": "Retry",
+      "retryFailedButton": "Retry Failed",
      "retryFailedTooltip": "Retry processing all failed documents",
      "refreshTooltip": "Reset document list",
-      "pipelineStatusButton": "Pipeline",
+      "pipelineStatusButton": "Pipeline Status",
-      "pipelineStatusTooltip": "View document processing pipeline status",
+      "pipelineStatusTooltip": "View pipeline status",
      "uploadedTitle": "Uploaded Documents",
      "uploadedDescription": "List of uploaded documents and their statuses.",
      "emptyTitle": "No Documents",
@ -140,7 +139,6 @@
      "status": {
        "all": "All",
        "completed": "Completed",
        "preprocessed": "Preprocessed",
        "processing": "Processing",
        "pending": "Pending",
        "failed": "Failed"
@ -160,25 +158,14 @@
      "title": "Pipeline Status",
      "busy": "Pipeline Busy",
      "requestPending": "Request Pending",
      "cancellationRequested": "Cancellation Requested",
      "jobName": "Job Name",
      "startTime": "Start Time",
      "progress": "Progress",
-      "unit": "Batch",
+      "unit": "batch",
      "latestMessage": "Latest Message",
      "historyMessages": "History Messages",
      "cancelButton": "Cancel",
      "cancelTooltip": "Cancel pipeline processing",
      "cancelConfirmTitle": "Confirm Pipeline Cancellation",
      "cancelConfirmDescription": "This will interrupt the ongoing pipeline processing. Are you sure you want to continue?",
      "cancelConfirmButton": "Confirm Cancellation",
      "cancelInProgress": "Cancellation in progress...",
      "pipelineNotRunning": "Pipeline not running",
      "cancelSuccess": "Pipeline cancellation requested",
      "cancelFailed": "Failed to cancel pipeline\n{{error}}",
      "cancelNotBusy": "Pipeline is not running, no need to cancel",
      "errors": {
-        "fetchFailed": "Failed to fetch pipeline status\n{{error}}"
+        "fetchFailed": "Failed to get pipeline status\n{{error}}"
      }
    }
  },
@ -330,9 +317,9 @@
          "description": "Description",
          "entity_id": "Name",
          "entity_type": "Type",
-          "source_id": "C-ID",
+          "source_id": "SrcID",
          "Neighbour": "Neigh",
-          "file_path": "File",
+          "file_path": "Source",
          "keywords": "Keys",
          "weight": "Weight"
        }
@ -348,14 +335,14 @@
    },
    "search": {
      "placeholder": "Search nodes in page...",
-      "message": "And {{count}} others"
+      "message": "And {count} others"
    },
    "graphLabels": {
      "selectTooltip": "Get subgraph of a node (label)",
      "noLabels": "No matching nodes found",
      "label": "Search node name",
      "placeholder": "Search node name...",
-      "andOthers": "And {{count}} others",
+      "andOthers": "And {count} others",
      "refreshGlobalTooltip": "Refresh global graph data and reset search history",
      "refreshCurrentLabelTooltip": "Refresh current page graph data",
      "refreshingTooltip": "Refreshing data..."
--- a/lightrag_webui/src/locales/zh.json
+++ b/lightrag_webui/src/locales/zh.json
@ -70,7 +70,6 @@
      "confirmButton": "确定",
      "deleteFileOption": "同时删除上传文件",
      "deleteFileTooltip": "选中此选项将同时删除服务器上对应的上传文件",
      "deleteLLMCacheOption": "同时删除实体关系抽取 LLM 缓存",
      "success": "文档删除流水线启动成功",
      "failed": "删除文档失败：\n{{message}}",
      "error": "删除文档失败：\n{{error}}",
@ -115,12 +114,12 @@
    "documentManager": {
      "title": "文档管理",
      "scanButton": "扫描",
-      "scanTooltip": "扫描处理输入目录中的文档，同时重新处理所有失败的文档",
+      "scanTooltip": "扫描输入目录中的文档",
-      "retryFailedButton": "重试",
+      "retryFailedButton": "重试失败",
      "retryFailedTooltip": "重新处理所有失败的文档",
      "refreshTooltip": "复位文档清单",
-      "pipelineStatusButton": "流水线",
+      "pipelineStatusButton": "流水线状态",
-      "pipelineStatusTooltip": "查看文档处理流水线状态",
+      "pipelineStatusTooltip": "查看流水线状态",
      "uploadedTitle": "已上传文档",
      "uploadedDescription": "已上传文档列表及其状态",
      "emptyTitle": "无文档",
@ -140,7 +139,6 @@
      "status": {
        "all": "全部",
        "completed": "已完成",
        "preprocessed": "预处理",
        "processing": "处理中",
        "pending": "等待中",
        "failed": "失败"
@ -160,23 +158,12 @@
      "title": "流水线状态",
      "busy": "流水线忙碌",
      "requestPending": "待处理请求",
      "cancellationRequested": "取消请求",
      "jobName": "作业名称",
      "startTime": "开始时间",
      "progress": "进度",
      "unit": "批",
      "latestMessage": "最新消息",
      "historyMessages": "历史消息",
      "cancelButton": "中断",
      "cancelTooltip": "中断流水线处理",
      "cancelConfirmTitle": "确认中断流水线",
      "cancelConfirmDescription": "此操作将中断正在进行的流水线处理。确定要继续吗？",
      "cancelConfirmButton": "确认中断",
      "cancelInProgress": "取消请求进行中...",
      "pipelineNotRunning": "流水线未运行",
      "cancelSuccess": "流水线中断请求已发送",
      "cancelFailed": "中断流水线失败\n{{error}}",
      "cancelNotBusy": "流水线未运行，无需中断",
      "errors": {
        "fetchFailed": "获取流水线状态失败\n{{error}}"
      }
@ -330,9 +317,9 @@
          "description": "描述",
          "entity_id": "名称",
          "entity_type": "类型",
-          "source_id": "C-ID",
+          "source_id": "信源ID",
          "Neighbour": "邻接",
-          "file_path": "文件",
+          "file_path": "信源",
          "keywords": "Keys",
          "weight": "权重"
        }
@ -348,14 +335,14 @@
    },
    "search": {
      "placeholder": "页面内搜索节点...",
-      "message": "还有 {{count}} 个"
+      "message": "还有 {count} 个"
    },
    "graphLabels": {
      "selectTooltip": "获取节点(标签)子图",
      "noLabels": "未找到匹配的节点",
      "label": "搜索节点名称",
      "placeholder": "搜索节点名称...",
-      "andOthers": "还有 {{count}} 个",
+      "andOthers": "还有 {count} 个",
      "refreshGlobalTooltip": "刷新全图数据和重置搜索历史",
      "refreshCurrentLabelTooltip": "刷新当前页面图数据",
      "refreshingTooltip": "正在刷新数据..."