refactor: move document deps to api group, remove dynamic imports

- Merge offline-docs into api extras - Remove pipmaster dynamic installs - Add async document processing - Pre-check docling availability - Update offline deployment docs
2025-11-13 13:34:09 +08:00 · 2025-11-13 13:34:09 +08:00 · 69a0b74ce7
commit 69a0b74ce7
parent 7d394fb0a4
4 changed files with 205 additions and 191 deletions
--- a/docs/OfflineDeployment.md
+++ b/docs/OfflineDeployment.md
@ -23,10 +23,11 @@ LightRAG uses dynamic package installation (`pipmaster`) for optional features b
 LightRAG dynamically installs packages for:
 - **Document Processing**: `docling`, `pypdf`, `python-docx`, `python-pptx`, `openpyxl`
 - **Storage Backends**: `redis`, `neo4j`, `pymilvus`, `pymongo`, `asyncpg`, `qdrant-client`
 - **LLM Providers**: `openai`, `anthropic`, `ollama`, `zhipuai`, `aioboto3`, `voyageai`, `llama-index`, `lmdeploy`, `transformers`, `torch`
-- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
+- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
 **Note**: Document processing dependencies (`pypdf`, `python-docx`, `python-pptx`, `openpyxl`) are now pre-installed with the `api` extras group and no longer require dynamic installation.
 ## Quick Start
@ -75,32 +76,31 @@ LightRAG provides flexible dependency groups for different use cases:
 | Group | Description | Use Case |
 |-------|-------------|----------|
-| `offline-docs` | Document processing | PDF, DOCX, PPTX, XLSX files |
+| `api` | API server + document processing | FastAPI server with PDF, DOCX, PPTX, XLSX support |
 | `offline-storage` | Storage backends | Redis, Neo4j, MongoDB, PostgreSQL, etc. |
 | `offline-llm` | LLM providers | OpenAI, Anthropic, Ollama, etc. |
-| `offline` | All of the above | Complete offline deployment |
+| `offline` | Complete offline package | API + Storage + LLM (all features) |
 **Note**: Document processing (PDF, DOCX, PPTX, XLSX) is included in the `api` extras group. The previous `offline-docs` group has been merged into `api` for better integration.
 > Software packages requiring `transformers`, `torch`, or `cuda` will not be included in the offline dependency group.
 ### Installation Examples
 ```bash
-# Install only document processing dependencies
+# Install API with document processing
-pip install lightrag-hku[offline-docs]
+pip install lightrag-hku[api]
-# Install document processing and storage backends
+# Install API and storage backends
-pip install lightrag-hku[offline-docs,offline-storage]
+pip install lightrag-hku[api,offline-storage]
-# Install all offline dependencies
+# Install all offline dependencies (recommended for offline deployment)
 pip install lightrag-hku[offline]
 ```
 ### Using Individual Requirements Files
 ```bash
 # Document processing only (now provided by the `api` extras; requirements-offline-docs.txt has been removed)
 pip install lightrag-hku[api]
 # Storage backends only
 pip install -r requirements-offline-storage.txt
@ -244,8 +244,8 @@ ls -la ~/.tiktoken_cache/
 **Solution**:
 ```bash
 # Pre-install the specific package you need
-# For document processing:
+# For API with document processing:
-pip install lightrag-hku[offline-docs]
+pip install lightrag-hku[api]
 # For storage backends:
 pip install lightrag-hku[offline-storage]
@ -297,9 +297,9 @@ mkdir -p ~/my_tiktoken_cache
 5. **Minimal Installation**: Only install what you need:
   ```bash
-   # If you only process PDFs with OpenAI
+   # If you only need API with document processing
-   pip install lightrag-hku[offline-docs]
+   pip install lightrag-hku[api]
-   # Then manually add: pip install openai
+   # Then manually add specific LLM: pip install openai
   ```
 ## Additional Resources
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@ -7,10 +7,10 @@ from lightrag.utils import logger, get_pinyin_sort_key
 import aiofiles
 import shutil
 import traceback
 import pipmaster as pm
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Literal
 from io import BytesIO
 from fastapi import (
    APIRouter,
    BackgroundTasks,
@ -27,6 +27,20 @@ from lightrag.utils import generate_track_id
 from lightrag.api.utils_api import get_combined_auth_dependency
 from ..config import global_args
 # Probe for the optional 'docling' package once, while this module is being
 # imported, and record the outcome in DOCLING_AVAILABLE.
 try:
    import docling  # noqa: F401  # type: ignore[import-not-found]
 except ImportError:
    DOCLING_AVAILABLE = False
    # Warn only when the configuration explicitly selects the DOCLING engine
    if global_args.document_loading_engine == "DOCLING":
        logger.warning(
            "DOCLING engine requested but 'docling' package not installed. "
            "Falling back to standard document processing. "
            "To use DOCLING, install with: pip install lightrag-hku[api,docling]"
        )
 else:
    DOCLING_AVAILABLE = True
 # Function to format datetime to ISO format string with timezone information
 def format_datetime(dt: Any) -> Optional[str]:
@ -879,7 +893,6 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
    Returns:
        str: Unique filename (may have numeric suffix added)
    """
    from pathlib import Path
    import time
    original_path = Path(original_name)
@ -902,6 +915,122 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
    return f"{base_name}_{timestamp}{extension}"
 # Document processing helper functions (synchronous)
 # These functions run in thread pool via asyncio.to_thread() to avoid blocking the event loop
 def _convert_with_docling(file_path: Path) -> str:
    """Run docling on a document and return its Markdown export (synchronous).
    Args:
        file_path: Location of the document to convert
    Returns:
        str: Markdown exported from the converted document
    """
    from docling.document_converter import DocumentConverter  # type: ignore
    conversion = DocumentConverter().convert(file_path)
    markdown = conversion.document.export_to_markdown()
    return markdown
 def _extract_pdf_pypdf(file_bytes: bytes, password: Optional[str] = None) -> str:
    """Extract the text of every page of a PDF using pypdf (synchronous).
    Args:
        file_bytes: PDF file content as bytes
        password: Password for encrypted PDFs; None or an empty string is
            treated as "no password provided"
    Returns:
        str: Text of all pages, each page followed by a newline
    Raises:
        Exception: If the PDF is encrypted and the password is missing or
            is rejected during decryption
    """
    from pypdf import PdfReader  # type: ignore
    reader = PdfReader(BytesIO(file_bytes))
    if reader.is_encrypted:
        if not password:
            raise Exception("PDF is encrypted but no password provided")
        # A decrypt() result of 0 is treated as a rejected password
        if reader.decrypt(password) == 0:
            raise Exception("Incorrect PDF password")
    # Build the result with a single join instead of growing a string page by page
    return "".join(page.extract_text() + "\n" for page in reader.pages)
 def _extract_docx(file_bytes: bytes) -> str:
    """Return the paragraph text of a DOCX file, one paragraph per line (synchronous).
    Args:
        file_bytes: Raw bytes of the DOCX file
    Returns:
        str: Paragraph texts joined with newline characters
    """
    from docx import Document  # type: ignore
    document = Document(BytesIO(file_bytes))
    paragraph_texts = (para.text for para in document.paragraphs)
    return "\n".join(paragraph_texts)
 def _extract_pptx(file_bytes: bytes) -> str:
    """Collect the text of every text-bearing shape in a PPTX file (synchronous).
    Args:
        file_bytes: Raw bytes of the PPTX file
    Returns:
        str: Shape texts in slide order, each followed by a newline
    """
    from pptx import Presentation  # type: ignore
    presentation = Presentation(BytesIO(file_bytes))
    pieces = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            # Shapes that expose no "text" attribute contribute nothing
            if not hasattr(shape, "text"):
                continue
            pieces.append(shape.text + "\n")
    return "".join(pieces)
 def _extract_xlsx(file_bytes: bytes) -> str:
    """Render every sheet of an XLSX workbook as tab-separated text (synchronous).
    Args:
        file_bytes: Raw bytes of the XLSX file
    Returns:
        str: For each sheet, a "Sheet: <title>" line, one tab-separated line
            per row (empty cells become empty strings), then a blank line
    """
    from openpyxl import load_workbook  # type: ignore
    workbook = load_workbook(BytesIO(file_bytes))
    chunks = []
    for worksheet in workbook:
        chunks.append(f"Sheet: {worksheet.title}\n")
        for values in worksheet.iter_rows(values_only=True):
            cells = ["" if value is None else str(value) for value in values]
            chunks.append("\t".join(cells) + "\n")
        # Blank line separates one sheet from the next
        chunks.append("\n")
    return "".join(chunks)
 async def pipeline_enqueue_file(
    rag: LightRAG, file_path: Path, track_id: str = None
 ) -> tuple[bool, str]:
@ -1072,87 +1201,21 @@ async def pipeline_enqueue_file(
                case ".pdf":
                    try:
-                        if global_args.document_loading_engine == "DOCLING":
+                        # Try DOCLING first if configured and available
-                            if not pm.is_installed("docling"):  # type: ignore
+                        if (
-                                pm.install("docling")
+                            global_args.document_loading_engine == "DOCLING"
-                            from docling.document_converter import DocumentConverter  # type: ignore
+                            and DOCLING_AVAILABLE
-
+                        ):
-                            converter = DocumentConverter()
+                            content = await asyncio.to_thread(
-                            result = converter.convert(file_path)
+                                _convert_with_docling, file_path
-                            content = result.document.export_to_markdown()
+                            )
                        else:
-                            if not pm.is_installed("pypdf"):  # type: ignore
+                            # Use pypdf (non-blocking via to_thread)
-                                pm.install("pypdf")
+                            content = await asyncio.to_thread(
-                            if not pm.is_installed("pycryptodome"):  # type: ignore
+                                _extract_pdf_pypdf,
-                                pm.install("pycryptodome")
+                                file,
-                            from pypdf import PdfReader  # type: ignore
+                                global_args.pdf_decrypt_password,
                            from io import BytesIO
                            pdf_file = BytesIO(file)
                            reader = PdfReader(pdf_file)
                            # Check if PDF is encrypted
                            if reader.is_encrypted:
                                pdf_password = global_args.pdf_decrypt_password
                                if not pdf_password:
                                    # PDF is encrypted but no password provided
                                    error_files = [
                                        {
                                            "file_path": str(file_path.name),
                                            "error_description": "[File Extraction]PDF is encrypted but no password provided",
                                            "original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
                                            "file_size": file_size,
                                        }
                                    ]
                                    await rag.apipeline_enqueue_error_documents(
                                        error_files, track_id
                            )
                                    logger.error(
                                        f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
                                    )
                                    return False, track_id
                                # Try to decrypt with password
                                try:
                                    decrypt_result = reader.decrypt(pdf_password)
                                    if decrypt_result == 0:
                                        # Password is incorrect
                                        error_files = [
                                            {
                                                "file_path": str(file_path.name),
                                                "error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
                                                "original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
                                                "file_size": file_size,
                                            }
                                        ]
                                        await rag.apipeline_enqueue_error_documents(
                                            error_files, track_id
                                        )
                                        logger.error(
                                            f"[File Extraction]Incorrect PDF password: {file_path.name}"
                                        )
                                        return False, track_id
                                except Exception as decrypt_error:
                                    # Decryption process error
                                    error_files = [
                                        {
                                            "file_path": str(file_path.name),
                                            "error_description": "[File Extraction]PDF decryption failed",
                                            "original_error": f"Error during PDF decryption: {str(decrypt_error)}",
                                            "file_size": file_size,
                                        }
                                    ]
                                    await rag.apipeline_enqueue_error_documents(
                                        error_files, track_id
                                    )
                                    logger.error(
                                        f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
                                    )
                                    return False, track_id
                            # Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
                            for page in reader.pages:
                                content += page.extract_text() + "\n"
                    except Exception as e:
                        error_files = [
                            {
@ -1172,28 +1235,17 @@ async def pipeline_enqueue_file(
                case ".docx":
                    try:
-                        if global_args.document_loading_engine == "DOCLING":
+                        # Try DOCLING first if configured and available
-                            if not pm.is_installed("docling"):  # type: ignore
+                        if (
-                                pm.install("docling")
+                            global_args.document_loading_engine == "DOCLING"
-                            from docling.document_converter import DocumentConverter  # type: ignore
+                            and DOCLING_AVAILABLE
-
+                        ):
-                            converter = DocumentConverter()
+                            content = await asyncio.to_thread(
-                            result = converter.convert(file_path)
+                                _convert_with_docling, file_path
                            content = result.document.export_to_markdown()
                        else:
                            if not pm.is_installed("python-docx"):  # type: ignore
                                try:
                                    pm.install("python-docx")
                                except Exception:
                                    pm.install("docx")
                            from docx import Document  # type: ignore
                            from io import BytesIO
                            docx_file = BytesIO(file)
                            doc = Document(docx_file)
                            content = "\n".join(
                                [paragraph.text for paragraph in doc.paragraphs]
                            )
                        else:
                            # Use python-docx (non-blocking via to_thread)
                            content = await asyncio.to_thread(_extract_docx, file)
                    except Exception as e:
                        error_files = [
                            {
@ -1213,26 +1265,17 @@ async def pipeline_enqueue_file(
                case ".pptx":
                    try:
-                        if global_args.document_loading_engine == "DOCLING":
+                        # Try DOCLING first if configured and available
-                            if not pm.is_installed("docling"):  # type: ignore
+                        if (
-                                pm.install("docling")
+                            global_args.document_loading_engine == "DOCLING"
-                            from docling.document_converter import DocumentConverter  # type: ignore
+                            and DOCLING_AVAILABLE
-
+                        ):
-                            converter = DocumentConverter()
+                            content = await asyncio.to_thread(
-                            result = converter.convert(file_path)
+                                _convert_with_docling, file_path
-                            content = result.document.export_to_markdown()
+                            )
                        else:
-                            if not pm.is_installed("python-pptx"):  # type: ignore
+                            # Use python-pptx (non-blocking via to_thread)
-                                pm.install("pptx")
+                            content = await asyncio.to_thread(_extract_pptx, file)
                            from pptx import Presentation  # type: ignore
                            from io import BytesIO
                            pptx_file = BytesIO(file)
                            prs = Presentation(pptx_file)
                            for slide in prs.slides:
                                for shape in slide.shapes:
                                    if hasattr(shape, "text"):
                                        content += shape.text + "\n"
                    except Exception as e:
                        error_files = [
                            {
@ -1252,33 +1295,17 @@ async def pipeline_enqueue_file(
                case ".xlsx":
                    try:
-                        if global_args.document_loading_engine == "DOCLING":
+                        # Try DOCLING first if configured and available
-                            if not pm.is_installed("docling"):  # type: ignore
+                        if (
-                                pm.install("docling")
+                            global_args.document_loading_engine == "DOCLING"
-                            from docling.document_converter import DocumentConverter  # type: ignore
+                            and DOCLING_AVAILABLE
-
+                        ):
-                            converter = DocumentConverter()
+                            content = await asyncio.to_thread(
-                            result = converter.convert(file_path)
+                                _convert_with_docling, file_path
-                            content = result.document.export_to_markdown()
+                            )
                        else:
-                            if not pm.is_installed("openpyxl"):  # type: ignore
+                            # Use openpyxl (non-blocking via to_thread)
-                                pm.install("openpyxl")
+                            content = await asyncio.to_thread(_extract_xlsx, file)
                            from openpyxl import load_workbook  # type: ignore
                            from io import BytesIO
                            xlsx_file = BytesIO(file)
                            wb = load_workbook(xlsx_file)
                            for sheet in wb:
                                content += f"Sheet: {sheet.title}\n"
                                for row in sheet.iter_rows(values_only=True):
                                    content += (
                                        "\t".join(
                                            str(cell) if cell is not None else ""
                                            for cell in row
                                        )
                                        + "\n"
                                    )
                                content += "\n"
                    except Exception as e:
                        error_files = [
                            {
--- a/pyproject.toml
+++ b/pyproject.toml
@ -79,18 +79,20 @@ api = [
    "python-multipart",
    "pytz",
    "uvicorn",
    # Document processing dependencies (required for API document upload functionality)
    "openpyxl>=3.0.0,<4.0.0",      # XLSX processing
    "pycryptodome>=3.0.0,<4.0.0",  # PDF encryption support
    "pypdf>=6.1.0",                 # PDF processing
    "python-docx>=0.8.11,<2.0.0",  # DOCX processing
    "python-pptx>=0.6.21,<2.0.0",  # PPTX processing
 ]
 # Advanced document processing engine (optional)
 docling = [
    "docling>=2.0.0,<3.0.0",
 ]
 # Offline deployment dependencies (layered design for flexibility)
 offline-docs = [
    # Document processing dependencies
    "openpyxl>=3.0.0,<4.0.0",
    "pycryptodome>=3.0.0,<4.0.0",
    "pypdf>=6.1.0",
    "python-docx>=0.8.11,<2.0.0",
    "python-pptx>=0.6.21,<2.0.0",
 ]
 offline-storage = [
    # Storage backend dependencies
    "redis>=5.0.0,<8.0.0",
@ -115,8 +117,8 @@ offline-llm = [
 ]
 offline = [
-    # Complete offline package (includes all offline dependencies)
+    # Complete offline package (includes api for document processing, plus storage and LLM)
-    "lightrag-hku[offline-docs,offline-storage,offline-llm]",
+    "lightrag-hku[api,offline-storage,offline-llm]",
 ]
 evaluation = [
--- a/requirements-offline-docs.txt
+++ b/requirements-offline-docs.txt
@ -1,15 +0,0 @@
 # LightRAG Offline Dependencies - Document Processing
 # Install with: pip install -r requirements-offline-docs.txt
 # For offline installation:
 #   pip download -r requirements-offline-docs.txt -d ./packages
 #   pip install --no-index --find-links=./packages -r requirements-offline-docs.txt
 #
 # Recommended: Use pip install lightrag-hku[offline-docs] for the same effect
 # Or use constraints: pip install --constraint constraints-offline.txt -r requirements-offline-docs.txt
 # Document processing dependencies (with version constraints matching pyproject.toml)
 openpyxl>=3.0.0,<4.0.0
 pycryptodome>=3.0.0,<4.0.0
 pypdf>=6.1.0
 python-docx>=0.8.11,<2.0.0
 python-pptx>=0.6.21,<2.0.0