Improve docling integration with macOS compatibility and CLI flag

- Add --docling CLI flag for easier setup
- Add numpy version constraints
- Exclude docling on macOS (fork-safety)
2025-11-13 18:58:09 +08:00 · 2025-11-13 18:58:09 +08:00 · c246eff725
commit c246eff725
parent 63510478e5
4 changed files with 874 additions and 681 deletions
--- a/lightrag/api/config.py
+++ b/lightrag/api/config.py
@ -258,6 +258,14 @@ def parse_args() -> argparse.Namespace:
        help=f"Rerank binding type (default: from env or {DEFAULT_RERANK_BINDING})",
    )
    # Document loading engine configuration
    parser.add_argument(
        "--docling",
        action="store_true",
        default=False,
        help="Enable DOCLING document loading engine (default: from env or DEFAULT)",
    )
    # Conditionally add binding options defined in binding_options module
    # This will add command line arguments for all binding options (e.g., --ollama-embedding-num_ctx)
    # and corresponding environment variables (e.g., OLLAMA_EMBEDDING_NUM_CTX)
@ -371,8 +379,13 @@ def parse_args() -> argparse.Namespace:
    )
    args.enable_llm_cache = get_env_value("ENABLE_LLM_CACHE", True, bool)
-    # Select Document loading tool (DOCLING, DEFAULT)
+    # Select document loading engine: --docling flag takes precedence over the DOCUMENT_LOADING_ENGINE env var
-    args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
+    if args.docling:
        args.document_loading_engine = "DOCLING"
    else:
        args.document_loading_engine = get_env_value(
            "DOCUMENT_LOADING_ENGINE", "DEFAULT"
        )
    # PDF decryption password
    args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
 """
 import asyncio
 from functools import lru_cache
 from lightrag.utils import logger, get_pinyin_sort_key
 import aiofiles
 import shutil
@ -27,19 +28,23 @@ from lightrag.utils import generate_track_id
 from lightrag.api.utils_api import get_combined_auth_dependency
 from ..config import global_args
 # Check docling availability at module load time
 DOCLING_AVAILABLE = False
 try:
    import docling  # noqa: F401  # type: ignore[import-not-found]
-    DOCLING_AVAILABLE = True
+@lru_cache(maxsize=1)
-except ImportError:
+def _is_docling_available() -> bool:
-    if global_args.document_loading_engine == "DOCLING":
+    """Check if docling is available (cached check).
-        logger.warning(
+
-            "DOCLING engine requested but 'docling' package not installed. "
+    This function uses lru_cache to avoid repeated import attempts.
-            "Falling back to standard document processing. "
+    The result is cached after the first call.
-            "To use DOCLING, install with: pip install lightrag-hku[api,docling]"
+
-        )
+    Returns:
        bool: True if docling is available, False otherwise
    """
    try:
        import docling  # noqa: F401  # type: ignore[import-not-found]
        return True
    except ImportError:
        return False
 # Function to format datetime to ISO format string with timezone information
@ -1204,12 +1209,19 @@ async def pipeline_enqueue_file(
                        # Try DOCLING first if configured and available
                        if (
                            global_args.document_loading_engine == "DOCLING"
-                            and DOCLING_AVAILABLE
+                            and _is_docling_available()
                        ):
                            content = await asyncio.to_thread(
                                _convert_with_docling, file_path
                            )
                        else:
                            if (
                                global_args.document_loading_engine == "DOCLING"
                                and not _is_docling_available()
                            ):
                                logger.warning(
                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to pypdf."
                                )
                            # Use pypdf (non-blocking via to_thread)
                            content = await asyncio.to_thread(
                                _extract_pdf_pypdf,
@ -1238,12 +1250,19 @@ async def pipeline_enqueue_file(
                        # Try DOCLING first if configured and available
                        if (
                            global_args.document_loading_engine == "DOCLING"
-                            and DOCLING_AVAILABLE
+                            and _is_docling_available()
                        ):
                            content = await asyncio.to_thread(
                                _convert_with_docling, file_path
                            )
                        else:
                            if (
                                global_args.document_loading_engine == "DOCLING"
                                and not _is_docling_available()
                            ):
                                logger.warning(
                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-docx."
                                )
                            # Use python-docx (non-blocking via to_thread)
                            content = await asyncio.to_thread(_extract_docx, file)
                    except Exception as e:
@ -1268,12 +1287,19 @@ async def pipeline_enqueue_file(
                        # Try DOCLING first if configured and available
                        if (
                            global_args.document_loading_engine == "DOCLING"
-                            and DOCLING_AVAILABLE
+                            and _is_docling_available()
                        ):
                            content = await asyncio.to_thread(
                                _convert_with_docling, file_path
                            )
                        else:
                            if (
                                global_args.document_loading_engine == "DOCLING"
                                and not _is_docling_available()
                            ):
                                logger.warning(
                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-pptx."
                                )
                            # Use python-pptx (non-blocking via to_thread)
                            content = await asyncio.to_thread(_extract_pptx, file)
                    except Exception as e:
@ -1298,12 +1324,19 @@ async def pipeline_enqueue_file(
                        # Try DOCLING first if configured and available
                        if (
                            global_args.document_loading_engine == "DOCLING"
-                            and DOCLING_AVAILABLE
+                            and _is_docling_available()
                        ):
                            content = await asyncio.to_thread(
                                _convert_with_docling, file_path
                            )
                        else:
                            if (
                                global_args.document_loading_engine == "DOCLING"
                                and not _is_docling_available()
                            ):
                                logger.warning(
                                    f"DOCLING engine configured but not available for {file_path.name}. Falling back to openpyxl."
                                )
                            # Use openpyxl (non-blocking via to_thread)
                            content = await asyncio.to_thread(_extract_xlsx, file)
                    except Exception as e:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -29,7 +29,7 @@ dependencies = [
    "json_repair",
    "nano-vectordb",
    "networkx",
-    "numpy",
+    "numpy>=1.24.0,<2.0.0",
    "pandas>=2.0.0,<2.4.0",
    "pipmaster",
    "pydantic",
@ -50,7 +50,7 @@ api = [
    "json_repair",
    "nano-vectordb",
    "networkx",
-    "numpy",
+    "numpy>=1.24.0,<2.0.0",
    "openai>=1.0.0,<3.0.0",
    "pandas>=2.0.0,<2.4.0",
    "pipmaster",
@ -90,7 +90,9 @@ api = [
 # Advanced document processing engine (optional)
 docling = [
-    "docling>=2.0.0,<3.0.0",
+    # On macOS, PyTorch and frameworks that use Objective-C are not fork-safe
    # and are not compatible with gunicorn multi-worker mode
    "docling>=2.0.0,<3.0.0; sys_platform != 'darwin'",
 ]
 # Offline deployment dependencies (layered design for flexibility)
--- a/uv.lock
+++ b/uv.lock