Improve docling integration with macOS compatibility and CLI flag

- Add --docling CLI flag for easier setup
- Add numpy version constraints
- Exclude docling on macOS (fork-safety)

(cherry picked from commit a24d8181c2)
2025-11-13 18:58:09 +08:00 · 2025-11-13 18:58:09 +08:00 · 95d47566c1
commit 95d47566c1
parent 033ee5c0f5
4 changed files with 2713 additions and 745 deletions
--- a/lightrag/api/config.py
+++ b/lightrag/api/config.py
@ -265,6 +265,14 @@ def parse_args() -> argparse.Namespace:
        help=f"Rerank binding type (default: from env or {DEFAULT_RERANK_BINDING})",
    )

+    # Document loading engine configuration
+    parser.add_argument(
+        "--docling",
+        action="store_true",
+        default=False,
+        help="Enable DOCLING document loading engine (default: from env or DEFAULT)",
+    )
+
    # Conditionally add binding options defined in binding_options module
    # This will add command line arguments for all binding options (e.g., --ollama-embedding-num_ctx)
    # and corresponding environment variables (e.g., OLLAMA_EMBEDDING_NUM_CTX)
@ -364,8 +372,13 @@ def parse_args() -> argparse.Namespace:
    )
    args.enable_llm_cache = get_env_value("ENABLE_LLM_CACHE", True, bool)

-    # Select Document loading tool (DOCLING, DEFAULT)
-    args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
+    # Set document_loading_engine from --docling flag
+    if args.docling:
+        args.document_loading_engine = "DOCLING"
+    else:
+        args.document_loading_engine = get_env_value(
+            "DOCUMENT_LOADING_ENGINE", "DEFAULT"
+        )

    # Add environment variables that were previously read directly
    args.cors_origins = get_env_value("CORS_ORIGINS", "*")
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
--- a/pyproject.toml
+++ b/pyproject.toml
@ -24,10 +24,12 @@ dependencies = [
    "aiohttp",
    "configparser",
    "future",
+    "google-api-core>=2.0.0,<3.0.0",
+    "google-genai>=1.0.0,<2.0.0",
    "json_repair",
    "nano-vectordb",
    "networkx",
-    "numpy",
+    "numpy>=1.24.0,<2.0.0",
    "pandas>=2.0.0,<2.4.0",
    "pipmaster",
    "pydantic",
@ -48,7 +50,7 @@ api = [
    "json_repair",
    "nano-vectordb",
    "networkx",
-    "numpy",
+    "numpy>=1.24.0,<2.0.0",
    "openai>=1.0.0,<3.0.0",
    "pandas>=2.0.0,<2.4.0",
    "pipmaster",
@ -59,6 +61,8 @@ api = [
    "tenacity",
    "tiktoken",
    "xlsxwriter>=3.1.0",
+    "google-api-core>=2.0.0,<3.0.0",
+    "google-genai>=1.0.0,<2.0.0",
    # API-specific dependencies
    "aiofiles",
    "ascii_colors",
@ -75,18 +79,23 @@ api = [
    "python-multipart",
    "pytz",
    "uvicorn",
+    "gunicorn",
+    # Document processing dependencies (required for API document upload functionality)
+    "openpyxl>=3.0.0,<4.0.0",      # XLSX processing
+    "pycryptodome>=3.0.0,<4.0.0",  # PDF encryption support
+    "pypdf>=6.1.0",                 # PDF processing
+    "python-docx>=0.8.11,<2.0.0",  # DOCX processing
+    "python-pptx>=0.6.21,<2.0.0",  # PPTX processing
+]
+
+# Advanced document processing engine (optional)
+docling = [
+    # On macOS, PyTorch and frameworks that use Objective-C are not fork-safe
+    # and are not compatible with gunicorn multi-worker mode
+    "docling>=2.0.0,<3.0.0; sys_platform != 'darwin'",
 ]

 # Offline deployment dependencies (layered design for flexibility)
-offline-docs = [
-    # Document processing dependencies
-    "openpyxl>=3.0.0,<4.0.0",
-    "pycryptodome>=3.0.0,<4.0.0",
-    "pypdf2>=3.0.0",
-    "python-docx>=0.8.11,<2.0.0",
-    "python-pptx>=0.6.21,<2.0.0",
-]
-
 offline-storage = [
    # Storage backend dependencies
    "redis>=5.0.0,<8.0.0",
@ -94,7 +103,7 @@ offline-storage = [
    "pymilvus>=2.6.2,<3.0.0",
    "pymongo>=4.0.0,<5.0.0",
    "asyncpg>=0.29.0,<1.0.0",
-    "qdrant-client>=1.7.0,<2.0.0",
+    "qdrant-client>=1.11.0,<2.0.0",
 ]

 offline-llm = [
@ -106,11 +115,22 @@ offline-llm = [
    "aioboto3>=12.0.0,<16.0.0",
    "voyageai>=0.2.0,<1.0.0",
    "llama-index>=0.9.0,<1.0.0",
+    "google-api-core>=2.0.0,<3.0.0",
+    "google-genai>=1.0.0,<2.0.0",
 ]

 offline = [
-    # Complete offline package (includes all offline dependencies)
-    "lightrag-hku[offline-docs,offline-storage,offline-llm]",
+    # Complete offline package (includes api for document processing, plus storage and LLM)
+    "lightrag-hku[api,offline-storage,offline-llm]",
+]
+
+evaluation = [
+    # RAG evaluation dependencies (RAGAS framework)
+    "ragas>=0.3.7",
+    "datasets>=4.3.0",
+    "httpx>=0.28.1",
+    "pytest>=8.4.2",
+    "pytest-asyncio>=1.2.0",
 ]

 observability = [
--- a/uv.lock
+++ b/uv.lock