Improve docling integration with macOS compatibility and CLI flag
- Add --docling CLI flag for easier setup
- Add numpy version constraints
- Exclude docling on macOS (fork-safety)
This commit is contained in:
parent
63510478e5
commit
c246eff725
4 changed files with 874 additions and 681 deletions
|
|
@ -258,6 +258,14 @@ def parse_args() -> argparse.Namespace:
|
||||||
help=f"Rerank binding type (default: from env or {DEFAULT_RERANK_BINDING})",
|
help=f"Rerank binding type (default: from env or {DEFAULT_RERANK_BINDING})",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Document loading engine configuration
|
||||||
|
parser.add_argument(
|
||||||
|
"--docling",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Enable DOCLING document loading engine (default: from env or DEFAULT)",
|
||||||
|
)
|
||||||
|
|
||||||
# Conditionally add binding options defined in binding_options module
|
# Conditionally add binding options defined in binding_options module
|
||||||
# This will add command line arguments for all binding options (e.g., --ollama-embedding-num_ctx)
|
# This will add command line arguments for all binding options (e.g., --ollama-embedding-num_ctx)
|
||||||
# and corresponding environment variables (e.g., OLLAMA_EMBEDDING_NUM_CTX)
|
# and corresponding environment variables (e.g., OLLAMA_EMBEDDING_NUM_CTX)
|
||||||
|
|
@ -371,8 +379,13 @@ def parse_args() -> argparse.Namespace:
|
||||||
)
|
)
|
||||||
args.enable_llm_cache = get_env_value("ENABLE_LLM_CACHE", True, bool)
|
args.enable_llm_cache = get_env_value("ENABLE_LLM_CACHE", True, bool)
|
||||||
|
|
||||||
# Select Document loading tool (DOCLING, DEFAULT)
|
# Set document_loading_engine: the --docling flag takes precedence; otherwise fall back to the DOCUMENT_LOADING_ENGINE env var (default "DEFAULT")
|
||||||
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
|
if args.docling:
|
||||||
|
args.document_loading_engine = "DOCLING"
|
||||||
|
else:
|
||||||
|
args.document_loading_engine = get_env_value(
|
||||||
|
"DOCUMENT_LOADING_ENGINE", "DEFAULT"
|
||||||
|
)
|
||||||
|
|
||||||
# PDF decryption password
|
# PDF decryption password
|
||||||
args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)
|
args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
from functools import lru_cache
|
||||||
from lightrag.utils import logger, get_pinyin_sort_key
|
from lightrag.utils import logger, get_pinyin_sort_key
|
||||||
import aiofiles
|
import aiofiles
|
||||||
import shutil
|
import shutil
|
||||||
|
|
@ -27,19 +28,23 @@ from lightrag.utils import generate_track_id
|
||||||
from lightrag.api.utils_api import get_combined_auth_dependency
|
from lightrag.api.utils_api import get_combined_auth_dependency
|
||||||
from ..config import global_args
|
from ..config import global_args
|
||||||
|
|
||||||
# Check docling availability at module load time
|
|
||||||
DOCLING_AVAILABLE = False
|
|
||||||
try:
|
|
||||||
import docling # noqa: F401 # type: ignore[import-not-found]
|
|
||||||
|
|
||||||
DOCLING_AVAILABLE = True
|
@lru_cache(maxsize=1)
|
||||||
except ImportError:
|
def _is_docling_available() -> bool:
|
||||||
if global_args.document_loading_engine == "DOCLING":
|
"""Check if docling is available (cached check).
|
||||||
logger.warning(
|
|
||||||
"DOCLING engine requested but 'docling' package not installed. "
|
This function uses lru_cache to avoid repeated import attempts.
|
||||||
"Falling back to standard document processing. "
|
The result is cached after the first call.
|
||||||
"To use DOCLING, install with: pip install lightrag-hku[api,docling]"
|
|
||||||
)
|
Returns:
|
||||||
|
bool: True if docling is available, False otherwise
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
import docling # noqa: F401 # type: ignore[import-not-found]
|
||||||
|
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
# Function to format datetime to ISO format string with timezone information
|
# Function to format datetime to ISO format string with timezone information
|
||||||
|
|
@ -1204,12 +1209,19 @@ async def pipeline_enqueue_file(
|
||||||
# Try DOCLING first if configured and available
|
# Try DOCLING first if configured and available
|
||||||
if (
|
if (
|
||||||
global_args.document_loading_engine == "DOCLING"
|
global_args.document_loading_engine == "DOCLING"
|
||||||
and DOCLING_AVAILABLE
|
and _is_docling_available()
|
||||||
):
|
):
|
||||||
content = await asyncio.to_thread(
|
content = await asyncio.to_thread(
|
||||||
_convert_with_docling, file_path
|
_convert_with_docling, file_path
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
if (
|
||||||
|
global_args.document_loading_engine == "DOCLING"
|
||||||
|
and not _is_docling_available()
|
||||||
|
):
|
||||||
|
logger.warning(
|
||||||
|
f"DOCLING engine configured but not available for {file_path.name}. Falling back to pypdf."
|
||||||
|
)
|
||||||
# Use pypdf (non-blocking via to_thread)
|
# Use pypdf (non-blocking via to_thread)
|
||||||
content = await asyncio.to_thread(
|
content = await asyncio.to_thread(
|
||||||
_extract_pdf_pypdf,
|
_extract_pdf_pypdf,
|
||||||
|
|
@ -1238,12 +1250,19 @@ async def pipeline_enqueue_file(
|
||||||
# Try DOCLING first if configured and available
|
# Try DOCLING first if configured and available
|
||||||
if (
|
if (
|
||||||
global_args.document_loading_engine == "DOCLING"
|
global_args.document_loading_engine == "DOCLING"
|
||||||
and DOCLING_AVAILABLE
|
and _is_docling_available()
|
||||||
):
|
):
|
||||||
content = await asyncio.to_thread(
|
content = await asyncio.to_thread(
|
||||||
_convert_with_docling, file_path
|
_convert_with_docling, file_path
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
if (
|
||||||
|
global_args.document_loading_engine == "DOCLING"
|
||||||
|
and not _is_docling_available()
|
||||||
|
):
|
||||||
|
logger.warning(
|
||||||
|
f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-docx."
|
||||||
|
)
|
||||||
# Use python-docx (non-blocking via to_thread)
|
# Use python-docx (non-blocking via to_thread)
|
||||||
content = await asyncio.to_thread(_extract_docx, file)
|
content = await asyncio.to_thread(_extract_docx, file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -1268,12 +1287,19 @@ async def pipeline_enqueue_file(
|
||||||
# Try DOCLING first if configured and available
|
# Try DOCLING first if configured and available
|
||||||
if (
|
if (
|
||||||
global_args.document_loading_engine == "DOCLING"
|
global_args.document_loading_engine == "DOCLING"
|
||||||
and DOCLING_AVAILABLE
|
and _is_docling_available()
|
||||||
):
|
):
|
||||||
content = await asyncio.to_thread(
|
content = await asyncio.to_thread(
|
||||||
_convert_with_docling, file_path
|
_convert_with_docling, file_path
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
if (
|
||||||
|
global_args.document_loading_engine == "DOCLING"
|
||||||
|
and not _is_docling_available()
|
||||||
|
):
|
||||||
|
logger.warning(
|
||||||
|
f"DOCLING engine configured but not available for {file_path.name}. Falling back to python-pptx."
|
||||||
|
)
|
||||||
# Use python-pptx (non-blocking via to_thread)
|
# Use python-pptx (non-blocking via to_thread)
|
||||||
content = await asyncio.to_thread(_extract_pptx, file)
|
content = await asyncio.to_thread(_extract_pptx, file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -1298,12 +1324,19 @@ async def pipeline_enqueue_file(
|
||||||
# Try DOCLING first if configured and available
|
# Try DOCLING first if configured and available
|
||||||
if (
|
if (
|
||||||
global_args.document_loading_engine == "DOCLING"
|
global_args.document_loading_engine == "DOCLING"
|
||||||
and DOCLING_AVAILABLE
|
and _is_docling_available()
|
||||||
):
|
):
|
||||||
content = await asyncio.to_thread(
|
content = await asyncio.to_thread(
|
||||||
_convert_with_docling, file_path
|
_convert_with_docling, file_path
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
if (
|
||||||
|
global_args.document_loading_engine == "DOCLING"
|
||||||
|
and not _is_docling_available()
|
||||||
|
):
|
||||||
|
logger.warning(
|
||||||
|
f"DOCLING engine configured but not available for {file_path.name}. Falling back to openpyxl."
|
||||||
|
)
|
||||||
# Use openpyxl (non-blocking via to_thread)
|
# Use openpyxl (non-blocking via to_thread)
|
||||||
content = await asyncio.to_thread(_extract_xlsx, file)
|
content = await asyncio.to_thread(_extract_xlsx, file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,7 @@ dependencies = [
|
||||||
"json_repair",
|
"json_repair",
|
||||||
"nano-vectordb",
|
"nano-vectordb",
|
||||||
"networkx",
|
"networkx",
|
||||||
"numpy",
|
"numpy>=1.24.0,<2.0.0",
|
||||||
"pandas>=2.0.0,<2.4.0",
|
"pandas>=2.0.0,<2.4.0",
|
||||||
"pipmaster",
|
"pipmaster",
|
||||||
"pydantic",
|
"pydantic",
|
||||||
|
|
@ -50,7 +50,7 @@ api = [
|
||||||
"json_repair",
|
"json_repair",
|
||||||
"nano-vectordb",
|
"nano-vectordb",
|
||||||
"networkx",
|
"networkx",
|
||||||
"numpy",
|
"numpy>=1.24.0,<2.0.0",
|
||||||
"openai>=1.0.0,<3.0.0",
|
"openai>=1.0.0,<3.0.0",
|
||||||
"pandas>=2.0.0,<2.4.0",
|
"pandas>=2.0.0,<2.4.0",
|
||||||
"pipmaster",
|
"pipmaster",
|
||||||
|
|
@ -90,7 +90,9 @@ api = [
|
||||||
|
|
||||||
# Advanced document processing engine (optional)
|
# Advanced document processing engine (optional)
|
||||||
docling = [
|
docling = [
|
||||||
"docling>=2.0.0,<3.0.0",
|
# On macOS, PyTorch and frameworks that use Objective-C are not fork-safe,
|
||||||
|
# and are not compatible with gunicorn multi-worker mode
|
||||||
|
"docling>=2.0.0,<3.0.0; sys_platform != 'darwin'",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Offline deployment dependencies (layered design for flexibility)
|
# Offline deployment dependencies (layered design for flexibility)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue