diff --git a/docs/OfflineDeployment.md b/docs/OfflineDeployment.md
index 4e397283..b0e57d91 100644
--- a/docs/OfflineDeployment.md
+++ b/docs/OfflineDeployment.md
@@ -24,10 +24,11 @@ LightRAG uses dynamic package installation (`pipmaster`) for optional features b
 
 LightRAG dynamically installs packages for:
 
-- **Document Processing**: `docling`, `pypdf2`, `python-docx`, `python-pptx`, `openpyxl`
 - **Storage Backends**: `redis`, `neo4j`, `pymilvus`, `pymongo`, `asyncpg`, `qdrant-client`
 - **LLM Providers**: `openai`, `anthropic`, `ollama`, `zhipuai`, `aioboto3`, `voyageai`, `llama-index`, `lmdeploy`, `transformers`, `torch`
-- Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
+- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
+
+**Note**: Document processing dependencies (`pypdf`, `python-docx`, `python-pptx`, `openpyxl`) are now pre-installed with the `api` extras group and no longer require dynamic installation.
 
 ## Quick Start
 
@@ -76,32 +77,31 @@ LightRAG provides flexible dependency groups for different use cases:
 
 | Group | Description | Use Case |
 |-------|-------------|----------|
-| `offline-docs` | Document processing | PDF, DOCX, PPTX, XLSX files |
+| `api` | API server + document processing | FastAPI server with PDF, DOCX, PPTX, XLSX support |
 | `offline-storage` | Storage backends | Redis, Neo4j, MongoDB, PostgreSQL, etc. |
 | `offline-llm` | LLM providers | OpenAI, Anthropic, Ollama, etc. |
-| `offline` | All of the above | Complete offline deployment |
+| `offline` | Complete offline package | API + Storage + LLM (all features) |
+
+**Note**: Document processing (PDF, DOCX, PPTX, XLSX) is included in the `api` extras group. The previous `offline-docs` group has been merged into `api` for better integration.
 
 > Software packages requiring `transformers`, `torch`, or `cuda` will not be included in the offline dependency group.
 
 ### Installation Examples
 
 ```bash
-# Install only document processing dependencies
-pip install lightrag-hku[offline-docs]
+# Install API with document processing
+pip install lightrag-hku[api]
 
-# Install document processing and storage backends
-pip install lightrag-hku[offline-docs,offline-storage]
+# Install API and storage backends
+pip install lightrag-hku[api,offline-storage]
 
-# Install all offline dependencies
+# Install all offline dependencies (recommended for offline deployment)
 pip install lightrag-hku[offline]
 ```
 
 ### Using Individual Requirements Files
 
 ```bash
-# Document processing only
-pip install -r requirements-offline-docs.txt
-
 # Storage backends only
 pip install -r requirements-offline-storage.txt
 
@@ -245,8 +245,8 @@ ls -la ~/.tiktoken_cache/
 **Solution**:
 ```bash
 # Pre-install the specific package you need
-# For document processing:
-pip install lightrag-hku[offline-docs]
+# For API with document processing:
+pip install lightrag-hku[api]
 
 # For storage backends:
 pip install lightrag-hku[offline-storage]
 
@@ -298,9 +298,9 @@ mkdir -p ~/my_tiktoken_cache
 
 5. **Minimal Installation**: Only install what you need:
    ```bash
-   # If you only process PDFs with OpenAI
-   pip install lightrag-hku[offline-docs]
-   # Then manually add: pip install openai
+   # If you only need API with document processing
+   pip install lightrag-hku[api]
+   # Then manually add specific LLM: pip install openai
    ```
 
 ## Additional Resources
diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 8d8ce2e0..ba0ef47c 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -7,10 +7,10 @@ from lightrag.utils import logger, get_pinyin_sort_key
 import aiofiles
 import shutil
 import traceback
-import pipmaster as pm
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Literal
+from io import BytesIO
 from fastapi import (
     APIRouter,
     BackgroundTasks,
@@ -27,6 +27,20 @@ from lightrag.utils import generate_track_id
 from lightrag.api.utils_api import get_combined_auth_dependency
 from ..config import global_args
 
+# Check docling availability at module load time
+DOCLING_AVAILABLE = False
+try:
+    import docling  # noqa: F401  # type: ignore[import-not-found]
+
+    DOCLING_AVAILABLE = True
+except ImportError:
+    if global_args.document_loading_engine == "DOCLING":
+        logger.warning(
+            "DOCLING engine requested but 'docling' package not installed. "
+            "Falling back to standard document processing. "
+            "To use DOCLING, install with: pip install lightrag-hku[api,docling]"
+        )
+
 
 # Function to format datetime to ISO format string with timezone information
 def format_datetime(dt: Any) -> Optional[str]:
@@ -853,7 +867,6 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
     Returns:
         str: Unique filename (may have numeric suffix added)
     """
-    from pathlib import Path
     import time
 
     original_path = Path(original_name)
@@ -876,6 +889,122 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
     return f"{base_name}_{timestamp}{extension}"
 
 
+# Document processing helper functions (synchronous)
+# These functions run in thread pool via asyncio.to_thread() to avoid blocking the event loop
+
+
+def _convert_with_docling(file_path: Path) -> str:
+    """Convert document using docling (synchronous).
+
+    Args:
+        file_path: Path to the document file
+
+    Returns:
+        str: Extracted markdown content
+    """
+    from docling.document_converter import DocumentConverter  # type: ignore
+
+    converter = DocumentConverter()
+    result = converter.convert(file_path)
+    return result.document.export_to_markdown()
+
+
+def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
+    """Extract PDF content using pypdf (synchronous).
+
+    Args:
+        file_bytes: PDF file content as bytes
+        password: Optional password for encrypted PDFs
+
+    Returns:
+        str: Extracted text content
+
+    Raises:
+        Exception: If PDF is encrypted and password is incorrect or missing
+    """
+    from pypdf import PdfReader  # type: ignore
+
+    pdf_file = BytesIO(file_bytes)
+    reader = PdfReader(pdf_file)
+
+    # Check if PDF is encrypted
+    if reader.is_encrypted:
+        if not password:
+            raise Exception("PDF is encrypted but no password provided")
+
+        decrypt_result = reader.decrypt(password)
+        if decrypt_result == 0:
+            raise Exception("Incorrect PDF password")
+
+    # Extract text from all pages
+    content = ""
+    for page in reader.pages:
+        content += page.extract_text() + "\n"
+
+    return content
+
+
+def _extract_docx(file_bytes: bytes) -> str:
+    """Extract DOCX content (synchronous).
+
+    Args:
+        file_bytes: DOCX file content as bytes
+
+    Returns:
+        str: Extracted text content
+    """
+    from docx import Document  # type: ignore
+
+    docx_file = BytesIO(file_bytes)
+    doc = Document(docx_file)
+    return "\n".join([paragraph.text for paragraph in doc.paragraphs])
+
+
+def _extract_pptx(file_bytes: bytes) -> str:
+    """Extract PPTX content (synchronous).
+
+    Args:
+        file_bytes: PPTX file content as bytes
+
+    Returns:
+        str: Extracted text content
+    """
+    from pptx import Presentation  # type: ignore
+
+    pptx_file = BytesIO(file_bytes)
+    prs = Presentation(pptx_file)
+    content = ""
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            if hasattr(shape, "text"):
+                content += shape.text + "\n"
+    return content
+
+
+def _extract_xlsx(file_bytes: bytes) -> str:
+    """Extract XLSX content (synchronous).
+
+    Args:
+        file_bytes: XLSX file content as bytes
+
+    Returns:
+        str: Extracted text content
+    """
+    from openpyxl import load_workbook  # type: ignore
+
+    xlsx_file = BytesIO(file_bytes)
+    wb = load_workbook(xlsx_file)
+    content = ""
+    for sheet in wb:
+        content += f"Sheet: {sheet.title}\n"
+        for row in sheet.iter_rows(values_only=True):
+            content += (
+                "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
+            )
+        content += "\n"
+    return content
+
+
 async def pipeline_enqueue_file(
     rag: LightRAG, file_path: Path, track_id: str = None
 ) -> tuple[bool, str]:
@@ -1046,15 +1175,16 @@ async def pipeline_enqueue_file(
 
                 case ".pdf":
                     try:
-                        if global_args.document_loading_engine == "DOCLING":
-                            if not pm.is_installed("docling"):  # type: ignore
-                                pm.install("docling")
-                            from docling.document_converter import DocumentConverter  # type: ignore
-
-                            converter = DocumentConverter()
-                            result = converter.convert(file_path)
-                            content = result.document.export_to_markdown()
+                        # Try DOCLING first if configured and available
+                        if (
+                            global_args.document_loading_engine == "DOCLING"
+                            and DOCLING_AVAILABLE
+                        ):
+                            content = await asyncio.to_thread(
+                                _convert_with_docling, file_path
+                            )
                         else:
-                            if not pm.is_installed("pypdf2"):  # type: ignore
-                                pm.install("pypdf2")
-                            from PyPDF2 import PdfReader  # type: ignore
@@ -1064,6 +1194,14 @@ async def pipeline_enqueue_file(
-                            reader = PdfReader(pdf_file)
-                            for page in reader.pages:
-                                content += page.extract_text() + "\n"
+                            # Use pypdf (non-blocking via to_thread)
+                            content = await asyncio.to_thread(
+                                _extract_pdf_pypdf,
+                                file,
+                                global_args.pdf_decrypt_password,
+                            )
                     except Exception as e:
                         error_files = [
                             {
@@ -1083,28 +1221,17 @@ async def pipeline_enqueue_file(
 
                 case ".docx":
                     try:
-                        if global_args.document_loading_engine == "DOCLING":
-                            if not pm.is_installed("docling"):  # type: ignore
-                                pm.install("docling")
-                            from docling.document_converter import DocumentConverter  # type: ignore
-
-                            converter = DocumentConverter()
-                            result = converter.convert(file_path)
-                            content = result.document.export_to_markdown()
-                        else:
-                            if not pm.is_installed("python-docx"):  # type: ignore
-                                try:
-                                    pm.install("python-docx")
-                                except Exception:
-                                    pm.install("docx")
-                            from docx import Document  # type: ignore
-                            from io import BytesIO
-
-                            docx_file = BytesIO(file)
-                            doc = Document(docx_file)
-                            content = "\n".join(
-                                [paragraph.text for paragraph in doc.paragraphs]
+                        # Try DOCLING first if configured and available
+                        if (
+                            global_args.document_loading_engine == "DOCLING"
+                            and DOCLING_AVAILABLE
+                        ):
+                            content = await asyncio.to_thread(
+                                _convert_with_docling, file_path
                             )
+                        else:
+                            # Use python-docx (non-blocking via to_thread)
+                            content = await asyncio.to_thread(_extract_docx, file)
                     except Exception as e:
                         error_files = [
                             {
@@ -1124,26 +1251,17 @@ async def pipeline_enqueue_file(
 
                 case ".pptx":
                     try:
-                        if global_args.document_loading_engine == "DOCLING":
-                            if not pm.is_installed("docling"):  # type: ignore
-                                pm.install("docling")
-                            from docling.document_converter import DocumentConverter  # type: ignore
-
-                            converter = DocumentConverter()
-                            result = converter.convert(file_path)
-                            content = result.document.export_to_markdown()
+                        # Try DOCLING first if configured and available
+                        if (
+                            global_args.document_loading_engine == "DOCLING"
+                            and DOCLING_AVAILABLE
+                        ):
+                            content = await asyncio.to_thread(
+                                _convert_with_docling, file_path
+                            )
                         else:
-                            if not pm.is_installed("python-pptx"):  # type: ignore
-                                pm.install("pptx")
-                            from pptx import Presentation  # type: ignore
-                            from io import BytesIO
-
-                            pptx_file = BytesIO(file)
-                            prs = Presentation(pptx_file)
-                            for slide in prs.slides:
-                                for shape in slide.shapes:
-                                    if hasattr(shape, "text"):
-                                        content += shape.text + "\n"
+                            # Use python-pptx (non-blocking via to_thread)
+                            content = await asyncio.to_thread(_extract_pptx, file)
                     except Exception as e:
                         error_files = [
                             {
@@ -1163,33 +1281,17 @@ async def pipeline_enqueue_file(
 
                 case ".xlsx":
                     try:
-                        if global_args.document_loading_engine == "DOCLING":
-                            if not pm.is_installed("docling"):  # type: ignore
-                                pm.install("docling")
-                            from docling.document_converter import DocumentConverter  # type: ignore
-
-                            converter = DocumentConverter()
-                            result = converter.convert(file_path)
-                            content = result.document.export_to_markdown()
+                        # Try DOCLING first if configured and available
+                        if (
+                            global_args.document_loading_engine == "DOCLING"
+                            and DOCLING_AVAILABLE
+                        ):
+                            content = await asyncio.to_thread(
+                                _convert_with_docling, file_path
+                            )
                         else:
-                            if not pm.is_installed("openpyxl"):  # type: ignore
-                                pm.install("openpyxl")
-                            from openpyxl import load_workbook  # type: ignore
-                            from io import BytesIO
-
-                            xlsx_file = BytesIO(file)
-                            wb = load_workbook(xlsx_file)
-                            for sheet in wb:
-                                content += f"Sheet: {sheet.title}\n"
-                                for row in sheet.iter_rows(values_only=True):
-                                    content += (
-                                        "\t".join(
-                                            str(cell) if cell is not None else ""
-                                            for cell in row
-                                        )
-                                        + "\n"
-                                    )
-                                content += "\n"
+                            # Use openpyxl (non-blocking via to_thread)
+                            content = await asyncio.to_thread(_extract_xlsx, file)
                     except Exception as e:
                         error_files = [
                             {
diff --git a/pyproject.toml b/pyproject.toml
index 5903df22..f7b10b8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -77,9 +77,21 @@ api = [
     "python-multipart",
     "pytz",
     "uvicorn",
+    # Document processing dependencies (required for API document upload functionality)
+    "openpyxl>=3.0.0,<4.0.0",  # XLSX processing
+    "pycryptodome>=3.0.0,<4.0.0",  # PDF encryption support
+    "pypdf>=6.1.0",  # PDF processing
+    "python-docx>=0.8.11,<2.0.0",  # DOCX processing
+    "python-pptx>=0.6.21,<2.0.0",  # PPTX processing
+]
+
+# Advanced document processing engine (optional)
+docling = [
+    "docling>=2.0.0,<3.0.0",
 ]
 
 # Offline deployment dependencies (layered design for flexibility)
-offline-docs = [
-    # Document processing dependencies
-    "docling>=1.0.0,<3.0.0",
@@ -89,6 +101,8 @@ offline-docs = [
-    "openpyxl>=3.0.0,<4.0.0",
-]
-
 offline-storage = [
     # Storage backend dependencies
     "redis>=5.0.0,<7.0.0",
@@ -111,8 +125,8 @@ offline-llm = [
 ]
 
 offline = [
-    # Complete offline package (includes all offline dependencies)
-    "lightrag-hku[offline-docs,offline-storage,offline-llm]",
+    # Complete offline package (includes api for document processing, plus storage and LLM)
+    "lightrag-hku[api,offline-storage,offline-llm]",
 ]
 
 [project.scripts]
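
For reviewers, the central pattern this patch applies in `document_routes.py` (a synchronous extractor offloaded with `asyncio.to_thread()` so parsing does not block the FastAPI event loop) can be exercised in isolation. The following is a minimal sketch and not part of the patch: the `sample.pdf` path and the `extract_pdf_text`/`parse_pdf` names are hypothetical, and it assumes `pypdf` is available, which ships with the `api` extras group after this change.

```python
import asyncio
from io import BytesIO

from pypdf import PdfReader  # installed via: pip install lightrag-hku[api]


def extract_pdf_text(file_bytes: bytes) -> str:
    """Synchronous pypdf extraction, mirroring the _extract_pdf_pypdf helper in the patch."""
    reader = PdfReader(BytesIO(file_bytes))
    # Join page text; extract_text() may return an empty string for image-only pages.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


async def parse_pdf(file_bytes: bytes) -> str:
    # Run the blocking parser in a worker thread so the event loop stays responsive.
    return await asyncio.to_thread(extract_pdf_text, file_bytes)


if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:  # hypothetical input file
        text = asyncio.run(parse_pdf(f.read()))
    print(text[:200])
```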