refactor: move document deps to api group, remove dynamic imports
- Merge offline-docs into api extras - Remove pipmaster dynamic installs - Add async document processing - Pre-check docling availability - Update offline deployment docs
This commit is contained in:
parent
7d394fb0a4
commit
69a0b74ce7
4 changed files with 205 additions and 191 deletions
|
|
@ -23,10 +23,11 @@ LightRAG uses dynamic package installation (`pipmaster`) for optional features b
|
||||||
|
|
||||||
LightRAG dynamically installs packages for:
|
LightRAG dynamically installs packages for:
|
||||||
|
|
||||||
- **Document Processing**: `docling`, `pypdf2`, `python-docx`, `python-pptx`, `openpyxl`
|
|
||||||
- **Storage Backends**: `redis`, `neo4j`, `pymilvus`, `pymongo`, `asyncpg`, `qdrant-client`
|
- **Storage Backends**: `redis`, `neo4j`, `pymilvus`, `pymongo`, `asyncpg`, `qdrant-client`
|
||||||
- **LLM Providers**: `openai`, `anthropic`, `ollama`, `zhipuai`, `aioboto3`, `voyageai`, `llama-index`, `lmdeploy`, `transformers`, `torch`
|
- **LLM Providers**: `openai`, `anthropic`, `ollama`, `zhipuai`, `aioboto3`, `voyageai`, `llama-index`, `lmdeploy`, `transformers`, `torch`
|
||||||
- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
|
- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
|
||||||
|
|
||||||
|
**Note**: Document processing dependencies (`pypdf`, `python-docx`, `python-pptx`, `openpyxl`) are now pre-installed with the `api` extras group and no longer require dynamic installation.
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
|
|
@ -75,32 +76,31 @@ LightRAG provides flexible dependency groups for different use cases:
|
||||||
|
|
||||||
| Group | Description | Use Case |
|
| Group | Description | Use Case |
|
||||||
|-------|-------------|----------|
|
|-------|-------------|----------|
|
||||||
| `offline-docs` | Document processing | PDF, DOCX, PPTX, XLSX files |
|
| `api` | API server + document processing | FastAPI server with PDF, DOCX, PPTX, XLSX support |
|
||||||
| `offline-storage` | Storage backends | Redis, Neo4j, MongoDB, PostgreSQL, etc. |
|
| `offline-storage` | Storage backends | Redis, Neo4j, MongoDB, PostgreSQL, etc. |
|
||||||
| `offline-llm` | LLM providers | OpenAI, Anthropic, Ollama, etc. |
|
| `offline-llm` | LLM providers | OpenAI, Anthropic, Ollama, etc. |
|
||||||
| `offline` | All of the above | Complete offline deployment |
|
| `offline` | Complete offline package | API + Storage + LLM (all features) |
|
||||||
|
|
||||||
|
**Note**: Document processing (PDF, DOCX, PPTX, XLSX) is included in the `api` extras group. The previous `offline-docs` group has been merged into `api` for better integration.
|
||||||
|
|
||||||
> Software packages requiring `transformers`, `torch`, or `cuda` will not be included in the offline dependency group.
|
> Software packages requiring `transformers`, `torch`, or `cuda` will not be included in the offline dependency group.
|
||||||
|
|
||||||
### Installation Examples
|
### Installation Examples
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Install only document processing dependencies
|
# Install API with document processing
|
||||||
pip install lightrag-hku[offline-docs]
|
pip install lightrag-hku[api]
|
||||||
|
|
||||||
# Install document processing and storage backends
|
# Install API and storage backends
|
||||||
pip install lightrag-hku[offline-docs,offline-storage]
|
pip install lightrag-hku[api,offline-storage]
|
||||||
|
|
||||||
# Install all offline dependencies
|
# Install all offline dependencies (recommended for offline deployment)
|
||||||
pip install lightrag-hku[offline]
|
pip install lightrag-hku[offline]
|
||||||
```
|
```
|
||||||
|
|
||||||
### Using Individual Requirements Files
|
### Using Individual Requirements Files
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Document processing only
|
|
||||||
pip install -r requirements-offline-docs.txt
|
|
||||||
|
|
||||||
# Storage backends only
|
# Storage backends only
|
||||||
pip install -r requirements-offline-storage.txt
|
pip install -r requirements-offline-storage.txt
|
||||||
|
|
||||||
|
|
@ -244,8 +244,8 @@ ls -la ~/.tiktoken_cache/
|
||||||
**Solution**:
|
**Solution**:
|
||||||
```bash
|
```bash
|
||||||
# Pre-install the specific package you need
|
# Pre-install the specific package you need
|
||||||
# For document processing:
|
# For API with document processing:
|
||||||
pip install lightrag-hku[offline-docs]
|
pip install lightrag-hku[api]
|
||||||
|
|
||||||
# For storage backends:
|
# For storage backends:
|
||||||
pip install lightrag-hku[offline-storage]
|
pip install lightrag-hku[offline-storage]
|
||||||
|
|
@ -297,9 +297,9 @@ mkdir -p ~/my_tiktoken_cache
|
||||||
|
|
||||||
5. **Minimal Installation**: Only install what you need:
|
5. **Minimal Installation**: Only install what you need:
|
||||||
```bash
|
```bash
|
||||||
# If you only process PDFs with OpenAI
|
# If you only need API with document processing
|
||||||
pip install lightrag-hku[offline-docs]
|
pip install lightrag-hku[api]
|
||||||
# Then manually add: pip install openai
|
# Then manually add specific LLM: pip install openai
|
||||||
```
|
```
|
||||||
|
|
||||||
## Additional Resources
|
## Additional Resources
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,10 @@ from lightrag.utils import logger, get_pinyin_sort_key
|
||||||
import aiofiles
|
import aiofiles
|
||||||
import shutil
|
import shutil
|
||||||
import traceback
|
import traceback
|
||||||
import pipmaster as pm
|
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Any, Literal
|
from typing import Dict, List, Optional, Any, Literal
|
||||||
|
from io import BytesIO
|
||||||
from fastapi import (
|
from fastapi import (
|
||||||
APIRouter,
|
APIRouter,
|
||||||
BackgroundTasks,
|
BackgroundTasks,
|
||||||
|
|
@ -27,6 +27,20 @@ from lightrag.utils import generate_track_id
|
||||||
from lightrag.api.utils_api import get_combined_auth_dependency
|
from lightrag.api.utils_api import get_combined_auth_dependency
|
||||||
from ..config import global_args
|
from ..config import global_args
|
||||||
|
|
||||||
|
# Probe for the optional 'docling' package once, when this module is imported.
# DOCLING_AVAILABLE records the outcome so request handlers can test a flag
# instead of attempting the import themselves.
try:
    import docling  # noqa: F401  # type: ignore[import-not-found]
except ImportError:
    DOCLING_AVAILABLE = False
    # Only warn when the configuration actually asks for the DOCLING engine.
    if global_args.document_loading_engine == "DOCLING":
        logger.warning(
            "DOCLING engine requested but 'docling' package not installed. "
            "Falling back to standard document processing. "
            "To use DOCLING, install with: pip install lightrag-hku[api,docling]"
        )
else:
    DOCLING_AVAILABLE = True
|
||||||
|
|
||||||
|
|
||||||
# Function to format datetime to ISO format string with timezone information
|
# Function to format datetime to ISO format string with timezone information
|
||||||
def format_datetime(dt: Any) -> Optional[str]:
|
def format_datetime(dt: Any) -> Optional[str]:
|
||||||
|
|
@ -879,7 +893,6 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
|
||||||
Returns:
|
Returns:
|
||||||
str: Unique filename (may have numeric suffix added)
|
str: Unique filename (may have numeric suffix added)
|
||||||
"""
|
"""
|
||||||
from pathlib import Path
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
original_path = Path(original_name)
|
original_path = Path(original_name)
|
||||||
|
|
@ -902,6 +915,122 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
|
||||||
return f"{base_name}_{timestamp}{extension}"
|
return f"{base_name}_{timestamp}{extension}"
|
||||||
|
|
||||||
|
|
||||||
|
# Document processing helper functions (synchronous)
|
||||||
|
# These functions run in thread pool via asyncio.to_thread() to avoid blocking the event loop
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_with_docling(file_path: Path) -> str:
|
||||||
|
"""Convert document using docling (synchronous).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the document file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Extracted markdown content
|
||||||
|
"""
|
||||||
|
from docling.document_converter import DocumentConverter # type: ignore
|
||||||
|
|
||||||
|
converter = DocumentConverter()
|
||||||
|
result = converter.convert(file_path)
|
||||||
|
return result.document.export_to_markdown()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
|
||||||
|
"""Extract PDF content using pypdf (synchronous).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_bytes: PDF file content as bytes
|
||||||
|
password: Optional password for encrypted PDFs
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Extracted text content
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If PDF is encrypted and password is incorrect or missing
|
||||||
|
"""
|
||||||
|
from pypdf import PdfReader # type: ignore
|
||||||
|
|
||||||
|
pdf_file = BytesIO(file_bytes)
|
||||||
|
reader = PdfReader(pdf_file)
|
||||||
|
|
||||||
|
# Check if PDF is encrypted
|
||||||
|
if reader.is_encrypted:
|
||||||
|
if not password:
|
||||||
|
raise Exception("PDF is encrypted but no password provided")
|
||||||
|
|
||||||
|
decrypt_result = reader.decrypt(password)
|
||||||
|
if decrypt_result == 0:
|
||||||
|
raise Exception("Incorrect PDF password")
|
||||||
|
|
||||||
|
# Extract text from all pages
|
||||||
|
content = ""
|
||||||
|
for page in reader.pages:
|
||||||
|
content += page.extract_text() + "\n"
|
||||||
|
|
||||||
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_docx(file_bytes: bytes) -> str:
|
||||||
|
"""Extract DOCX content (synchronous).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_bytes: DOCX file content as bytes
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Extracted text content
|
||||||
|
"""
|
||||||
|
from docx import Document # type: ignore
|
||||||
|
|
||||||
|
docx_file = BytesIO(file_bytes)
|
||||||
|
doc = Document(docx_file)
|
||||||
|
return "\n".join([paragraph.text for paragraph in doc.paragraphs])
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pptx(file_bytes: bytes) -> str:
|
||||||
|
"""Extract PPTX content (synchronous).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_bytes: PPTX file content as bytes
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Extracted text content
|
||||||
|
"""
|
||||||
|
from pptx import Presentation # type: ignore
|
||||||
|
|
||||||
|
pptx_file = BytesIO(file_bytes)
|
||||||
|
prs = Presentation(pptx_file)
|
||||||
|
content = ""
|
||||||
|
for slide in prs.slides:
|
||||||
|
for shape in slide.shapes:
|
||||||
|
if hasattr(shape, "text"):
|
||||||
|
content += shape.text + "\n"
|
||||||
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_xlsx(file_bytes: bytes) -> str:
|
||||||
|
"""Extract XLSX content (synchronous).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_bytes: XLSX file content as bytes
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Extracted text content
|
||||||
|
"""
|
||||||
|
from openpyxl import load_workbook # type: ignore
|
||||||
|
|
||||||
|
xlsx_file = BytesIO(file_bytes)
|
||||||
|
wb = load_workbook(xlsx_file)
|
||||||
|
content = ""
|
||||||
|
for sheet in wb:
|
||||||
|
content += f"Sheet: {sheet.title}\n"
|
||||||
|
for row in sheet.iter_rows(values_only=True):
|
||||||
|
content += (
|
||||||
|
"\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
|
||||||
|
)
|
||||||
|
content += "\n"
|
||||||
|
return content
|
||||||
|
|
||||||
|
|
||||||
async def pipeline_enqueue_file(
|
async def pipeline_enqueue_file(
|
||||||
rag: LightRAG, file_path: Path, track_id: str = None
|
rag: LightRAG, file_path: Path, track_id: str = None
|
||||||
) -> tuple[bool, str]:
|
) -> tuple[bool, str]:
|
||||||
|
|
@ -1072,87 +1201,21 @@ async def pipeline_enqueue_file(
|
||||||
|
|
||||||
case ".pdf":
|
case ".pdf":
|
||||||
try:
|
try:
|
||||||
if global_args.document_loading_engine == "DOCLING":
|
# Try DOCLING first if configured and available
|
||||||
if not pm.is_installed("docling"): # type: ignore
|
if (
|
||||||
pm.install("docling")
|
global_args.document_loading_engine == "DOCLING"
|
||||||
from docling.document_converter import DocumentConverter # type: ignore
|
and DOCLING_AVAILABLE
|
||||||
|
):
|
||||||
converter = DocumentConverter()
|
content = await asyncio.to_thread(
|
||||||
result = converter.convert(file_path)
|
_convert_with_docling, file_path
|
||||||
content = result.document.export_to_markdown()
|
)
|
||||||
else:
|
else:
|
||||||
if not pm.is_installed("pypdf"): # type: ignore
|
# Use pypdf (non-blocking via to_thread)
|
||||||
pm.install("pypdf")
|
content = await asyncio.to_thread(
|
||||||
if not pm.is_installed("pycryptodome"): # type: ignore
|
_extract_pdf_pypdf,
|
||||||
pm.install("pycryptodome")
|
file,
|
||||||
from pypdf import PdfReader # type: ignore
|
global_args.pdf_decrypt_password,
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
pdf_file = BytesIO(file)
|
|
||||||
reader = PdfReader(pdf_file)
|
|
||||||
|
|
||||||
# Check if PDF is encrypted
|
|
||||||
if reader.is_encrypted:
|
|
||||||
pdf_password = global_args.pdf_decrypt_password
|
|
||||||
if not pdf_password:
|
|
||||||
# PDF is encrypted but no password provided
|
|
||||||
error_files = [
|
|
||||||
{
|
|
||||||
"file_path": str(file_path.name),
|
|
||||||
"error_description": "[File Extraction]PDF is encrypted but no password provided",
|
|
||||||
"original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
|
|
||||||
"file_size": file_size,
|
|
||||||
}
|
|
||||||
]
|
|
||||||
await rag.apipeline_enqueue_error_documents(
|
|
||||||
error_files, track_id
|
|
||||||
)
|
)
|
||||||
logger.error(
|
|
||||||
f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
|
|
||||||
)
|
|
||||||
return False, track_id
|
|
||||||
|
|
||||||
# Try to decrypt with password
|
|
||||||
try:
|
|
||||||
decrypt_result = reader.decrypt(pdf_password)
|
|
||||||
if decrypt_result == 0:
|
|
||||||
# Password is incorrect
|
|
||||||
error_files = [
|
|
||||||
{
|
|
||||||
"file_path": str(file_path.name),
|
|
||||||
"error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
|
|
||||||
"original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
|
|
||||||
"file_size": file_size,
|
|
||||||
}
|
|
||||||
]
|
|
||||||
await rag.apipeline_enqueue_error_documents(
|
|
||||||
error_files, track_id
|
|
||||||
)
|
|
||||||
logger.error(
|
|
||||||
f"[File Extraction]Incorrect PDF password: {file_path.name}"
|
|
||||||
)
|
|
||||||
return False, track_id
|
|
||||||
except Exception as decrypt_error:
|
|
||||||
# Decryption process error
|
|
||||||
error_files = [
|
|
||||||
{
|
|
||||||
"file_path": str(file_path.name),
|
|
||||||
"error_description": "[File Extraction]PDF decryption failed",
|
|
||||||
"original_error": f"Error during PDF decryption: {str(decrypt_error)}",
|
|
||||||
"file_size": file_size,
|
|
||||||
}
|
|
||||||
]
|
|
||||||
await rag.apipeline_enqueue_error_documents(
|
|
||||||
error_files, track_id
|
|
||||||
)
|
|
||||||
logger.error(
|
|
||||||
f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
|
|
||||||
)
|
|
||||||
return False, track_id
|
|
||||||
|
|
||||||
# Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
|
|
||||||
for page in reader.pages:
|
|
||||||
content += page.extract_text() + "\n"
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_files = [
|
error_files = [
|
||||||
{
|
{
|
||||||
|
|
@ -1172,28 +1235,17 @@ async def pipeline_enqueue_file(
|
||||||
|
|
||||||
case ".docx":
|
case ".docx":
|
||||||
try:
|
try:
|
||||||
if global_args.document_loading_engine == "DOCLING":
|
# Try DOCLING first if configured and available
|
||||||
if not pm.is_installed("docling"): # type: ignore
|
if (
|
||||||
pm.install("docling")
|
global_args.document_loading_engine == "DOCLING"
|
||||||
from docling.document_converter import DocumentConverter # type: ignore
|
and DOCLING_AVAILABLE
|
||||||
|
):
|
||||||
converter = DocumentConverter()
|
content = await asyncio.to_thread(
|
||||||
result = converter.convert(file_path)
|
_convert_with_docling, file_path
|
||||||
content = result.document.export_to_markdown()
|
|
||||||
else:
|
|
||||||
if not pm.is_installed("python-docx"): # type: ignore
|
|
||||||
try:
|
|
||||||
pm.install("python-docx")
|
|
||||||
except Exception:
|
|
||||||
pm.install("docx")
|
|
||||||
from docx import Document # type: ignore
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
docx_file = BytesIO(file)
|
|
||||||
doc = Document(docx_file)
|
|
||||||
content = "\n".join(
|
|
||||||
[paragraph.text for paragraph in doc.paragraphs]
|
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
# Use python-docx (non-blocking via to_thread)
|
||||||
|
content = await asyncio.to_thread(_extract_docx, file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_files = [
|
error_files = [
|
||||||
{
|
{
|
||||||
|
|
@ -1213,26 +1265,17 @@ async def pipeline_enqueue_file(
|
||||||
|
|
||||||
case ".pptx":
|
case ".pptx":
|
||||||
try:
|
try:
|
||||||
if global_args.document_loading_engine == "DOCLING":
|
# Try DOCLING first if configured and available
|
||||||
if not pm.is_installed("docling"): # type: ignore
|
if (
|
||||||
pm.install("docling")
|
global_args.document_loading_engine == "DOCLING"
|
||||||
from docling.document_converter import DocumentConverter # type: ignore
|
and DOCLING_AVAILABLE
|
||||||
|
):
|
||||||
converter = DocumentConverter()
|
content = await asyncio.to_thread(
|
||||||
result = converter.convert(file_path)
|
_convert_with_docling, file_path
|
||||||
content = result.document.export_to_markdown()
|
)
|
||||||
else:
|
else:
|
||||||
if not pm.is_installed("python-pptx"): # type: ignore
|
# Use python-pptx (non-blocking via to_thread)
|
||||||
pm.install("pptx")
|
content = await asyncio.to_thread(_extract_pptx, file)
|
||||||
from pptx import Presentation # type: ignore
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
pptx_file = BytesIO(file)
|
|
||||||
prs = Presentation(pptx_file)
|
|
||||||
for slide in prs.slides:
|
|
||||||
for shape in slide.shapes:
|
|
||||||
if hasattr(shape, "text"):
|
|
||||||
content += shape.text + "\n"
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_files = [
|
error_files = [
|
||||||
{
|
{
|
||||||
|
|
@ -1252,33 +1295,17 @@ async def pipeline_enqueue_file(
|
||||||
|
|
||||||
case ".xlsx":
|
case ".xlsx":
|
||||||
try:
|
try:
|
||||||
if global_args.document_loading_engine == "DOCLING":
|
# Try DOCLING first if configured and available
|
||||||
if not pm.is_installed("docling"): # type: ignore
|
if (
|
||||||
pm.install("docling")
|
global_args.document_loading_engine == "DOCLING"
|
||||||
from docling.document_converter import DocumentConverter # type: ignore
|
and DOCLING_AVAILABLE
|
||||||
|
):
|
||||||
converter = DocumentConverter()
|
content = await asyncio.to_thread(
|
||||||
result = converter.convert(file_path)
|
_convert_with_docling, file_path
|
||||||
content = result.document.export_to_markdown()
|
)
|
||||||
else:
|
else:
|
||||||
if not pm.is_installed("openpyxl"): # type: ignore
|
# Use openpyxl (non-blocking via to_thread)
|
||||||
pm.install("openpyxl")
|
content = await asyncio.to_thread(_extract_xlsx, file)
|
||||||
from openpyxl import load_workbook # type: ignore
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
xlsx_file = BytesIO(file)
|
|
||||||
wb = load_workbook(xlsx_file)
|
|
||||||
for sheet in wb:
|
|
||||||
content += f"Sheet: {sheet.title}\n"
|
|
||||||
for row in sheet.iter_rows(values_only=True):
|
|
||||||
content += (
|
|
||||||
"\t".join(
|
|
||||||
str(cell) if cell is not None else ""
|
|
||||||
for cell in row
|
|
||||||
)
|
|
||||||
+ "\n"
|
|
||||||
)
|
|
||||||
content += "\n"
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_files = [
|
error_files = [
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -79,18 +79,20 @@ api = [
|
||||||
"python-multipart",
|
"python-multipart",
|
||||||
"pytz",
|
"pytz",
|
||||||
"uvicorn",
|
"uvicorn",
|
||||||
|
# Document processing dependencies (required for API document upload functionality)
|
||||||
|
"openpyxl>=3.0.0,<4.0.0", # XLSX processing
|
||||||
|
"pycryptodome>=3.0.0,<4.0.0", # PDF encryption support
|
||||||
|
"pypdf>=6.1.0", # PDF processing
|
||||||
|
"python-docx>=0.8.11,<2.0.0", # DOCX processing
|
||||||
|
"python-pptx>=0.6.21,<2.0.0", # PPTX processing
|
||||||
|
]
|
||||||
|
|
||||||
|
# Advanced document processing engine (optional)
|
||||||
|
docling = [
|
||||||
|
"docling>=2.0.0,<3.0.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Offline deployment dependencies (layered design for flexibility)
|
# Offline deployment dependencies (layered design for flexibility)
|
||||||
offline-docs = [
|
|
||||||
# Document processing dependencies
|
|
||||||
"openpyxl>=3.0.0,<4.0.0",
|
|
||||||
"pycryptodome>=3.0.0,<4.0.0",
|
|
||||||
"pypdf>=6.1.0",
|
|
||||||
"python-docx>=0.8.11,<2.0.0",
|
|
||||||
"python-pptx>=0.6.21,<2.0.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
offline-storage = [
|
offline-storage = [
|
||||||
# Storage backend dependencies
|
# Storage backend dependencies
|
||||||
"redis>=5.0.0,<8.0.0",
|
"redis>=5.0.0,<8.0.0",
|
||||||
|
|
@ -115,8 +117,8 @@ offline-llm = [
|
||||||
]
|
]
|
||||||
|
|
||||||
offline = [
|
offline = [
|
||||||
# Complete offline package (includes all offline dependencies)
|
# Complete offline package (includes api for document processing, plus storage and LLM)
|
||||||
"lightrag-hku[offline-docs,offline-storage,offline-llm]",
|
"lightrag-hku[api,offline-storage,offline-llm]",
|
||||||
]
|
]
|
||||||
|
|
||||||
evaluation = [
|
evaluation = [
|
||||||
|
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
# LightRAG Offline Dependencies - Document Processing
|
|
||||||
# Install with: pip install -r requirements-offline-docs.txt
|
|
||||||
# For offline installation:
|
|
||||||
# pip download -r requirements-offline-docs.txt -d ./packages
|
|
||||||
# pip install --no-index --find-links=./packages -r requirements-offline-docs.txt
|
|
||||||
#
|
|
||||||
# Recommended: Use pip install lightrag-hku[offline-docs] for the same effect
|
|
||||||
# Or use constraints: pip install --constraint constraints-offline.txt -r requirements-offline-docs.txt
|
|
||||||
|
|
||||||
# Document processing dependencies (with version constraints matching pyproject.toml)
|
|
||||||
openpyxl>=3.0.0,<4.0.0
|
|
||||||
pycryptodome>=3.0.0,<4.0.0
|
|
||||||
pypdf>=6.1.0
|
|
||||||
python-docx>=0.8.11,<2.0.0
|
|
||||||
python-pptx>=0.6.21,<2.0.0
|
|
||||||
Loading…
Add table
Reference in a new issue