From 61b57cbb5d6ae59e85a32753365f09195b369ec1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Nov 2025 15:01:17 +0800 Subject: [PATCH] Add PDF decryption support for password-protected files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add PDF_DECRYPT_PASSWORD env variable • Check encryption status before reading • Handle decrypt errors gracefully • Log detailed error messages • Support both encrypted/plain PDFs --- env.example | 3 ++ lightrag/api/config.py | 3 ++ lightrag/api/routers/document_routes.py | 61 +++++++++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/env.example b/env.example index 3c5113ff..90977e6e 100644 --- a/env.example +++ b/env.example @@ -119,6 +119,9 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true ### Document processing output language: English, Chinese, French, German ... SUMMARY_LANGUAGE=English +### PDF decryption password for protected PDF files +# PDF_DECRYPT_PASSWORD=your_pdf_password_here + ### Entity types that the LLM will attempt to recognize # ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]' diff --git a/lightrag/api/config.py b/lightrag/api/config.py index de569f47..de4fa00b 100644 --- a/lightrag/api/config.py +++ b/lightrag/api/config.py @@ -342,6 +342,9 @@ def parse_args() -> argparse.Namespace: # Select Document loading tool (DOCLING, DEFAULT) args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT") + # PDF decryption password + args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None) + # Add environment variables that were previously read directly args.cors_origins = get_env_value("CORS_ORIGINS", "*") args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index c794cb20..3e479a53 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1090,6 +1090,67 @@ async def pipeline_enqueue_file( pdf_file = BytesIO(file) reader = PdfReader(pdf_file) + + # Check if PDF is encrypted + if reader.is_encrypted: + pdf_password = global_args.pdf_decrypt_password + if not pdf_password: + # PDF is encrypted but no password provided + error_files = [ + { + "file_path": str(file_path.name), + "error_description": "[File Extraction]PDF is encrypted but no password provided", + "original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file", + "file_size": file_size, + } + ] + await rag.apipeline_enqueue_error_documents( + error_files, track_id + ) + logger.error( + f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}" + ) + return False, track_id + + # Try to decrypt with password + try: + decrypt_result = reader.decrypt(pdf_password) + if decrypt_result == 0: + # Password is incorrect + error_files = [ + { + "file_path": str(file_path.name), + "error_description": "[File Extraction]Failed to decrypt PDF - incorrect password", + "original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file", + "file_size": file_size, + } + ] + await rag.apipeline_enqueue_error_documents( + error_files, track_id + ) + logger.error( + f"[File Extraction]Incorrect PDF password: {file_path.name}" + ) + return False, track_id + except Exception as decrypt_error: + # Decryption process error + error_files = [ + { + "file_path": str(file_path.name), + "error_description": "[File Extraction]PDF decryption failed", + "original_error": f"Error during PDF decryption: {str(decrypt_error)}", + "file_size": file_size, + } + ] + await rag.apipeline_enqueue_error_documents( + error_files, track_id + ) + logger.error( + f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}" + ) + return False, track_id + + # Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly) for page in reader.pages: content += page.extract_text() + "\n" except Exception as e: