Add PDF decryption support for password-protected files

• Add PDF_DECRYPT_PASSWORD environment variable
• Check encryption status before reading
• Handle decryption errors gracefully
• Log detailed error messages
• Support both encrypted and plain PDFs
2025-11-01 15:01:17 +08:00 · 2025-11-01 15:01:17 +08:00 · 61b57cbb5d
commit 61b57cbb5d
parent 728721b14f
3 changed files with 67 additions and 0 deletions
--- a/env.example
+++ b/env.example
@@ -119,6 +119,9 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
 ### Document processing output language: English, Chinese, French, German ...
 SUMMARY_LANGUAGE=English

+### PDF decryption password for protected PDF files
+# PDF_DECRYPT_PASSWORD=your_pdf_password_here
+
 ### Entity types that the LLM will attempt to recognize
 # ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'

--- a/lightrag/api/config.py
+++ b/lightrag/api/config.py
@@ -342,6 +342,9 @@ def parse_args() -> argparse.Namespace:
    # Select Document loading tool (DOCLING, DEFAULT)
    args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")

+    # PDF decryption password
+    args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)
+
    # Add environment variables that were previously read directly
    args.cors_origins = get_env_value("CORS_ORIGINS", "*")
    args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1090,6 +1090,67 @@ async def pipeline_enqueue_file(

                            pdf_file = BytesIO(file)
                            reader = PdfReader(pdf_file)
+
+                            # Check if PDF is encrypted
+                            if reader.is_encrypted:
+                                pdf_password = global_args.pdf_decrypt_password
+                                if not pdf_password:
+                                    # PDF is encrypted but no password provided
+                                    error_files = [
+                                        {
+                                            "file_path": str(file_path.name),
+                                            "error_description": "[File Extraction]PDF is encrypted but no password provided",
+                                            "original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
+                                            "file_size": file_size,
+                                        }
+                                    ]
+                                    await rag.apipeline_enqueue_error_documents(
+                                        error_files, track_id
+                                    )
+                                    logger.error(
+                                        f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
+                                    )
+                                    return False, track_id
+
+                                # Try to decrypt with password
+                                try:
+                                    decrypt_result = reader.decrypt(pdf_password)
+                                    if decrypt_result == 0:
+                                        # Password is incorrect
+                                        error_files = [
+                                            {
+                                                "file_path": str(file_path.name),
+                                                "error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
+                                                "original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
+                                                "file_size": file_size,
+                                            }
+                                        ]
+                                        await rag.apipeline_enqueue_error_documents(
+                                            error_files, track_id
+                                        )
+                                        logger.error(
+                                            f"[File Extraction]Incorrect PDF password: {file_path.name}"
+                                        )
+                                        return False, track_id
+                                except Exception as decrypt_error:
+                                    # Decryption process error
+                                    error_files = [
+                                        {
+                                            "file_path": str(file_path.name),
+                                            "error_description": "[File Extraction]PDF decryption failed",
+                                            "original_error": f"Error during PDF decryption: {str(decrypt_error)}",
+                                            "file_size": file_size,
+                                        }
+                                    ]
+                                    await rag.apipeline_enqueue_error_documents(
+                                        error_files, track_id
+                                    )
+                                    logger.error(
+                                        f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
+                                    )
+                                    return False, track_id
+
+                            # Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
                            for page in reader.pages:
                                content += page.extract_text() + "\n"
                    except Exception as e: