Add PDF decryption support for password-protected files

• Add PDF_DECRYPT_PASSWORD environment variable
• Check encryption status before reading
• Handle decryption errors gracefully
• Log detailed error messages
• Support both encrypted and unencrypted PDFs
This commit is contained in:
yangdx 2025-11-01 15:01:17 +08:00
parent 728721b14f
commit 61b57cbb5d
3 changed files with 67 additions and 0 deletions

View file

@ -119,6 +119,9 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
### Document processing output language: English, Chinese, French, German ...
SUMMARY_LANGUAGE=English
### Password used to decrypt password-protected PDF files (encrypted PDFs are rejected with an error if this is unset)
# PDF_DECRYPT_PASSWORD=your_pdf_password_here
### Entity types that the LLM will attempt to recognize
# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'

View file

@ -342,6 +342,9 @@ def parse_args() -> argparse.Namespace:
# Select Document loading tool (DOCLING, DEFAULT)
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
# PDF decryption password
args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)
# Add environment variables that were previously read directly
args.cors_origins = get_env_value("CORS_ORIGINS", "*")
args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)

View file

@ -1090,6 +1090,67 @@ async def pipeline_enqueue_file(
pdf_file = BytesIO(file)
reader = PdfReader(pdf_file)
# Check if PDF is encrypted
if reader.is_encrypted:
pdf_password = global_args.pdf_decrypt_password
if not pdf_password:
# PDF is encrypted but no password provided
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]PDF is encrypted but no password provided",
"original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
)
return False, track_id
# Try to decrypt with password
try:
decrypt_result = reader.decrypt(pdf_password)
if decrypt_result == 0:
# Password is incorrect
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
"original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]Incorrect PDF password: {file_path.name}"
)
return False, track_id
except Exception as decrypt_error:
# Decryption process error
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]PDF decryption failed",
"original_error": f"Error during PDF decryption: {str(decrypt_error)}",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
)
return False, track_id
# Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
for page in reader.pages:
content += page.extract_text() + "\n"
except Exception as e: