cherry-pick 61b57cbb
This commit is contained in:
parent
fce5dc6be6
commit
a4d6692e2d
2 changed files with 64 additions and 0 deletions
|
|
@@ -357,6 +357,9 @@ def parse_args() -> argparse.Namespace:
|
||||||
# Select Document loading tool (DOCLING, DEFAULT)
|
# Select Document loading tool (DOCLING, DEFAULT)
|
||||||
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
|
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
|
||||||
|
|
||||||
|
# PDF decryption password
|
||||||
|
args.pdf_decrypt_password = get_env_value("PDF_DECRYPT_PASSWORD", None)
|
||||||
|
|
||||||
# Add environment variables that were previously read directly
|
# Add environment variables that were previously read directly
|
||||||
args.cors_origins = get_env_value("CORS_ORIGINS", "*")
|
args.cors_origins = get_env_value("CORS_ORIGINS", "*")
|
||||||
args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)
|
args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)
|
||||||
|
|
|
||||||
|
|
@@ -1068,6 +1068,67 @@ async def pipeline_enqueue_file(
|
||||||
|
|
||||||
pdf_file = BytesIO(file)
|
pdf_file = BytesIO(file)
|
||||||
reader = PdfReader(pdf_file)
|
reader = PdfReader(pdf_file)
|
||||||
|
|
||||||
|
# Check if PDF is encrypted
|
||||||
|
if reader.is_encrypted:
|
||||||
|
pdf_password = global_args.pdf_decrypt_password
|
||||||
|
if not pdf_password:
|
||||||
|
# PDF is encrypted but no password provided
|
||||||
|
error_files = [
|
||||||
|
{
|
||||||
|
"file_path": str(file_path.name),
|
||||||
|
"error_description": "[File Extraction]PDF is encrypted but no password provided",
|
||||||
|
"original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
|
||||||
|
"file_size": file_size,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
await rag.apipeline_enqueue_error_documents(
|
||||||
|
error_files, track_id
|
||||||
|
)
|
||||||
|
logger.error(
|
||||||
|
f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
|
||||||
|
)
|
||||||
|
return False, track_id
|
||||||
|
|
||||||
|
# Try to decrypt with password
|
||||||
|
try:
|
||||||
|
decrypt_result = reader.decrypt(pdf_password)
|
||||||
|
if decrypt_result == 0:
|
||||||
|
# Password is incorrect
|
||||||
|
error_files = [
|
||||||
|
{
|
||||||
|
"file_path": str(file_path.name),
|
||||||
|
"error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
|
||||||
|
"original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
|
||||||
|
"file_size": file_size,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
await rag.apipeline_enqueue_error_documents(
|
||||||
|
error_files, track_id
|
||||||
|
)
|
||||||
|
logger.error(
|
||||||
|
f"[File Extraction]Incorrect PDF password: {file_path.name}"
|
||||||
|
)
|
||||||
|
return False, track_id
|
||||||
|
except Exception as decrypt_error:
|
||||||
|
# Decryption process error
|
||||||
|
error_files = [
|
||||||
|
{
|
||||||
|
"file_path": str(file_path.name),
|
||||||
|
"error_description": "[File Extraction]PDF decryption failed",
|
||||||
|
"original_error": f"Error during PDF decryption: {str(decrypt_error)}",
|
||||||
|
"file_size": file_size,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
await rag.apipeline_enqueue_error_documents(
|
||||||
|
error_files, track_id
|
||||||
|
)
|
||||||
|
logger.error(
|
||||||
|
f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
|
||||||
|
)
|
||||||
|
return False, track_id
|
||||||
|
|
||||||
|
# Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
|
||||||
for page in reader.pages:
|
for page in reader.pages:
|
||||||
content += page.extract_text() + "\n"
|
content += page.extract_text() + "\n"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue