Merge pull request #631 from langflow-ai/fix-large-pdf-ingest

fix: Configurable ingestion timeout limits
This commit is contained in:
Eric Hare 2025-12-09 11:36:25 -08:00 committed by GitHub
commit 8fdeafb608
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 29 additions and 2 deletions

View file

@ -2,6 +2,14 @@
# Set to true to disable Langflow ingestion and use traditional OpenRAG processor
# If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
DISABLE_INGEST_WITH_LANGFLOW=false
# Langflow HTTP timeout configuration (in seconds)
# For large documents (300+ pages), ingestion can take 30+ minutes
# Increase these values if you experience timeouts with very large PDFs
# Defaults: LANGFLOW_TIMEOUT=2400 seconds (40 minutes, used for reading the response),
# LANGFLOW_CONNECT_TIMEOUT=30 seconds (used for connect, write, and pool acquisition)
# LANGFLOW_TIMEOUT=2400
# LANGFLOW_CONNECT_TIMEOUT=30
# Generate a secret key as described at https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
LANGFLOW_SECRET_KEY=

View file

@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
"DISABLE_INGEST_WITH_LANGFLOW", "false"
).lower() in ("true", "1", "yes")
# Langflow HTTP timeout configuration (in seconds)
# For large documents (300+ pages), ingestion can take 30+ minutes
# Defaults: LANGFLOW_TIMEOUT is 2400 seconds (40 minutes),
# LANGFLOW_CONNECT_TIMEOUT is 30 seconds
# `or` (rather than a getenv default) also covers a variable that is set but
# empty, e.g. a blank `LANGFLOW_TIMEOUT=` line in .env; float("") would
# otherwise raise ValueError as soon as this module is imported.
LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT") or "2400")  # 40 minutes
LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT") or "30")  # 30 seconds
def is_no_auth_mode():
"""Check if we're running in no-auth mode (OAuth credentials missing)"""
@ -317,9 +323,22 @@ class AppClients:
# Initialize document converter
self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
# Initialize Langflow HTTP client
# Initialize Langflow HTTP client with extended timeouts for large documents
# Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
self.langflow_http_client = httpx.AsyncClient(
base_url=LANGFLOW_URL, timeout=1200.0
base_url=LANGFLOW_URL,
timeout=httpx.Timeout(
timeout=LANGFLOW_TIMEOUT, # Default for any phase not set explicitly below (httpx has no total-request timeout)
connect=LANGFLOW_CONNECT_TIMEOUT, # Connection timeout
read=LANGFLOW_TIMEOUT, # Read timeout (most important for large PDFs)
write=LANGFLOW_CONNECT_TIMEOUT, # Write timeout
pool=LANGFLOW_CONNECT_TIMEOUT, # Pool timeout
)
)
logger.info(
"Initialized Langflow HTTP client with extended timeouts",
timeout_seconds=LANGFLOW_TIMEOUT,
connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
)
return self