Merge pull request #631 from langflow-ai/fix-large-pdf-ingest
fix: Configurable ingestion timeout limits
This commit is contained in:
commit
8fdeafb608
2 changed files with 29 additions and 2 deletions
|
|
@ -2,6 +2,14 @@
|
|||
# Set to true to disable Langflow ingestion and use traditional OpenRAG processor
|
||||
# If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
|
||||
DISABLE_INGEST_WITH_LANGFLOW=false
|
||||
|
||||
# Langflow HTTP timeout configuration (in seconds)
|
||||
# For large documents (300+ pages), ingestion can take 30+ minutes
|
||||
# Increase these values if you experience timeouts with very large PDFs
|
||||
# Default: 2400 seconds (40 minutes) total timeout, 30 seconds connection timeout
|
||||
# LANGFLOW_TIMEOUT=2400
|
||||
# LANGFLOW_CONNECT_TIMEOUT=30
|
||||
|
||||
# Generate a secret key as described at https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
|
||||
LANGFLOW_SECRET_KEY=
|
||||
|
||||
|
|
|
|||
|
|
@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
|
|||
"DISABLE_INGEST_WITH_LANGFLOW", "false"
|
||||
).lower() in ("true", "1", "yes")
|
||||
|
||||
# Langflow HTTP timeout configuration (in seconds)
|
||||
# For large documents (300+ pages), ingestion can take 30+ minutes
|
||||
# Defaults: 2400 seconds (40 minutes) total/read timeout, 30 seconds connection timeout
|
||||
# Timeout in seconds for Langflow HTTP requests, read from the LANGFLOW_TIMEOUT
# environment variable (falls back to "2400" when unset). The value is parsed
# with float(), so a non-numeric setting raises ValueError when this line runs.
LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400")) # 40 minutes
|
||||
# Connection timeout in seconds for the Langflow HTTP client, read from the
# LANGFLOW_CONNECT_TIMEOUT environment variable (falls back to "30" when unset).
# The Langflow client setup in this file also passes this value as the write
# and pool timeouts. Parsed with float(); a non-numeric setting raises ValueError.
LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30")) # 30 seconds
|
||||
|
||||
|
||||
def is_no_auth_mode():
|
||||
"""Check if we're running in no-auth mode (OAuth credentials missing)"""
|
||||
|
|
@ -317,9 +323,22 @@ class AppClients:
|
|||
# Initialize document converter
|
||||
self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
|
||||
|
||||
# Initialize Langflow HTTP client
|
||||
# Initialize Langflow HTTP client with extended timeouts for large documents
|
||||
# Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
|
||||
self.langflow_http_client = httpx.AsyncClient(
|
||||
base_url=LANGFLOW_URL, timeout=1200.0
|
||||
base_url=LANGFLOW_URL,
|
||||
timeout=httpx.Timeout(
|
||||
timeout=LANGFLOW_TIMEOUT, # Total timeout
|
||||
connect=LANGFLOW_CONNECT_TIMEOUT, # Connection timeout
|
||||
read=LANGFLOW_TIMEOUT, # Read timeout (most important for large PDFs)
|
||||
write=LANGFLOW_CONNECT_TIMEOUT, # Write timeout
|
||||
pool=LANGFLOW_CONNECT_TIMEOUT, # Pool timeout
|
||||
)
|
||||
)
|
||||
logger.info(
|
||||
"Initialized Langflow HTTP client with extended timeouts",
|
||||
timeout_seconds=LANGFLOW_TIMEOUT,
|
||||
connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
|
||||
)
|
||||
|
||||
return self
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue