Merge pull request #631 from langflow-ai/fix-large-pdf-ingest

fix: Configurable ingestion timeout limits
2025-12-09 11:36:25 -08:00 · 2025-12-09 11:36:25 -08:00 · 8fdeafb608
commit 8fdeafb608
parent c3e8827a2e 79cc531dc4
2 changed files with 29 additions and 2 deletions
--- a/.env.example
+++ b/.env.example
@ -2,6 +2,14 @@
 # Set to true to disable Langflow ingestion and use traditional OpenRAG processor
 # If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
 DISABLE_INGEST_WITH_LANGFLOW=false
 # Langflow HTTP timeout configuration (in seconds)
 # For large documents (300+ pages), ingestion can take 30+ minutes
 # Increase these values if you experience timeouts with very large PDFs
 # Defaults: LANGFLOW_TIMEOUT=2400 seconds (40 minutes) read timeout, LANGFLOW_CONNECT_TIMEOUT=30 seconds connection timeout
 # LANGFLOW_TIMEOUT=2400
 # LANGFLOW_CONNECT_TIMEOUT=30
 # make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
 LANGFLOW_SECRET_KEY=
--- a/src/config/settings.py
+++ b/src/config/settings.py
@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
    "DISABLE_INGEST_WITH_LANGFLOW", "false"
 ).lower() in ("true", "1", "yes")
 # Langflow HTTP timeout configuration (in seconds)
 # For large documents (300+ pages), ingestion can take 30+ minutes
 # Defaults: 40 minutes read timeout, 30 seconds connect timeout
 LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400"))  # 40 minutes
 LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30"))  # 30 seconds
 def is_no_auth_mode():
    """Check if we're running in no-auth mode (OAuth credentials missing)"""
@ -317,9 +323,22 @@ class AppClients:
        # Initialize document converter
        self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
-        # Initialize Langflow HTTP client
+        # Initialize Langflow HTTP client with extended timeouts for large documents
        # Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
        self.langflow_http_client = httpx.AsyncClient(
-            base_url=LANGFLOW_URL, timeout=1200.0
+            base_url=LANGFLOW_URL,
            timeout=httpx.Timeout(
                timeout=LANGFLOW_TIMEOUT,  # Fallback for any timeout not set explicitly below (httpx has no overall request deadline)
                connect=LANGFLOW_CONNECT_TIMEOUT,  # Connection timeout
                read=LANGFLOW_TIMEOUT,  # Read timeout (most important for large PDFs)
                write=LANGFLOW_CONNECT_TIMEOUT,  # Write timeout
                pool=LANGFLOW_CONNECT_TIMEOUT,  # Pool timeout
            )
        )
        logger.info(
            "Initialized Langflow HTTP client with extended timeouts",
            timeout_seconds=LANGFLOW_TIMEOUT,
            connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
        )
        return self