Merge pull request #620 from langflow-ai/fix/watsonx_fixes

2025-12-05 18:47:46 -05:00 · 2025-12-05 18:47:46 -05:00 · 5bf2076b05
commit 5bf2076b05
parent 4eb707a160 0fb9809b8b
6 changed files with 1825 additions and 49 deletions
--- a/flows/components/opensearch.py
+++ b/flows/components/opensearch.py
@@ -865,58 +865,99 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
            metadatas.append(data_copy)
        self.log(metadatas)
-        # Generate embeddings (threaded for concurrency) with retries
+        # Generate embeddings with rate-limit-aware retry logic using tenacity
-        def embed_chunk(chunk_text: str) -> list[float]:
+        from tenacity import (
-            return selected_embedding.embed_documents([chunk_text])[0]
+            retry,
            retry_if_exception,
            stop_after_attempt,
            wait_exponential,
        )
-        vectors: list[list[float]] | None = None
+        def is_rate_limit_error(exception: Exception) -> bool:
-        last_exception: Exception | None = None
+            """Check if exception is a rate limit error (429)."""
-        delay = 1.0
+            error_str = str(exception).lower()
-        attempts = 0
+            return "429" in error_str or "rate_limit" in error_str or "rate limit" in error_str
-        max_attempts = 3
+
        def is_other_retryable_error(exception: Exception) -> bool:
            """Check if exception is retryable but not a rate limit error."""
            # Retry on most exceptions except for specific non-retryable ones
            # Add other non-retryable exceptions here if needed
            return not is_rate_limit_error(exception)
        # Create retry decorator for rate limit errors (longer backoff)
        retry_on_rate_limit = retry(
            retry=retry_if_exception(is_rate_limit_error),
            stop=stop_after_attempt(5),
            wait=wait_exponential(multiplier=2, min=2, max=30),
            reraise=True,
            before_sleep=lambda retry_state: logger.warning(
                f"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), "
                f"backing off for {retry_state.next_action.sleep:.1f}s"
            ),
        )
        # Create retry decorator for other errors (shorter backoff)
        retry_on_other_errors = retry(
            retry=retry_if_exception(is_other_retryable_error),
            stop=stop_after_attempt(3),
            wait=wait_exponential(multiplier=1, min=1, max=8),
            reraise=True,
            before_sleep=lambda retry_state: logger.warning(
                f"Error embedding chunk (attempt {retry_state.attempt_number}/3), "
                f"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}"
            ),
        )
        def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:
            """Embed a single chunk with rate-limit-aware retry logic."""
            @retry_on_rate_limit
            @retry_on_other_errors
            def _embed(text: str) -> list[float]:
                return selected_embedding.embed_documents([text])[0]
        while attempts < max_attempts:
            attempts += 1
            try:
-                # Restrict concurrency for IBM/Watsonx models to avoid rate limits
+                return _embed(chunk_text)
-                is_ibm = (embedding_model and "ibm" in str(embedding_model).lower()) or (
+            except Exception as e:
-                    selected_embedding and "watsonx" in type(selected_embedding).__name__.lower()
+                logger.error(
                    f"Failed to embed chunk {chunk_idx} after all retries: {e}",
                    error=str(e),
                )
-                logger.debug(f"Is IBM: {is_ibm}")
+                raise
                max_workers = 1 if is_ibm else min(max(len(texts), 1), 8)
-                with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Restrict concurrency for IBM/Watsonx models to avoid rate limits
-                    futures = {executor.submit(embed_chunk, chunk): idx for idx, chunk in enumerate(texts)}
+        is_ibm = (embedding_model and "ibm" in str(embedding_model).lower()) or (
-                    vectors = [None] * len(texts)
+            selected_embedding and "watsonx" in type(selected_embedding).__name__.lower()
-                    for future in as_completed(futures):
+        )
-                        idx = futures[future]
+        logger.debug(f"Is IBM: {is_ibm}")
                        vectors[idx] = future.result()
                break
            except Exception as exc:
                last_exception = exc
                if attempts >= max_attempts:
                    logger.error(
                        f"Embedding generation failed for model {embedding_model} after retries",
                        error=str(exc),
                    )
                    raise
                logger.warning(
                    "Threaded embedding generation failed for model %s (attempt %s/%s), retrying in %.1fs",
                    embedding_model,
                    attempts,
                    max_attempts,
                    delay,
                )
                time.sleep(delay)
                delay = min(delay * 2, 8.0)
-        if vectors is None:
+        # For IBM models, use sequential processing with rate limiting
-            raise RuntimeError(
+        # For other models, use parallel processing
-                f"Embedding generation failed for {embedding_model}: {last_exception}"
+        vectors: list[list[float]] = [None] * len(texts)
-                if last_exception
+
-                else f"Embedding generation failed for {embedding_model}"
+        if is_ibm:
            # Sequential processing with inter-request delay for IBM models
            inter_request_delay = 0.6  # ~1.67 req/s, safely under 2 req/s limit
            logger.info(
                f"Using sequential processing for IBM model with {inter_request_delay}s delay between requests"
            )
            for idx, chunk in enumerate(texts):
                if idx > 0:
                    # Add delay between requests (but not before the first one)
                    time.sleep(inter_request_delay)
                vectors[idx] = embed_chunk_with_retry(chunk, idx)
        else:
            # Parallel processing for non-IBM models
            max_workers = min(max(len(texts), 1), 8)
            logger.debug(f"Using parallel processing with {max_workers} workers")
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = {executor.submit(embed_chunk_with_retry, chunk, idx): idx for idx, chunk in enumerate(texts)}
                for future in as_completed(futures):
                    idx = futures[future]
                    vectors[idx] = future.result()
        if not vectors:
            self.log(f"No vectors generated from documents for model {embedding_model}.")
            return
--- a/flows/components/opensearch_multimodel.py
+++ b/flows/components/opensearch_multimodel.py
--- a/flows/ingestion_flow.json
+++ b/flows/ingestion_flow.json
--- a/flows/openrag_agent.json
+++ b/flows/openrag_agent.json
--- a/flows/openrag_nudges.json
+++ b/flows/openrag_nudges.json
--- a/flows/openrag_url_mcp.json
+++ b/flows/openrag_url_mcp.json