From a6f1ed7c48db5dbc51757eed750047399517ef01 Mon Sep 17 00:00:00 2001 From: Eric Hare Date: Tue, 2 Dec 2025 11:16:34 -0800 Subject: [PATCH] feat: Provide ollama health checking --- .../app/api/queries/useProviderHealthQuery.ts | 35 ++++++++-- src/api/provider_health.py | 69 +++++++++++++++---- src/api/provider_validation.py | 38 ++++++++-- 3 files changed, 119 insertions(+), 23 deletions(-) diff --git a/frontend/app/api/queries/useProviderHealthQuery.ts b/frontend/app/api/queries/useProviderHealthQuery.ts index 82ca2db2..6a52c24f 100644 --- a/frontend/app/api/queries/useProviderHealthQuery.ts +++ b/frontend/app/api/queries/useProviderHealthQuery.ts @@ -26,6 +26,9 @@ export interface ProviderHealthParams { provider?: "openai" | "ollama" | "watsonx"; } +// Track consecutive failures for exponential backoff +const failureCountMap = new Map<string, number>(); + export const useProviderHealthQuery = ( params?: ProviderHealthParams, options?: Omit< @@ -87,18 +90,42 @@ export const useProviderHealthQuery = ( } } + const queryKey = ["provider", "health"]; + const failureCountKey = queryKey.join("-"); + const queryResult = useQuery( { - queryKey: ["provider", "health"], + queryKey, queryFn: checkProviderHealth, retry: false, // Don't retry health checks automatically refetchInterval: (query) => { - // If healthy, check every 30 seconds; otherwise check every 3 seconds - return query.state.data?.status === "healthy" ? 
30000 : 3000; + const data = query.state.data; + const status = data?.status; + + // If healthy, reset failure count and check every 30 seconds + if (status === "healthy") { + failureCountMap.set(failureCountKey, 0); + return 30000; + } + + // If backend unavailable, use moderate polling + if (status === "backend-unavailable") { + return 15000; + } + + // For unhealthy/error status, use exponential backoff + const currentFailures = failureCountMap.get(failureCountKey) || 0; + failureCountMap.set(failureCountKey, currentFailures + 1); + + // Exponential backoff: 5s, 10s, 20s, then 30s + const backoffDelays = [5000, 10000, 20000, 30000]; + const delay = backoffDelays[Math.min(currentFailures, backoffDelays.length - 1)]; + + return delay; }, refetchOnWindowFocus: true, refetchOnMount: true, - staleTime: 30000, // Consider data stale after 25 seconds + staleTime: 30000, // Consider data stale after 30 seconds enabled: !!settings?.edited && options?.enabled !== false, // Only run after onboarding is complete ...options, }, diff --git a/src/api/provider_health.py b/src/api/provider_health.py index da7c7371..67b6e765 100644 --- a/src/api/provider_health.py +++ b/src/api/provider_health.py @@ -1,9 +1,10 @@ """Provider health check endpoint.""" +import httpx from starlette.responses import JSONResponse from utils.logging_config import get_logger from config.settings import get_openrag_config -from api.provider_validation import validate_provider_setup +from api.provider_validation import validate_provider_setup, _test_ollama_lightweight_health logger = get_logger(__name__) @@ -116,31 +117,69 @@ async def check_provider_health(request): ) else: # Validate both LLM and embedding providers + # Note: For Ollama, we use lightweight checks that don't require model inference. + # This prevents false-positive errors when Ollama is busy processing other requests. 
llm_error = None embedding_error = None # Validate LLM provider try: - await validate_provider_setup( - provider=provider, - api_key=api_key, - llm_model=llm_model, - endpoint=endpoint, - project_id=project_id, - ) + # For Ollama, use lightweight health check that doesn't block on active requests + if provider == "ollama": + try: + await _test_ollama_lightweight_health(endpoint) + except Exception as lightweight_error: + # If lightweight check fails, Ollama is down or misconfigured + llm_error = str(lightweight_error) + logger.error(f"LLM provider ({provider}) lightweight check failed: {llm_error}") + raise + else: + await validate_provider_setup( + provider=provider, + api_key=api_key, + llm_model=llm_model, + endpoint=endpoint, + project_id=project_id, + ) + except httpx.TimeoutException as e: + # Timeout means provider is busy, not misconfigured + if provider == "ollama": + llm_error = None # Don't treat as error + logger.info(f"LLM provider ({provider}) appears busy: {str(e)}") + else: + llm_error = str(e) + logger.error(f"LLM provider ({provider}) validation timed out: {llm_error}") except Exception as e: llm_error = str(e) logger.error(f"LLM provider ({provider}) validation failed: {llm_error}") # Validate embedding provider try: - await validate_provider_setup( - provider=embedding_provider, - api_key=embedding_api_key, - embedding_model=embedding_model, - endpoint=embedding_endpoint, - project_id=embedding_project_id, - ) + # For Ollama, use lightweight health check first + if embedding_provider == "ollama": + try: + await _test_ollama_lightweight_health(embedding_endpoint) + except Exception as lightweight_error: + # If lightweight check fails, Ollama is down or misconfigured + embedding_error = str(lightweight_error) + logger.error(f"Embedding provider ({embedding_provider}) lightweight check failed: {embedding_error}") + raise + else: + await validate_provider_setup( + provider=embedding_provider, + api_key=embedding_api_key, + 
embedding_model=embedding_model, + endpoint=embedding_endpoint, + project_id=embedding_project_id, + ) + except httpx.TimeoutException as e: + # Timeout means provider is busy, not misconfigured + if embedding_provider == "ollama": + embedding_error = None # Don't treat as error + logger.info(f"Embedding provider ({embedding_provider}) appears busy: {str(e)}") + else: + embedding_error = str(e) + logger.error(f"Embedding provider ({embedding_provider}) validation timed out: {embedding_error}") except Exception as e: embedding_error = str(e) logger.error(f"Embedding provider ({embedding_provider}) validation failed: {embedding_error}") diff --git a/src/api/provider_validation.py b/src/api/provider_validation.py index 2fcc1e65..cb3d571a 100644 --- a/src/api/provider_validation.py +++ b/src/api/provider_validation.py @@ -364,6 +364,36 @@ async def _test_watsonx_embedding( # Ollama validation functions +async def _test_ollama_lightweight_health(endpoint: str) -> None: + """Test Ollama availability with lightweight /api/tags endpoint. + + This endpoint is very fast and doesn't block on active requests, + making it ideal for health checks when Ollama might be busy. 
+ """ + try: + ollama_url = transform_localhost_url(endpoint) + url = f"{ollama_url}/api/tags" + + async with httpx.AsyncClient() as client: + response = await client.get( + url, + timeout=10.0, # Short timeout for lightweight check + ) + + if response.status_code != 200: + logger.error(f"Ollama lightweight health check failed: {response.status_code}") + raise Exception(f"Ollama endpoint not responding: {response.status_code}") + + logger.info("Ollama lightweight health check passed") + + except httpx.TimeoutException: + logger.error("Ollama lightweight health check timed out") + raise Exception("Ollama endpoint timed out") + except Exception as e: + logger.error(f"Ollama lightweight health check failed: {str(e)}") + raise + + async def _test_ollama_completion_with_tools(llm_model: str, endpoint: str) -> None: """Test Ollama completion with tool calling.""" try: @@ -401,7 +431,7 @@ async def _test_ollama_completion_with_tools(llm_model: str, endpoint: str) -> N response = await client.post( url, json=payload, - timeout=30.0, + timeout=120.0, # Increased timeout for Ollama when potentially busy ) if response.status_code != 200: @@ -412,7 +442,7 @@ async def _test_ollama_completion_with_tools(llm_model: str, endpoint: str) -> N except httpx.TimeoutException: logger.error("Ollama completion test timed out") - raise Exception("Request timed out") + raise httpx.TimeoutException("Ollama is busy or model inference timed out") except Exception as e: logger.error(f"Ollama completion test failed: {str(e)}") raise @@ -433,7 +463,7 @@ async def _test_ollama_embedding(embedding_model: str, endpoint: str) -> None: response = await client.post( url, json=payload, - timeout=30.0, + timeout=120.0, # Increased timeout for Ollama when potentially busy ) if response.status_code != 200: @@ -448,7 +478,7 @@ async def _test_ollama_embedding(embedding_model: str, endpoint: str) -> None: except httpx.TimeoutException: logger.error("Ollama embedding test timed out") - raise 
Exception("Request timed out") + raise httpx.TimeoutException("Ollama is busy or embedding generation timed out") except Exception as e: logger.error(f"Ollama embedding test failed: {str(e)}") raise