Add timeout support to Gemini LLM and improve parameter handling
• Add timeout parameter to Gemini client
• Convert timeout seconds to milliseconds
• Update function signatures consistently
• Add Gemini thinking config example
• Clean up parameter documentation
Parent: 3cb4eae492
Commit: fc40a36968
4 changed files with 35 additions and 13 deletions
```diff
@@ -194,9 +194,10 @@ LLM_BINDING_API_KEY=your_api_key
 ### Gemini example
 # LLM_BINDING=gemini
 # LLM_MODEL=gemini-flash-latest
 # LLM_BINDING_HOST=https://generativelanguage.googleapis.com
 # LLM_BINDING_API_KEY=your_gemini_api_key
-# GEMINI_LLM_MAX_OUTPUT_TOKENS=8192
-# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
+GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
+# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
+# GEMINI_LLM_TEMPERATURE=0.7
 
 ### OpenAI Compatible API Specific Parameters
```
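For context on the new example line: `GEMINI_LLM_THINKING_CONFIG` holds a JSON string, so whatever consumes it must parse it before the thinking settings can reach the Gemini generation options. A minimal, hedged sketch of such a loader (the helper name `load_gemini_thinking_config` is illustrative, not taken from the repo):

```python
import json
import os


def load_gemini_thinking_config() -> dict | None:
    """Parse GEMINI_LLM_THINKING_CONFIG (a JSON string) into a dict, or None if unset/invalid."""
    raw = os.environ.get("GEMINI_LLM_THINKING_CONFIG")
    if not raw:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None


# With the value from the example above this yields:
# {'thinking_budget': 0, 'include_thoughts': False}  -> thinking output effectively disabled
```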
```diff
@@ -512,7 +512,9 @@ def create_app(args):
 
         return optimized_azure_openai_model_complete
 
-    def create_optimized_gemini_llm_func(config_cache: LLMConfigCache, args):
+    def create_optimized_gemini_llm_func(
+        config_cache: LLMConfigCache, args, llm_timeout: int
+    ):
         """Create optimized Gemini LLM function with cached configuration"""
 
         async def optimized_gemini_model_complete(
```
```diff
@@ -527,6 +529,8 @@ def create_app(args):
             if history_messages is None:
                 history_messages = []
 
+            # Use pre-processed configuration to avoid repeated parsing
+            kwargs["timeout"] = llm_timeout
             if (
                 config_cache.gemini_llm_options is not None
                 and "generation_config" not in kwargs
```
```diff
@@ -568,7 +572,7 @@ def create_app(args):
             config_cache, args, llm_timeout
         )
     elif binding == "gemini":
-        return create_optimized_gemini_llm_func(config_cache, args)
+        return create_optimized_gemini_llm_func(config_cache, args, llm_timeout)
     else:  # openai and compatible
         # Use optimized function with pre-processed configuration
         return create_optimized_openai_llm_func(config_cache, args, llm_timeout)
```
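Taken together, the server-side hunks above change the factory signature, stamp the timeout into `kwargs` on every call, and pass it at the dispatch site. A trimmed, runnable sketch of that plumbing (the real closure also handles `generation_config` and calls the Gemini binding; the stand-in return value here is only for illustration):

```python
import asyncio


def create_optimized_gemini_llm_func(config_cache, args, llm_timeout: int):
    """Trimmed sketch of the factory from the diff: only the timeout plumbing is kept."""

    async def optimized_gemini_model_complete(prompt, history_messages=None, **kwargs):
        if history_messages is None:
            history_messages = []
        # Every call forwards the server-configured timeout to the Gemini binding.
        kwargs["timeout"] = llm_timeout
        return kwargs  # stand-in for the real gemini_complete_if_cache(...) call

    return optimized_gemini_model_complete


func = create_optimized_gemini_llm_func(config_cache=None, args=None, llm_timeout=180)
print(asyncio.run(func("hello")))  # {'timeout': 180}
```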
```diff
@@ -33,24 +33,33 @@ LOG = logging.getLogger(__name__)
 
 
 @lru_cache(maxsize=8)
-def _get_gemini_client(api_key: str, base_url: str | None) -> genai.Client:
+def _get_gemini_client(
+    api_key: str, base_url: str | None, timeout: int | None = None
+) -> genai.Client:
     """
     Create (or fetch cached) Gemini client.
 
     Args:
         api_key: Google Gemini API key.
         base_url: Optional custom API endpoint.
+        timeout: Optional request timeout in milliseconds.
 
     Returns:
         genai.Client: Configured Gemini client instance.
     """
     client_kwargs: dict[str, Any] = {"api_key": api_key}
 
-    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT or timeout is not None:
         try:
-            client_kwargs["http_options"] = types.HttpOptions(api_endpoint=base_url)
+            http_options_kwargs = {}
+            if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+                http_options_kwargs["api_endpoint"] = base_url
+            if timeout is not None:
+                http_options_kwargs["timeout"] = timeout
+
+            client_kwargs["http_options"] = types.HttpOptions(**http_options_kwargs)
         except Exception as exc:  # pragma: no cover - defensive
-            LOG.warning("Failed to apply custom Gemini endpoint %s: %s", base_url, exc)
+            LOG.warning("Failed to apply custom Gemini http_options: %s", exc)
 
     try:
         return genai.Client(**client_kwargs)
```
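Because `_get_gemini_client` is wrapped in `@lru_cache(maxsize=8)`, the new `timeout` argument becomes part of the cache key, so calls with different timeouts get distinct cached clients. A standalone illustration of that keying behaviour (no Gemini SDK involved; `make_client` is a stand-in for the cached constructor):

```python
from functools import lru_cache


@lru_cache(maxsize=8)
def make_client(api_key: str, base_url: str | None, timeout: int | None = None) -> tuple:
    # Stand-in for genai.Client construction; returns a distinct object per cache key.
    return (api_key, base_url, timeout)


a = make_client("key", None, 30_000)
b = make_client("key", None, 30_000)
c = make_client("key", None, 60_000)
assert a is b      # same (api_key, base_url, timeout) -> cached instance reused
assert a is not c  # a different timeout creates and caches a new client
```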
```diff
@@ -166,6 +175,7 @@ async def gemini_complete_if_cache(
     stream: bool | None = None,
     keyword_extraction: bool = False,
     generation_config: dict[str, Any] | None = None,
+    timeout: int | None = None,
     **_: Any,
 ) -> str | AsyncIterator[str]:
     """
```
```diff
@@ -190,10 +200,10 @@ async def gemini_complete_if_cache(
         generation_config: Optional generation configuration dict.
         keyword_extraction: Whether to use JSON response format.
         token_tracker: Optional token usage tracker for monitoring API usage.
-        hashing_kv: Storage interface (for interface parity with other bindings).
+        stream: Whether to stream the response.
         hashing_kv: Storage interface (for interface parity with other bindings).
         enable_cot: Whether to include Chain of Thought content in the response.
-        timeout: Request timeout (handled by caller if needed).
+        timeout: Request timeout in seconds (will be converted to milliseconds for Gemini API).
         **_: Additional keyword arguments (ignored).
 
     Returns:
```
```diff
@@ -207,7 +217,9 @@ async def gemini_complete_if_cache(
     loop = asyncio.get_running_loop()
 
     key = _ensure_api_key(api_key)
-    client = _get_gemini_client(key, base_url)
+    # Convert timeout from seconds to milliseconds for Gemini API
+    timeout_ms = timeout * 1000 if timeout else None
+    client = _get_gemini_client(key, base_url, timeout_ms)
 
     history_block = _format_history_messages(history_messages)
     prompt_sections = []
```
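Per the new comment in the diff, the binding-level timeout is configured in seconds and converted to milliseconds before it reaches the client. A quick standalone check of the exact expression used (note that a value of 0 falls through to None, i.e. it is treated as "no timeout" rather than zero milliseconds):

```python
def to_timeout_ms(timeout: int | None) -> int | None:
    """Same expression as in the diff: seconds -> milliseconds, None/0 -> None."""
    return timeout * 1000 if timeout else None


assert to_timeout_ms(180) == 180_000  # 180 s -> 180000 ms
assert to_timeout_ms(None) is None    # unset stays unset
assert to_timeout_ms(0) is None       # 0 is treated as "no timeout", not "0 ms"
```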
```diff
@@ -279,7 +291,9 @@ async def gemini_complete_if_cache(
 
                         # Send thought content if COT is active
                         if cot_active:
-                            loop.call_soon_threadsafe(queue.put_nowait, thought_text)
+                            loop.call_soon_threadsafe(
+                                queue.put_nowait, thought_text
+                            )
                         else:
                             # COT disabled - only send regular content
                             if regular_text:
```
```diff
@@ -138,6 +138,9 @@ async def openai_complete_if_cache(
     base_url: str | None = None,
     api_key: str | None = None,
     token_tracker: Any | None = None,
+    keyword_extraction: bool = False,  # Will be removed from kwargs before passing to OpenAI
+    stream: bool | None = None,
+    timeout: int | None = None,
     **kwargs: Any,
 ) -> str:
     """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
```
```diff
@@ -172,9 +175,9 @@ async def openai_complete_if_cache(
             - openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
               These will be passed to the client constructor but will be overridden by
               explicit parameters (api_key, base_url).
             - hashing_kv: Will be removed from kwargs before passing to OpenAI.
             - keyword_extraction: Will be removed from kwargs before passing to OpenAI.
-            - stream: Whether to stream the response. Default is False.
+            - timeout: Request timeout in seconds. Default is None.
 
     Returns:
         The completed text (with integrated COT content if available) or an async iterator
```
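Note the unit difference the two docstrings now encode: the OpenAI binding keeps `timeout` in seconds end to end, while the Gemini binding converts seconds to milliseconds for its HTTP options. The diff does not show where the OpenAI binding applies the new parameter, but the official openai Python client accepts a seconds-based timeout directly, so no conversion is needed on that side (purely illustrative, not the repo's code):

```python
# Illustrative only: the openai client takes timeout in seconds, so 180 can be passed as-is.
from openai import AsyncOpenAI

client = AsyncOpenAI(api_key="sk-...", timeout=180)  # 180 seconds, no unit conversion
```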