diff --git a/env.example b/env.example
index ce1eab0f..1e581751 100644
--- a/env.example
+++ b/env.example
@@ -194,9 +194,10 @@ LLM_BINDING_API_KEY=your_api_key
 ### Gemini example
 # LLM_BINDING=gemini
 # LLM_MODEL=gemini-flash-latest
-# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
 # LLM_BINDING_API_KEY=your_gemini_api_key
-# GEMINI_LLM_MAX_OUTPUT_TOKENS=8192
+# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
+# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
+# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
 # GEMINI_LLM_TEMPERATURE=0.7
 
 ### OpenAI Compatible API Specific Parameters
diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 70e17bb6..c9bb1a44 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -512,7 +512,9 @@ def create_app(args):
 
         return optimized_azure_openai_model_complete
 
-    def create_optimized_gemini_llm_func(config_cache: LLMConfigCache, args):
+    def create_optimized_gemini_llm_func(
+        config_cache: LLMConfigCache, args, llm_timeout: int
+    ):
         """Create optimized Gemini LLM function with cached configuration"""
 
         async def optimized_gemini_model_complete(
@@ -527,6 +529,8 @@ def create_app(args):
             if history_messages is None:
                 history_messages = []
 
+            # Use pre-processed configuration to avoid repeated parsing
+            kwargs["timeout"] = llm_timeout
             if (
                 config_cache.gemini_llm_options is not None
                 and "generation_config" not in kwargs
@@ -568,7 +572,7 @@ def create_app(args):
                 config_cache, args, llm_timeout
             )
         elif binding == "gemini":
-            return create_optimized_gemini_llm_func(config_cache, args)
+            return create_optimized_gemini_llm_func(config_cache, args, llm_timeout)
         else:  # openai and compatible
             # Use optimized function with pre-processed configuration
             return create_optimized_openai_llm_func(config_cache, args, llm_timeout)
diff --git a/lightrag/llm/gemini.py b/lightrag/llm/gemini.py
index 4cec3e71..f06ec6b3 100644
--- a/lightrag/llm/gemini.py
+++ b/lightrag/llm/gemini.py
@@ -33,24 +33,33 @@ LOG = logging.getLogger(__name__)
 
 
 @lru_cache(maxsize=8)
-def _get_gemini_client(api_key: str, base_url: str | None) -> genai.Client:
+def _get_gemini_client(
+    api_key: str, base_url: str | None, timeout: int | None = None
+) -> genai.Client:
     """
     Create (or fetch cached) Gemini client.
 
     Args:
         api_key: Google Gemini API key.
         base_url: Optional custom API endpoint.
+        timeout: Optional request timeout in milliseconds.
 
     Returns:
         genai.Client: Configured Gemini client instance.
     """
     client_kwargs: dict[str, Any] = {"api_key": api_key}
-    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT or timeout is not None:
         try:
-            client_kwargs["http_options"] = types.HttpOptions(api_endpoint=base_url)
+            http_options_kwargs = {}
+            if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+                http_options_kwargs["api_endpoint"] = base_url
+            if timeout is not None:
+                http_options_kwargs["timeout"] = timeout
+
+            client_kwargs["http_options"] = types.HttpOptions(**http_options_kwargs)
         except Exception as exc:  # pragma: no cover - defensive
-            LOG.warning("Failed to apply custom Gemini endpoint %s: %s", base_url, exc)
+            LOG.warning("Failed to apply custom Gemini http_options: %s", exc)
 
     try:
         return genai.Client(**client_kwargs)
@@ -166,6 +175,7 @@ async def gemini_complete_if_cache(
     stream: bool | None = None,
     keyword_extraction: bool = False,
     generation_config: dict[str, Any] | None = None,
+    timeout: int | None = None,
     **_: Any,
 ) -> str | AsyncIterator[str]:
     """
@@ -190,10 +200,10 @@ async def gemini_complete_if_cache(
         generation_config: Optional generation configuration dict.
         keyword_extraction: Whether to use JSON response format.
        token_tracker: Optional token usage tracker for monitoring API usage.
-        hashing_kv: Storage interface (for interface parity with other bindings).
         stream: Whether to stream the response.
+        hashing_kv: Storage interface (for interface parity with other bindings).
         enable_cot: Whether to include Chain of Thought content in the response.
-        timeout: Request timeout (handled by caller if needed).
+        timeout: Request timeout in seconds (will be converted to milliseconds for Gemini API).
         **_: Additional keyword arguments (ignored).
 
     Returns:
@@ -207,7 +217,9 @@ async def gemini_complete_if_cache(
     loop = asyncio.get_running_loop()
 
     key = _ensure_api_key(api_key)
-    client = _get_gemini_client(key, base_url)
+    # Convert timeout from seconds to milliseconds for Gemini API
+    timeout_ms = timeout * 1000 if timeout else None
+    client = _get_gemini_client(key, base_url, timeout_ms)
 
     history_block = _format_history_messages(history_messages)
     prompt_sections = []
@@ -279,7 +291,9 @@ async def gemini_complete_if_cache(
 
                     # Send thought content if COT is active
                     if cot_active:
-                        loop.call_soon_threadsafe(queue.put_nowait, thought_text)
+                        loop.call_soon_threadsafe(
+                            queue.put_nowait, thought_text
+                        )
                     else:
                         # COT disabled - only send regular content
                         if regular_text:
diff --git a/lightrag/llm/openai.py b/lightrag/llm/openai.py
index 2cdbb72b..511a3a62 100644
--- a/lightrag/llm/openai.py
+++ b/lightrag/llm/openai.py
@@ -138,6 +138,9 @@ async def openai_complete_if_cache(
     base_url: str | None = None,
     api_key: str | None = None,
     token_tracker: Any | None = None,
+    keyword_extraction: bool = False,  # Will be removed from kwargs before passing to OpenAI
+    stream: bool | None = None,
+    timeout: int | None = None,
     **kwargs: Any,
 ) -> str:
     """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
@@ -172,9 +175,9 @@ async def openai_complete_if_cache(
             - openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
               These will be passed to the client constructor but will be overridden by
               explicit parameters (api_key, base_url).
-            - hashing_kv: Will be removed from kwargs before passing to OpenAI.
             - keyword_extraction: Will be removed from kwargs before passing to OpenAI.
-            - stream: Whether to stream the response. Default is False.
+            - timeout: Request timeout in seconds.
+              Default is None.
 
     Returns:
         The completed text (with integrated COT content if available) or an async iterator
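
The new `GEMINI_LLM_THINKING_CONFIG` entry in env.example holds a JSON object. Below is a hedged sketch of how such a value could map onto the google-genai thinking options; the parsing shown is illustrative only and not necessarily how LightRAG's config layer implements it:

```python
# Illustrative only: turn a GEMINI_LLM_THINKING_CONFIG-style JSON value into
# google-genai thinking options. {"thinking_budget": 0} disables thinking on
# models that support it; include_thoughts controls thought summaries.
import json
import os

from google.genai import types

raw = os.environ.get(
    "GEMINI_LLM_THINKING_CONFIG",
    '{"thinking_budget": 0, "include_thoughts": false}',
)
opts = json.loads(raw)

generation_config = types.GenerateContentConfig(
    thinking_config=types.ThinkingConfig(
        thinking_budget=opts.get("thinking_budget"),
        include_thoughts=opts.get("include_thoughts"),
    )
)
```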
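A note on the unit conversion in the gemini.py hunk: LightRAG passes `timeout` in seconds, while `types.HttpOptions.timeout` in the google-genai SDK expects milliseconds, hence the `timeout * 1000` step. A minimal sketch of that flow; `make_gemini_client` is a hypothetical helper, not code from this PR (the real entry point is `_get_gemini_client()`):

```python
# Sketch only: mirrors the seconds -> milliseconds conversion this diff adds.
from google import genai
from google.genai import types


def make_gemini_client(api_key: str, timeout_s: int | None = None) -> genai.Client:
    # LightRAG convention: timeout arrives in seconds (e.g. the server's llm_timeout).
    # google-genai convention: HttpOptions.timeout is in milliseconds.
    timeout_ms = timeout_s * 1000 if timeout_s else None
    http_options = (
        types.HttpOptions(timeout=timeout_ms) if timeout_ms is not None else None
    )
    return genai.Client(api_key=api_key, http_options=http_options)
```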
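One behavioral consequence of threading `timeout` through the `@lru_cache`-decorated factory: the timeout becomes part of the cache key, so calls with different timeouts produce distinct client instances. A small self-contained demonstration of that caching behavior (generic Python, not LightRAG code):

```python
from functools import lru_cache


@lru_cache(maxsize=8)
def get_client(api_key: str, base_url: str | None, timeout: int | None = None) -> object:
    # Stand-in for genai.Client construction; one object per unique argument tuple.
    return object()


a = get_client("key", None, 5000)
b = get_client("key", None, 5000)   # cache hit: same instance as a
c = get_client("key", None, 10000)  # different timeout -> new instance
assert a is b and a is not c
```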