Add timeout support to Gemini LLM and improve parameter handling

• Add timeout parameter to Gemini client
• Convert timeout seconds to milliseconds
• Update function signatures consistently
• Add Gemini thinking config example
• Clean up parameter documentation
yangdx 2025-11-07 15:50:14 +08:00
parent 3cb4eae492
commit fc40a36968
4 changed files with 35 additions and 13 deletions


@@ -194,9 +194,10 @@ LLM_BINDING_API_KEY=your_api_key
 ### Gemini example
 # LLM_BINDING=gemini
 # LLM_MODEL=gemini-flash-latest
-# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
 # LLM_BINDING_API_KEY=your_gemini_api_key
-# GEMINI_LLM_MAX_OUTPUT_TOKENS=8192
+# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
+GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
+# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
 # GEMINI_LLM_TEMPERATURE=0.7
 ### OpenAI Compatible API Specific Parameters
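A note on the new GEMINI_LLM_THINKING_CONFIG entry: its value is a JSON object, which the server side can parse and map onto google-genai's ThinkingConfig (a thinking_budget of 0 disables thinking). A minimal sketch of that mapping, assuming the google-genai SDK is installed; the parsing code here is illustrative, not the project's:

import json
import os

from google.genai import types

# JSON string as in the env example above
raw = os.environ.get(
    "GEMINI_LLM_THINKING_CONFIG",
    '{"thinking_budget": 0, "include_thoughts": false}',
)
cfg = json.loads(raw)

# google-genai exposes these as snake_case fields on ThinkingConfig
thinking_config = types.ThinkingConfig(
    thinking_budget=cfg.get("thinking_budget"),
    include_thoughts=cfg.get("include_thoughts"),
)
print(thinking_config)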


@@ -512,7 +512,9 @@ def create_app(args):
         return optimized_azure_openai_model_complete

-    def create_optimized_gemini_llm_func(config_cache: LLMConfigCache, args):
+    def create_optimized_gemini_llm_func(
+        config_cache: LLMConfigCache, args, llm_timeout: int
+    ):
         """Create optimized Gemini LLM function with cached configuration"""

         async def optimized_gemini_model_complete(
@@ -527,6 +529,8 @@ def create_app(args):
             if history_messages is None:
                 history_messages = []

+            # Use pre-processed configuration to avoid repeated parsing
+            kwargs["timeout"] = llm_timeout
             if (
                 config_cache.gemini_llm_options is not None
                 and "generation_config" not in kwargs
@@ -568,7 +572,7 @@ def create_app(args):
                 config_cache, args, llm_timeout
             )
         elif binding == "gemini":
-            return create_optimized_gemini_llm_func(config_cache, args)
+            return create_optimized_gemini_llm_func(config_cache, args, llm_timeout)
         else:  # openai and compatible
             # Use optimized function with pre-processed configuration
             return create_optimized_openai_llm_func(config_cache, args, llm_timeout)
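The factories above share a closure pattern: values resolved once at startup, such as llm_timeout, are captured by the returned async function and injected into every call's kwargs. A stripped-down sketch of the pattern with hypothetical names, not the project's actual code:

from typing import Any, Awaitable, Callable

def make_llm_func(llm_timeout: int) -> Callable[..., Awaitable[str]]:
    # llm_timeout is resolved once and captured by the closure
    async def llm_func(prompt: str, **kwargs: Any) -> str:
        kwargs["timeout"] = llm_timeout  # injected on every call
        return f"would call the binding with timeout={kwargs['timeout']}"
    return llm_func

In create_app, each binding's factory receives the same llm_timeout, so the per-binding completion functions behave uniformly with respect to timeouts.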


@@ -33,24 +33,33 @@ LOG = logging.getLogger(__name__)
 @lru_cache(maxsize=8)
-def _get_gemini_client(api_key: str, base_url: str | None) -> genai.Client:
+def _get_gemini_client(
+    api_key: str, base_url: str | None, timeout: int | None = None
+) -> genai.Client:
     """
     Create (or fetch cached) Gemini client.

     Args:
         api_key: Google Gemini API key.
         base_url: Optional custom API endpoint.
+        timeout: Optional request timeout in milliseconds.

     Returns:
         genai.Client: Configured Gemini client instance.
     """
     client_kwargs: dict[str, Any] = {"api_key": api_key}

-    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT or timeout is not None:
         try:
-            client_kwargs["http_options"] = types.HttpOptions(api_endpoint=base_url)
+            http_options_kwargs = {}
+            if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+                http_options_kwargs["api_endpoint"] = base_url
+            if timeout is not None:
+                http_options_kwargs["timeout"] = timeout
+            client_kwargs["http_options"] = types.HttpOptions(**http_options_kwargs)
         except Exception as exc:  # pragma: no cover - defensive
-            LOG.warning("Failed to apply custom Gemini endpoint %s: %s", base_url, exc)
+            LOG.warning("Failed to apply custom Gemini http_options: %s", exc)

     try:
         return genai.Client(**client_kwargs)
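Since _get_gemini_client is decorated with @lru_cache, adding timeout to its signature also adds it to the cache key: calls that differ only in timeout now get (and keep) distinct client instances. A self-contained illustration of that lru_cache behavior:

from functools import lru_cache

@lru_cache(maxsize=8)
def get_client(api_key: str, base_url: str | None, timeout: int | None = None) -> object:
    # Each distinct (api_key, base_url, timeout) tuple creates a new entry
    return object()

a = get_client("key", None, 30_000)
b = get_client("key", None, 30_000)
c = get_client("key", None, 60_000)
assert a is b       # same arguments -> cached instance reused
assert a is not c   # different timeout -> separate client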
@@ -166,6 +175,7 @@ async def gemini_complete_if_cache(
     stream: bool | None = None,
     keyword_extraction: bool = False,
     generation_config: dict[str, Any] | None = None,
+    timeout: int | None = None,
     **_: Any,
 ) -> str | AsyncIterator[str]:
     """
@@ -190,10 +200,10 @@ async def gemini_complete_if_cache(
         generation_config: Optional generation configuration dict.
         keyword_extraction: Whether to use JSON response format.
         token_tracker: Optional token usage tracker for monitoring API usage.
-        hashing_kv: Storage interface (for interface parity with other bindings).
         stream: Whether to stream the response.
+        hashing_kv: Storage interface (for interface parity with other bindings).
         enable_cot: Whether to include Chain of Thought content in the response.
-        timeout: Request timeout (handled by caller if needed).
+        timeout: Request timeout in seconds (will be converted to milliseconds for Gemini API).
         **_: Additional keyword arguments (ignored).

     Returns:
@@ -207,7 +217,9 @@ async def gemini_complete_if_cache(
     loop = asyncio.get_running_loop()
     key = _ensure_api_key(api_key)
-    client = _get_gemini_client(key, base_url)
+    # Convert timeout from seconds to milliseconds for Gemini API
+    timeout_ms = timeout * 1000 if timeout else None
+    client = _get_gemini_client(key, base_url, timeout_ms)

     history_block = _format_history_messages(history_messages)

     prompt_sections = []
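The conversion above exists because callers pass the timeout in seconds while the Gemini HTTP options expect milliseconds. Note that the expression `timeout * 1000 if timeout else None` also maps a timeout of 0 to None (no explicit timeout) rather than 0 ms, which appears intentional. A quick check of the expression:

def to_ms(timeout: int | None) -> int | None:
    # Mirrors the expression in the diff: falsy values (None and 0) become None
    return timeout * 1000 if timeout else None

assert to_ms(30) == 30_000
assert to_ms(None) is None
assert to_ms(0) is None  # 0 is treated as "no timeout", not "0 ms"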
@@ -279,7 +291,9 @@ async def gemini_complete_if_cache(
                         # Send thought content if COT is active
                         if cot_active:
-                            loop.call_soon_threadsafe(queue.put_nowait, thought_text)
+                            loop.call_soon_threadsafe(
+                                queue.put_nowait, thought_text
+                            )
                         else:
                             # COT disabled - only send regular content
                             if regular_text:
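The reformatted call above is part of a common streaming pattern: a blocking SDK iterator runs in a worker thread, and chunks are handed back to the event loop via loop.call_soon_threadsafe, because asyncio queues are not thread-safe on their own. A minimal, runnable illustration of the pattern (not the project's code):

import asyncio
import threading

async def main() -> None:
    loop = asyncio.get_running_loop()
    queue: asyncio.Queue[str] = asyncio.Queue()

    def worker() -> None:
        # Called from a non-event-loop thread: schedule the put on the loop
        loop.call_soon_threadsafe(queue.put_nowait, "thought chunk")

    threading.Thread(target=worker).start()
    print(await queue.get())

asyncio.run(main())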


@@ -138,6 +138,9 @@ async def openai_complete_if_cache(
     base_url: str | None = None,
     api_key: str | None = None,
     token_tracker: Any | None = None,
+    keyword_extraction: bool = False,  # Will be removed from kwargs before passing to OpenAI
+    stream: bool | None = None,
+    timeout: int | None = None,
     **kwargs: Any,
 ) -> str:
     """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
@@ -172,9 +175,9 @@ async def openai_complete_if_cache(
         - openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
           These will be passed to the client constructor but will be overridden by
           explicit parameters (api_key, base_url).
-        - hashing_kv: Will be removed from kwargs before passing to OpenAI.
         - keyword_extraction: Will be removed from kwargs before passing to OpenAI.
         - stream: Whether to stream the response. Default is False.
+        - timeout: Request timeout in seconds. Default is None.

     Returns:
         The completed text (with integrated COT content if available) or an async iterator
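With keyword_extraction, stream, and timeout promoted to explicit parameters, call sites can pass them directly instead of routing them through **kwargs. A hedged usage sketch; the leading model/prompt parameters and the import path are assumptions, since this diff only shows the middle of the signature:

async def example() -> str:
    # Import path depends on the project layout; parameter names below
    # match the new signature shown in the diff
    return await openai_complete_if_cache(
        "gpt-4o-mini",
        "Summarize the document.",
        api_key="sk-...",
        stream=False,            # explicit instead of arriving via **kwargs
        keyword_extraction=False,
        timeout=30,              # seconds
    )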