Add timeout support to Gemini LLM and improve parameter handling

• Add timeout parameter to Gemini client
• Convert timeout seconds to milliseconds
• Update function signatures consistently
• Add Gemini thinking config example
• Clean up parameter documentation
yangdx 2025-11-07 15:50:14 +08:00
parent 3cb4eae492
commit fc40a36968
4 changed files with 35 additions and 13 deletions


@@ -194,9 +194,10 @@ LLM_BINDING_API_KEY=your_api_key
 ### Gemini example
 # LLM_BINDING=gemini
 # LLM_MODEL=gemini-flash-latest
-# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
 # LLM_BINDING_API_KEY=your_gemini_api_key
-# GEMINI_LLM_MAX_OUTPUT_TOKENS=8192
+# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
+GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
+# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
 # GEMINI_LLM_TEMPERATURE=0.7
 ### OpenAI Compatible API Specific Parameters
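The new `GEMINI_LLM_THINKING_CONFIG` entry carries a JSON object. As a sketch of how such a value can map onto the google-genai SDK (the server's actual parsing code is not shown in this diff and may differ), assuming `types.ThinkingConfig`:

```python
# Sketch only: turning the GEMINI_LLM_THINKING_CONFIG JSON into a
# google-genai generation config. Variable names are illustrative.
import json
import os

from google.genai import types  # google-genai SDK

raw = os.getenv("GEMINI_LLM_THINKING_CONFIG", "{}")
cfg = json.loads(raw)  # e.g. {"thinking_budget": 0, "include_thoughts": false}

generation_config = types.GenerateContentConfig(
    thinking_config=types.ThinkingConfig(
        thinking_budget=cfg.get("thinking_budget"),
        include_thoughts=cfg.get("include_thoughts"),
    ),
)
```

A `thinking_budget` of 0 with `include_thoughts` false asks the model to skip thinking output (for models that support it), which keeps latency and token usage down.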


@@ -512,7 +512,9 @@ def create_app(args):
         return optimized_azure_openai_model_complete

-    def create_optimized_gemini_llm_func(config_cache: LLMConfigCache, args):
+    def create_optimized_gemini_llm_func(
+        config_cache: LLMConfigCache, args, llm_timeout: int
+    ):
         """Create optimized Gemini LLM function with cached configuration"""

        async def optimized_gemini_model_complete(
@@ -527,6 +529,8 @@ def create_app(args):
             if history_messages is None:
                 history_messages = []
             # Use pre-processed configuration to avoid repeated parsing
+            kwargs["timeout"] = llm_timeout
+
             if (
                 config_cache.gemini_llm_options is not None
                 and "generation_config" not in kwargs
@@ -568,7 +572,7 @@ def create_app(args):
                 config_cache, args, llm_timeout
             )
         elif binding == "gemini":
-            return create_optimized_gemini_llm_func(config_cache, args)
+            return create_optimized_gemini_llm_func(config_cache, args, llm_timeout)
         else:  # openai and compatible
             # Use optimized function with pre-processed configuration
             return create_optimized_openai_llm_func(config_cache, args, llm_timeout)
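All three bindings now receive `llm_timeout` the same way: the factory binds it once at startup, and the returned coroutine injects it into `kwargs` on every request. A minimal, self-contained sketch of that closure pattern (not the project's actual code):

```python
# Illustrative only: startup-time configuration (the timeout) is captured
# when the wrapper is created, then injected into kwargs on each call,
# mirroring the kwargs["timeout"] = llm_timeout assignment in the diff above.
def make_llm_func(llm_timeout: int):
    async def llm_complete(prompt: str, **kwargs):
        kwargs["timeout"] = llm_timeout  # overrides any caller-supplied value
        # ... forward prompt and **kwargs to the real binding here ...
        return f"{prompt!r} completed with timeout={kwargs['timeout']}s"

    return llm_complete
```

Note that the assignment is unconditional, so a per-call `timeout` supplied by the caller is replaced by the server-wide value.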


@@ -33,24 +33,33 @@ LOG = logging.getLogger(__name__)
 @lru_cache(maxsize=8)
-def _get_gemini_client(api_key: str, base_url: str | None) -> genai.Client:
+def _get_gemini_client(
+    api_key: str, base_url: str | None, timeout: int | None = None
+) -> genai.Client:
     """
     Create (or fetch cached) Gemini client.

     Args:
         api_key: Google Gemini API key.
         base_url: Optional custom API endpoint.
+        timeout: Optional request timeout in milliseconds.

     Returns:
         genai.Client: Configured Gemini client instance.
     """
     client_kwargs: dict[str, Any] = {"api_key": api_key}

-    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT or timeout is not None:
         try:
-            client_kwargs["http_options"] = types.HttpOptions(api_endpoint=base_url)
+            http_options_kwargs = {}
+            if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+                http_options_kwargs["api_endpoint"] = base_url
+            if timeout is not None:
+                http_options_kwargs["timeout"] = timeout
+            client_kwargs["http_options"] = types.HttpOptions(**http_options_kwargs)
         except Exception as exc:  # pragma: no cover - defensive
-            LOG.warning("Failed to apply custom Gemini endpoint %s: %s", base_url, exc)
+            LOG.warning("Failed to apply custom Gemini http_options: %s", exc)

     try:
         return genai.Client(**client_kwargs)
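Because `_get_gemini_client` is wrapped in `@lru_cache(maxsize=8)`, the new `timeout` argument also becomes part of the cache key: each distinct `(api_key, base_url, timeout)` combination gets its own cached client. For example (placeholder values):

```python
# lru_cache keys on all arguments, so identical settings reuse one client
# while a different timeout creates and caches a new one.
a = _get_gemini_client("key", None, 30_000)  # 30 s expressed in milliseconds
b = _get_gemini_client("key", None, 30_000)  # cache hit: same object as a
c = _get_gemini_client("key", None, 60_000)  # different timeout: new client
assert a is b and a is not c
```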
@@ -166,6 +175,7 @@ async def gemini_complete_if_cache(
     stream: bool | None = None,
     keyword_extraction: bool = False,
     generation_config: dict[str, Any] | None = None,
+    timeout: int | None = None,
     **_: Any,
 ) -> str | AsyncIterator[str]:
     """
@@ -190,10 +200,10 @@ async def gemini_complete_if_cache(
         generation_config: Optional generation configuration dict.
         keyword_extraction: Whether to use JSON response format.
         token_tracker: Optional token usage tracker for monitoring API usage.
-        hashing_kv: Storage interface (for interface parity with other bindings).
         stream: Whether to stream the response.
+        hashing_kv: Storage interface (for interface parity with other bindings).
         enable_cot: Whether to include Chain of Thought content in the response.
-        timeout: Request timeout (handled by caller if needed).
+        timeout: Request timeout in seconds (will be converted to milliseconds for Gemini API).
         **_: Additional keyword arguments (ignored).

     Returns:
@@ -207,7 +217,9 @@ async def gemini_complete_if_cache(
     loop = asyncio.get_running_loop()
     key = _ensure_api_key(api_key)
-    client = _get_gemini_client(key, base_url)
+    # Convert timeout from seconds to milliseconds for Gemini API
+    timeout_ms = timeout * 1000 if timeout else None
+    client = _get_gemini_client(key, base_url, timeout_ms)

     history_block = _format_history_messages(history_messages)

     prompt_sections = []
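Pulled out on its own, the conversion looks like this; one subtlety is that the truthiness test folds a zero-second timeout into `None`:

```python
# The same expression as in the diff, isolated for illustration. Because
# `if timeout` is a truthiness test, a timeout of 0 seconds is forwarded as
# None ("no timeout") rather than as 0 ms.
def to_millis(timeout: int | None) -> int | None:
    return timeout * 1000 if timeout else None

assert to_millis(30) == 30_000
assert to_millis(None) is None
assert to_millis(0) is None  # 0 is falsy, so it is not forwarded
```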
@@ -279,7 +291,9 @@ async def gemini_complete_if_cache(
                         # Send thought content if COT is active
                         if cot_active:
-                            loop.call_soon_threadsafe(queue.put_nowait, thought_text)
+                            loop.call_soon_threadsafe(
+                                queue.put_nowait, thought_text
+                            )
                         else:
                             # COT disabled - only send regular content
                             if regular_text:
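The rewrapped call sits inside a producer/consumer bridge: a worker thread walks the Gemini stream and hands each chunk to the event loop's queue, since an `asyncio.Queue` must not be touched directly from another thread. A self-contained sketch of the pattern, independent of the Gemini specifics:

```python
import asyncio

# Standalone sketch of the thread-to-event-loop handoff the hunk above edits:
# the producer runs in a worker thread, so it schedules queue.put_nowait on
# the loop via call_soon_threadsafe instead of calling it directly.
async def main() -> None:
    loop = asyncio.get_running_loop()
    queue: asyncio.Queue[str | None] = asyncio.Queue()

    def producer() -> None:  # executed in a worker thread
        for chunk in ("thought ", "regular "):
            loop.call_soon_threadsafe(queue.put_nowait, chunk)
        loop.call_soon_threadsafe(queue.put_nowait, None)  # sentinel: done

    fut = loop.run_in_executor(None, producer)
    while (chunk := await queue.get()) is not None:
        print(chunk, end="", flush=True)
    await fut  # surface any producer exception

asyncio.run(main())
```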


@@ -138,6 +138,9 @@ async def openai_complete_if_cache(
     base_url: str | None = None,
     api_key: str | None = None,
     token_tracker: Any | None = None,
+    keyword_extraction: bool = False,  # Will be removed from kwargs before passing to OpenAI
+    stream: bool | None = None,
+    timeout: int | None = None,
     **kwargs: Any,
 ) -> str:
     """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
@@ -172,9 +175,9 @@ async def openai_complete_if_cache(
             - openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
              These will be passed to the client constructor but will be overridden by
              explicit parameters (api_key, base_url).
             - hashing_kv: Will be removed from kwargs before passing to OpenAI.
-            - keyword_extraction: Will be removed from kwargs before passing to OpenAI.
             - stream: Whether to stream the response. Default is False.
+            - timeout: Request timeout in seconds. Default is None.

     Returns:
         The completed text (with integrated COT content if available) or an async iterator
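With `keyword_extraction`, `stream`, and `timeout` promoted to explicit parameters, callers no longer need to smuggle them through `**kwargs`. A hedged usage sketch; the leading `model`/`prompt` parameters and all values are assumptions, not shown in this hunk:

```python
# Hypothetical call against the updated signature; the model name and API key
# are placeholders, and the leading (model, prompt, ...) parameters are
# assumed from the unchanged portion of the function.
result = await openai_complete_if_cache(
    "gpt-4o-mini",                   # model (placeholder)
    "Summarize the changelog.",      # prompt
    system_prompt="You are concise.",
    api_key="sk-...",                # placeholder
    stream=False,
    timeout=30,                      # seconds; None leaves the timeout unset
)
```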