Add timeout support to Gemini LLM and improve parameter handling
• Add timeout parameter to Gemini client
• Convert timeout seconds to milliseconds
• Update function signatures consistently
• Add Gemini thinking config example
• Clean up parameter documentation
Parent: 3cb4eae492
Commit: fc40a36968
4 changed files with 35 additions and 13 deletions
```diff
@@ -194,9 +194,10 @@ LLM_BINDING_API_KEY=your_api_key
 ### Gemini example
 # LLM_BINDING=gemini
 # LLM_MODEL=gemini-flash-latest
 # LLM_BINDING_HOST=https://generativelanguage.googleapis.com
 # LLM_BINDING_API_KEY=your_gemini_api_key
-# GEMINI_LLM_MAX_OUTPUT_TOKENS=8192
-# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
+GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
+# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
+# GEMINI_LLM_TEMPERATURE=0.7
 
 ### OpenAI Compatible API Specific Parameters
```
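For context on the new example line: `GEMINI_LLM_THINKING_CONFIG` holds a JSON string, so whatever consumes it must parse it before the thinking settings can reach the Gemini generation options. A minimal, hedged sketch of such a loader (the helper name `load_gemini_thinking_config` is illustrative, not taken from the repo):

```python
import json
import os


def load_gemini_thinking_config() -> dict | None:
    """Parse GEMINI_LLM_THINKING_CONFIG (a JSON string) into a dict, or None if unset/invalid."""
    raw = os.environ.get("GEMINI_LLM_THINKING_CONFIG")
    if not raw:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None


# With the value from the example above this yields:
# {'thinking_budget': 0, 'include_thoughts': False}  -> thinking output effectively disabled
```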
```diff
@@ -512,7 +512,9 @@ def create_app(args):
 
         return optimized_azure_openai_model_complete
 
-    def create_optimized_gemini_llm_func(config_cache: LLMConfigCache, args):
+    def create_optimized_gemini_llm_func(
+        config_cache: LLMConfigCache, args, llm_timeout: int
+    ):
         """Create optimized Gemini LLM function with cached configuration"""
 
         async def optimized_gemini_model_complete(
```
```diff
@@ -527,6 +529,8 @@ def create_app(args):
             if history_messages is None:
                 history_messages = []
 
+            # Use pre-processed configuration to avoid repeated parsing
+            kwargs["timeout"] = llm_timeout
             if (
                 config_cache.gemini_llm_options is not None
                 and "generation_config" not in kwargs
```
```diff
@@ -568,7 +572,7 @@ def create_app(args):
             config_cache, args, llm_timeout
         )
     elif binding == "gemini":
-        return create_optimized_gemini_llm_func(config_cache, args)
+        return create_optimized_gemini_llm_func(config_cache, args, llm_timeout)
     else:  # openai and compatible
         # Use optimized function with pre-processed configuration
         return create_optimized_openai_llm_func(config_cache, args, llm_timeout)
```
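Taken together, the server-side hunks above change the factory signature, stamp the timeout into `kwargs` on every call, and pass it at the dispatch site. A trimmed, runnable sketch of that plumbing (the real closure also handles `generation_config` and calls the Gemini binding; the stand-in return value here is only for illustration):

```python
import asyncio


def create_optimized_gemini_llm_func(config_cache, args, llm_timeout: int):
    """Trimmed sketch of the factory from the diff: only the timeout plumbing is kept."""

    async def optimized_gemini_model_complete(prompt, history_messages=None, **kwargs):
        if history_messages is None:
            history_messages = []
        # Every call forwards the server-configured timeout to the Gemini binding.
        kwargs["timeout"] = llm_timeout
        return kwargs  # stand-in for the real gemini_complete_if_cache(...) call

    return optimized_gemini_model_complete


func = create_optimized_gemini_llm_func(config_cache=None, args=None, llm_timeout=180)
print(asyncio.run(func("hello")))  # {'timeout': 180}
```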
```diff
@@ -33,24 +33,33 @@ LOG = logging.getLogger(__name__)
 
 
 @lru_cache(maxsize=8)
-def _get_gemini_client(api_key: str, base_url: str | None) -> genai.Client:
+def _get_gemini_client(
+    api_key: str, base_url: str | None, timeout: int | None = None
+) -> genai.Client:
     """
     Create (or fetch cached) Gemini client.
 
     Args:
         api_key: Google Gemini API key.
         base_url: Optional custom API endpoint.
+        timeout: Optional request timeout in milliseconds.
 
     Returns:
         genai.Client: Configured Gemini client instance.
     """
     client_kwargs: dict[str, Any] = {"api_key": api_key}
 
-    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT or timeout is not None:
         try:
-            client_kwargs["http_options"] = types.HttpOptions(api_endpoint=base_url)
+            http_options_kwargs = {}
+            if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+                http_options_kwargs["api_endpoint"] = base_url
+            if timeout is not None:
+                http_options_kwargs["timeout"] = timeout
+
+            client_kwargs["http_options"] = types.HttpOptions(**http_options_kwargs)
         except Exception as exc:  # pragma: no cover - defensive
-            LOG.warning("Failed to apply custom Gemini endpoint %s: %s", base_url, exc)
+            LOG.warning("Failed to apply custom Gemini http_options: %s", exc)
 
     try:
         return genai.Client(**client_kwargs)
```
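Because `_get_gemini_client` is wrapped in `@lru_cache(maxsize=8)`, the new `timeout` argument becomes part of the cache key, so calls with different timeouts get distinct cached clients. A standalone illustration of that keying behaviour (no Gemini SDK involved; `make_client` is a stand-in for the cached constructor):

```python
from functools import lru_cache


@lru_cache(maxsize=8)
def make_client(api_key: str, base_url: str | None, timeout: int | None = None) -> tuple:
    # Stand-in for genai.Client construction; returns a distinct object per cache key.
    return (api_key, base_url, timeout)


a = make_client("key", None, 30_000)
b = make_client("key", None, 30_000)
c = make_client("key", None, 60_000)
assert a is b      # same (api_key, base_url, timeout) -> cached instance reused
assert a is not c  # a different timeout creates and caches a new client
```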
```diff
@@ -166,6 +175,7 @@ async def gemini_complete_if_cache(
     stream: bool | None = None,
     keyword_extraction: bool = False,
     generation_config: dict[str, Any] | None = None,
+    timeout: int | None = None,
     **_: Any,
 ) -> str | AsyncIterator[str]:
     """
```
```diff
@@ -190,10 +200,10 @@ async def gemini_complete_if_cache(
         generation_config: Optional generation configuration dict.
         keyword_extraction: Whether to use JSON response format.
         token_tracker: Optional token usage tracker for monitoring API usage.
-        hashing_kv: Storage interface (for interface parity with other bindings).
+        stream: Whether to stream the response.
         hashing_kv: Storage interface (for interface parity with other bindings).
         enable_cot: Whether to include Chain of Thought content in the response.
-        timeout: Request timeout (handled by caller if needed).
+        timeout: Request timeout in seconds (will be converted to milliseconds for Gemini API).
         **_: Additional keyword arguments (ignored).
 
     Returns:
```
```diff
@@ -207,7 +217,9 @@ async def gemini_complete_if_cache(
     loop = asyncio.get_running_loop()
 
     key = _ensure_api_key(api_key)
-    client = _get_gemini_client(key, base_url)
+    # Convert timeout from seconds to milliseconds for Gemini API
+    timeout_ms = timeout * 1000 if timeout else None
+    client = _get_gemini_client(key, base_url, timeout_ms)
 
     history_block = _format_history_messages(history_messages)
     prompt_sections = []
```
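Per the new comment in the diff, the binding-level timeout is configured in seconds and converted to milliseconds before it reaches the client. A quick standalone check of the exact expression used (note that a value of 0 falls through to None, i.e. it is treated as "no timeout" rather than zero milliseconds):

```python
def to_timeout_ms(timeout: int | None) -> int | None:
    """Same expression as in the diff: seconds -> milliseconds, None/0 -> None."""
    return timeout * 1000 if timeout else None


assert to_timeout_ms(180) == 180_000  # 180 s -> 180000 ms
assert to_timeout_ms(None) is None    # unset stays unset
assert to_timeout_ms(0) is None       # 0 is treated as "no timeout", not "0 ms"
```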
```diff
@@ -279,7 +291,9 @@ async def gemini_complete_if_cache(
 
                         # Send thought content if COT is active
                         if cot_active:
-                            loop.call_soon_threadsafe(queue.put_nowait, thought_text)
+                            loop.call_soon_threadsafe(
+                                queue.put_nowait, thought_text
+                            )
                         else:
                             # COT disabled - only send regular content
                             if regular_text:
```
```diff
@@ -138,6 +138,9 @@ async def openai_complete_if_cache(
     base_url: str | None = None,
     api_key: str | None = None,
     token_tracker: Any | None = None,
+    keyword_extraction: bool = False,  # Will be removed from kwargs before passing to OpenAI
+    stream: bool | None = None,
+    timeout: int | None = None,
     **kwargs: Any,
 ) -> str:
     """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
```
```diff
@@ -172,9 +175,9 @@ async def openai_complete_if_cache(
             - openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
               These will be passed to the client constructor but will be overridden by
               explicit parameters (api_key, base_url).
             - hashing_kv: Will be removed from kwargs before passing to OpenAI.
             - keyword_extraction: Will be removed from kwargs before passing to OpenAI.
-            - stream: Whether to stream the response. Default is False.
+            - timeout: Request timeout in seconds. Default is None.
 
     Returns:
         The completed text (with integrated COT content if available) or an async iterator
```
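Note the unit difference the two docstrings now encode: the OpenAI binding keeps `timeout` in seconds end to end, while the Gemini binding converts seconds to milliseconds for its HTTP options. The diff does not show where the OpenAI binding applies the new parameter, but the official openai Python client accepts a seconds-based timeout directly, so no conversion is needed on that side (purely illustrative, not the repo's code):

```python
# Illustrative only: the openai client takes timeout in seconds, so 180 can be passed as-is.
from openai import AsyncOpenAI

client = AsyncOpenAI(api_key="sk-...", timeout=180)  # 180 seconds, no unit conversion
```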