Add timeout support to Gemini LLM and improve parameter handling
• Add timeout parameter to Gemini client
• Convert timeout seconds to milliseconds
• Update function signatures consistently
• Add Gemini thinking config example
• Clean up parameter documentation
parent 3cb4eae492
commit fc40a36968
4 changed files with 35 additions and 13 deletions
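In outline, the change wires a request timeout from the server configuration through to the Gemini HTTP client. A minimal sketch of that flow (`make_client` is a hypothetical stand-in for the patched `_get_gemini_client`; the names and the seconds-to-milliseconds conversion come from the hunks below):

```python
from google import genai
from google.genai import types


def make_client(api_key: str, base_url: str | None, timeout: int | None) -> genai.Client:
    # The server config supplies the timeout in seconds; HttpOptions expects
    # milliseconds, hence the * 1000 conversion this commit introduces.
    timeout_ms = timeout * 1000 if timeout else None

    http_options_kwargs: dict = {}
    if base_url:
        http_options_kwargs["api_endpoint"] = base_url
    if timeout_ms is not None:
        http_options_kwargs["timeout"] = timeout_ms

    if http_options_kwargs:
        return genai.Client(
            api_key=api_key, http_options=types.HttpOptions(**http_options_kwargs)
        )
    return genai.Client(api_key=api_key)
```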
@@ -194,9 +194,10 @@ LLM_BINDING_API_KEY=your_api_key
 ### Gemini example
 # LLM_BINDING=gemini
 # LLM_MODEL=gemini-flash-latest
-# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
 # LLM_BINDING_API_KEY=your_gemini_api_key
-# GEMINI_LLM_MAX_OUTPUT_TOKENS=8192
+# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
+GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
+# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
 # GEMINI_LLM_TEMPERATURE=0.7

 ### OpenAI Compatible API Specific Parameters
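The new GEMINI_LLM_THINKING_CONFIG entry above is a JSON object whose keys mirror google-genai's `types.ThinkingConfig`. A sketch of how such a value could be turned into a generation config (the env-var parsing here is illustrative, not the repository's actual loader):

```python
import json
import os

from google.genai import types

# Illustrative parsing only; the defaults are taken from the env example above.
raw = os.environ.get(
    "GEMINI_LLM_THINKING_CONFIG",
    '{"thinking_budget": 0, "include_thoughts": false}',
)
thinking = json.loads(raw)

generation_config = types.GenerateContentConfig(
    max_output_tokens=int(os.environ.get("GEMINI_LLM_MAX_OUTPUT_TOKENS", "9000")),
    thinking_config=types.ThinkingConfig(
        thinking_budget=thinking.get("thinking_budget"),
        include_thoughts=thinking.get("include_thoughts"),
    ),
)
```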
@@ -512,7 +512,9 @@ def create_app(args):

         return optimized_azure_openai_model_complete

-    def create_optimized_gemini_llm_func(config_cache: LLMConfigCache, args):
+    def create_optimized_gemini_llm_func(
+        config_cache: LLMConfigCache, args, llm_timeout: int
+    ):
         """Create optimized Gemini LLM function with cached configuration"""

         async def optimized_gemini_model_complete(
@@ -527,6 +529,8 @@ def create_app(args):
             if history_messages is None:
                 history_messages = []

+            # Use pre-processed configuration to avoid repeated parsing
+            kwargs["timeout"] = llm_timeout
             if (
                 config_cache.gemini_llm_options is not None
                 and "generation_config" not in kwargs
@@ -568,7 +572,7 @@ def create_app(args):
                config_cache, args, llm_timeout
            )
        elif binding == "gemini":
-            return create_optimized_gemini_llm_func(config_cache, args)
+            return create_optimized_gemini_llm_func(config_cache, args, llm_timeout)
        else: # openai and compatible
            # Use optimized function with pre-processed configuration
            return create_optimized_openai_llm_func(config_cache, args, llm_timeout)
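Taken together, the three hunks above thread llm_timeout from the binding dispatch into every Gemini request. Condensed, the pattern looks roughly like this (a sketch with an abbreviated inner signature, not the full factory code):

```python
def create_optimized_gemini_llm_func(config_cache, args, llm_timeout: int):
    """Sketch: capture llm_timeout in the closure and inject it into each request."""

    async def optimized_gemini_model_complete(prompt, history_messages=None, **kwargs):
        if history_messages is None:
            history_messages = []
        kwargs["timeout"] = llm_timeout  # forwarded to the Gemini binding as seconds
        # ... build generation_config from config_cache and call the Gemini binding ...

    return optimized_gemini_model_complete
```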
@@ -33,24 +33,33 @@ LOG = logging.getLogger(__name__)


 @lru_cache(maxsize=8)
-def _get_gemini_client(api_key: str, base_url: str | None) -> genai.Client:
+def _get_gemini_client(
+    api_key: str, base_url: str | None, timeout: int | None = None
+) -> genai.Client:
     """
     Create (or fetch cached) Gemini client.

     Args:
         api_key: Google Gemini API key.
         base_url: Optional custom API endpoint.
+        timeout: Optional request timeout in milliseconds.

     Returns:
         genai.Client: Configured Gemini client instance.
     """
     client_kwargs: dict[str, Any] = {"api_key": api_key}

-    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT or timeout is not None:
         try:
-            client_kwargs["http_options"] = types.HttpOptions(api_endpoint=base_url)
+            http_options_kwargs = {}
+            if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+                http_options_kwargs["api_endpoint"] = base_url
+            if timeout is not None:
+                http_options_kwargs["timeout"] = timeout
+
+            client_kwargs["http_options"] = types.HttpOptions(**http_options_kwargs)
         except Exception as exc: # pragma: no cover - defensive
-            LOG.warning("Failed to apply custom Gemini endpoint %s: %s", base_url, exc)
+            LOG.warning("Failed to apply custom Gemini http_options: %s", exc)

     try:
         return genai.Client(**client_kwargs)
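Worth noting: because `_get_gemini_client` is wrapped in `@lru_cache(maxsize=8)`, the new timeout argument becomes part of the cache key, so configurations that differ only in timeout get distinct client instances. A small illustration (hypothetical values, assumed to run inside the same module as the function above):

```python
# Timeouts are already in milliseconds at this point.
c1 = _get_gemini_client("my-key", None, 30_000)
c2 = _get_gemini_client("my-key", None, 30_000)  # same arguments -> same cached client
c3 = _get_gemini_client("my-key", None, 60_000)  # different timeout -> separate client
assert c1 is c2
assert c1 is not c3
```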
@@ -166,6 +175,7 @@ async def gemini_complete_if_cache(
     stream: bool | None = None,
     keyword_extraction: bool = False,
     generation_config: dict[str, Any] | None = None,
+    timeout: int | None = None,
     **_: Any,
 ) -> str | AsyncIterator[str]:
     """
@@ -190,10 +200,10 @@ async def gemini_complete_if_cache(
         generation_config: Optional generation configuration dict.
         keyword_extraction: Whether to use JSON response format.
         token_tracker: Optional token usage tracker for monitoring API usage.
-        hashing_kv: Storage interface (for interface parity with other bindings).
         stream: Whether to stream the response.
+        hashing_kv: Storage interface (for interface parity with other bindings).
         enable_cot: Whether to include Chain of Thought content in the response.
-        timeout: Request timeout (handled by caller if needed).
+        timeout: Request timeout in seconds (will be converted to milliseconds for Gemini API).
         **_: Additional keyword arguments (ignored).

     Returns:
@@ -207,7 +217,9 @@ async def gemini_complete_if_cache(
     loop = asyncio.get_running_loop()

     key = _ensure_api_key(api_key)
-    client = _get_gemini_client(key, base_url)
+    # Convert timeout from seconds to milliseconds for Gemini API
+    timeout_ms = timeout * 1000 if timeout else None
+    client = _get_gemini_client(key, base_url, timeout_ms)

     history_block = _format_history_messages(history_messages)
     prompt_sections = []
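One subtlety in the conversion above: the expression treats any falsy timeout (None or 0) as "no timeout", so a zero value never reaches HttpOptions. A quick standalone check of that behaviour (plain Python, no external dependencies):

```python
def to_ms(timeout):
    # Same expression as in the hunk above.
    return timeout * 1000 if timeout else None


assert to_ms(30) == 30_000   # 30 seconds -> 30,000 milliseconds
assert to_ms(None) is None   # unset -> client created without an http_options timeout
assert to_ms(0) is None      # zero is falsy, so it is also treated as "no timeout"
```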
@@ -279,7 +291,9 @@ async def gemini_complete_if_cache(

             # Send thought content if COT is active
             if cot_active:
-                loop.call_soon_threadsafe(queue.put_nowait, thought_text)
+                loop.call_soon_threadsafe(
+                    queue.put_nowait, thought_text
+                )
             else:
                 # COT disabled - only send regular content
                 if regular_text:
@@ -138,6 +138,9 @@ async def openai_complete_if_cache(
     base_url: str | None = None,
     api_key: str | None = None,
     token_tracker: Any | None = None,
+    keyword_extraction: bool = False, # Will be removed from kwargs before passing to OpenAI
+    stream: bool | None = None,
+    timeout: int | None = None,
     **kwargs: Any,
 ) -> str:
     """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
@@ -172,9 +175,9 @@ async def openai_complete_if_cache(
            - openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
              These will be passed to the client constructor but will be overridden by
              explicit parameters (api_key, base_url).
-           - hashing_kv: Will be removed from kwargs before passing to OpenAI.
            - keyword_extraction: Will be removed from kwargs before passing to OpenAI.
            - stream: Whether to stream the response. Default is False.
+           - timeout: Request timeout in seconds. Default is None.

     Returns:
         The completed text (with integrated COT content if available) or an async iterator
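On the OpenAI side no unit conversion is needed: the AsyncOpenAI client takes its timeout in seconds, which is presumably why only the Gemini path multiplies by 1000. For example (placeholder key):

```python
from openai import AsyncOpenAI

# AsyncOpenAI accepts a per-client timeout in seconds (float or httpx.Timeout),
# so the seconds value from the server config can be passed through unchanged.
client = AsyncOpenAI(api_key="sk-placeholder", timeout=180.0)
```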