diff --git a/env.example b/env.example
index ce1eab0f..1e581751 100644
--- a/env.example
+++ b/env.example
@@ -194,9 +194,10 @@ LLM_BINDING_API_KEY=your_api_key
 ### Gemini example
 # LLM_BINDING=gemini
 # LLM_MODEL=gemini-flash-latest
-# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
 # LLM_BINDING_API_KEY=your_gemini_api_key
-# GEMINI_LLM_MAX_OUTPUT_TOKENS=8192
+# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
+# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
+# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
 # GEMINI_LLM_TEMPERATURE=0.7
 
 ### OpenAI Compatible API Specific Parameters
diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 70e17bb6..c9bb1a44 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -512,7 +512,9 @@ def create_app(args):
 
         return optimized_azure_openai_model_complete
 
-    def create_optimized_gemini_llm_func(config_cache: LLMConfigCache, args):
+    def create_optimized_gemini_llm_func(
+        config_cache: LLMConfigCache, args, llm_timeout: int
+    ):
         """Create optimized Gemini LLM function with cached configuration"""
 
         async def optimized_gemini_model_complete(
@@ -527,6 +529,8 @@ def create_app(args):
             if history_messages is None:
                 history_messages = []
 
+            # Use pre-processed configuration to avoid repeated parsing
+            kwargs["timeout"] = llm_timeout
             if (
                 config_cache.gemini_llm_options is not None
                 and "generation_config" not in kwargs
@@ -568,7 +572,7 @@ def create_app(args):
                 config_cache, args, llm_timeout
             )
         elif binding == "gemini":
-            return create_optimized_gemini_llm_func(config_cache, args)
+            return create_optimized_gemini_llm_func(config_cache, args, llm_timeout)
         else:  # openai and compatible
             # Use optimized function with pre-processed configuration
             return create_optimized_openai_llm_func(config_cache, args, llm_timeout)
diff --git a/lightrag/llm/gemini.py b/lightrag/llm/gemini.py
index 4cec3e71..f06ec6b3 100644
--- a/lightrag/llm/gemini.py
+++ b/lightrag/llm/gemini.py
@@ -33,24 +33,33 @@ LOG = logging.getLogger(__name__)
 
 
 @lru_cache(maxsize=8)
-def _get_gemini_client(api_key: str, base_url: str | None) -> genai.Client:
+def _get_gemini_client(
+    api_key: str, base_url: str | None, timeout: int | None = None
+) -> genai.Client:
     """
     Create (or fetch cached) Gemini client.
 
     Args:
         api_key: Google Gemini API key.
         base_url: Optional custom API endpoint.
+        timeout: Optional request timeout in milliseconds.
 
     Returns:
         genai.Client: Configured Gemini client instance.
     """
     client_kwargs: dict[str, Any] = {"api_key": api_key}
-    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT or timeout is not None:
         try:
-            client_kwargs["http_options"] = types.HttpOptions(api_endpoint=base_url)
+            http_options_kwargs = {}
+            if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
+                http_options_kwargs["api_endpoint"] = base_url
+            if timeout is not None:
+                http_options_kwargs["timeout"] = timeout
+
+            client_kwargs["http_options"] = types.HttpOptions(**http_options_kwargs)
         except Exception as exc:  # pragma: no cover - defensive
-            LOG.warning("Failed to apply custom Gemini endpoint %s: %s", base_url, exc)
+            LOG.warning("Failed to apply custom Gemini http_options: %s", exc)
 
     try:
         return genai.Client(**client_kwargs)
@@ -166,6 +175,7 @@ async def gemini_complete_if_cache(
     stream: bool | None = None,
     keyword_extraction: bool = False,
     generation_config: dict[str, Any] | None = None,
+    timeout: int | None = None,
     **_: Any,
 ) -> str | AsyncIterator[str]:
     """
@@ -190,10 +200,10 @@ async def gemini_complete_if_cache(
         generation_config: Optional generation configuration dict.
         keyword_extraction: Whether to use JSON response format.
        token_tracker: Optional token usage tracker for monitoring API usage.
-        hashing_kv: Storage interface (for interface parity with other bindings).
         stream: Whether to stream the response.
+        hashing_kv: Storage interface (for interface parity with other bindings).
         enable_cot: Whether to include Chain of Thought content in the response.
-        timeout: Request timeout (handled by caller if needed).
+        timeout: Request timeout in seconds (will be converted to milliseconds for Gemini API).
         **_: Additional keyword arguments (ignored).
 
     Returns:
@@ -207,7 +217,9 @@ async def gemini_complete_if_cache(
     loop = asyncio.get_running_loop()
 
     key = _ensure_api_key(api_key)
-    client = _get_gemini_client(key, base_url)
+    # Convert timeout from seconds to milliseconds for Gemini API
+    timeout_ms = timeout * 1000 if timeout else None
+    client = _get_gemini_client(key, base_url, timeout_ms)
 
     history_block = _format_history_messages(history_messages)
     prompt_sections = []
@@ -279,7 +291,9 @@ async def gemini_complete_if_cache(
 
                     # Send thought content if COT is active
                     if cot_active:
-                        loop.call_soon_threadsafe(queue.put_nowait, thought_text)
+                        loop.call_soon_threadsafe(
+                            queue.put_nowait, thought_text
+                        )
                     else:
                         # COT disabled - only send regular content
                         if regular_text:
diff --git a/lightrag/llm/openai.py b/lightrag/llm/openai.py
index 2cdbb72b..511a3a62 100644
--- a/lightrag/llm/openai.py
+++ b/lightrag/llm/openai.py
@@ -138,6 +138,9 @@ async def openai_complete_if_cache(
     base_url: str | None = None,
     api_key: str | None = None,
     token_tracker: Any | None = None,
+    keyword_extraction: bool = False,  # Will be removed from kwargs before passing to OpenAI
+    stream: bool | None = None,
+    timeout: int | None = None,
     **kwargs: Any,
 ) -> str:
     """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
@@ -172,9 +175,9 @@ async def openai_complete_if_cache(
             - openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
               These will be passed to the client constructor but will be overridden by
               explicit parameters (api_key, base_url).
-            - hashing_kv: Will be removed from kwargs before passing to OpenAI.
             - keyword_extraction: Will be removed from kwargs before passing to OpenAI.
-            - stream: Whether to stream the response. Default is False.
+            - timeout: Request timeout in seconds.
+              Default is None.
 
     Returns:
         The completed text (with integrated COT content if available) or an async iterator
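
The new `GEMINI_LLM_THINKING_CONFIG` entry in env.example holds a JSON object. Below is a hedged sketch of how such a value could map onto the google-genai thinking options; the parsing shown is illustrative only and not necessarily how LightRAG's config layer implements it:

```python
# Illustrative only: turn a GEMINI_LLM_THINKING_CONFIG-style JSON value into
# google-genai thinking options. {"thinking_budget": 0} disables thinking on
# models that support it; include_thoughts controls thought summaries.
import json
import os

from google.genai import types

raw = os.environ.get(
    "GEMINI_LLM_THINKING_CONFIG",
    '{"thinking_budget": 0, "include_thoughts": false}',
)
opts = json.loads(raw)

generation_config = types.GenerateContentConfig(
    thinking_config=types.ThinkingConfig(
        thinking_budget=opts.get("thinking_budget"),
        include_thoughts=opts.get("include_thoughts"),
    )
)
```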
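A note on the unit conversion in the gemini.py hunk: LightRAG passes `timeout` in seconds, while `types.HttpOptions.timeout` in the google-genai SDK expects milliseconds, hence the `timeout * 1000` step. A minimal sketch of that flow; `make_gemini_client` is a hypothetical helper, not code from this PR (the real entry point is `_get_gemini_client()`):

```python
# Sketch only: mirrors the seconds -> milliseconds conversion this diff adds.
from google import genai
from google.genai import types


def make_gemini_client(api_key: str, timeout_s: int | None = None) -> genai.Client:
    # LightRAG convention: timeout arrives in seconds (e.g. the server's llm_timeout).
    # google-genai convention: HttpOptions.timeout is in milliseconds.
    timeout_ms = timeout_s * 1000 if timeout_s else None
    http_options = (
        types.HttpOptions(timeout=timeout_ms) if timeout_ms is not None else None
    )
    return genai.Client(api_key=api_key, http_options=http_options)
```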
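One behavioral consequence of threading `timeout` through the `@lru_cache`-decorated factory: the timeout becomes part of the cache key, so calls with different timeouts produce distinct client instances. A small self-contained demonstration of that caching behavior (generic Python, not LightRAG code):

```python
from functools import lru_cache


@lru_cache(maxsize=8)
def get_client(api_key: str, base_url: str | None, timeout: int | None = None) -> object:
    # Stand-in for genai.Client construction; one object per unique argument tuple.
    return object()


a = get_client("key", None, 5000)
b = get_client("key", None, 5000)   # cache hit: same instance as a
c = get_client("key", None, 10000)  # different timeout -> new instance
assert a is b and a is not c
```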