feat: enhance GeminiClient with max tokens management (#712)

* feat: enhance GeminiClient with max tokens management

- Introduced a mapping for maximum output tokens for various Gemini models.
- Added methods to resolve max tokens based on precedence rules, allowing for more flexible token management.
- Updated tests to verify max tokens behavior, ensuring explicit parameters take precedence and fallback mechanisms work correctly.

This change improves how token limits are handled across models and makes the client's output limits configurable per model.

* refactor: streamline max tokens retrieval in GeminiClient

- Removed the fallback to DEFAULT_MAX_TOKENS in favor of directly using model-specific maximum tokens.
- Simplified the logic for determining max tokens, enhancing code clarity and maintainability.

This change simplifies token management within the GeminiClient.
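
A minimal, self-contained sketch of the precedence rules described above (names mirror the diff below; this is an illustration, not the shipped implementation):

# Illustration only: mirrors the precedence implemented by _resolve_max_tokens in the diff below.
GEMINI_MODEL_MAX_TOKENS = {'gemini-2.5-flash': 65536, 'gemini-2.5-flash-lite': 64000}
DEFAULT_GEMINI_MAX_TOKENS = 8192

def resolve_max_tokens(requested: int | None, instance_max: int | None, model: str) -> int:
    if requested is not None:      # 1. explicit per-call parameter wins
        return requested
    if instance_max is not None:   # 2. then the value set at client construction
        return instance_max
    # 3./4. then the model-specific limit, else the library-wide default
    return GEMINI_MODEL_MAX_TOKENS.get(model, DEFAULT_GEMINI_MAX_TOKENS)

assert resolve_max_tokens(500, 2000, 'gemini-2.5-flash') == 500     # explicit parameter
assert resolve_max_tokens(None, 2000, 'gemini-2.5-flash') == 2000   # instance default
assert resolve_max_tokens(None, None, 'gemini-2.5-flash') == 65536  # model mapping
assert resolve_max_tokens(None, None, 'unknown-model') == 8192      # final fallback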
Daniel Chalef 2025-07-13 14:37:55 -07:00 committed by GitHub
parent e16740be9d
commit 4481702c9f
2 changed files with 139 additions and 11 deletions

@@ -24,7 +24,7 @@ from pydantic import BaseModel
 from ..prompts.models import Message
 from .client import MULTILINGUAL_EXTRACTION_RESPONSES, LLMClient
-from .config import DEFAULT_MAX_TOKENS, LLMConfig, ModelSize
+from .config import LLMConfig, ModelSize
 from .errors import RateLimitError
 
 if TYPE_CHECKING:
@@ -47,6 +47,25 @@ logger = logging.getLogger(__name__)
 DEFAULT_MODEL = 'gemini-2.5-flash'
 DEFAULT_SMALL_MODEL = 'gemini-2.5-flash-lite-preview-06-17'
 
+# Maximum output tokens for different Gemini models
+GEMINI_MODEL_MAX_TOKENS = {
+    # Gemini 2.5 models
+    'gemini-2.5-pro': 65536,
+    'gemini-2.5-flash': 65536,
+    'gemini-2.5-flash-lite': 64000,
+    'models/gemini-2.5-flash-lite-preview-06-17': 64000,
+    # Gemini 2.0 models
+    'gemini-2.0-flash': 8192,
+    'gemini-2.0-flash-lite': 8192,
+    # Gemini 1.5 models
+    'gemini-1.5-pro': 8192,
+    'gemini-1.5-flash': 8192,
+    'gemini-1.5-flash-8b': 8192,
+}
+
+# Default max tokens for models not in the mapping
+DEFAULT_GEMINI_MAX_TOKENS = 8192
+
 
 class GeminiClient(LLMClient):
     """
@@ -75,7 +94,7 @@
         self,
         config: LLMConfig | None = None,
         cache: bool = False,
-        max_tokens: int = DEFAULT_MAX_TOKENS,
+        max_tokens: int | None = None,
         thinking_config: types.ThinkingConfig | None = None,
         client: 'genai.Client | None' = None,
     ):
@@ -147,6 +166,38 @@
         else:
             return self.model or DEFAULT_MODEL
 
+    def _get_max_tokens_for_model(self, model: str) -> int:
+        """Get the maximum output tokens for a specific Gemini model."""
+        return GEMINI_MODEL_MAX_TOKENS.get(model, DEFAULT_GEMINI_MAX_TOKENS)
+
+    def _resolve_max_tokens(self, requested_max_tokens: int | None, model: str) -> int:
+        """
+        Resolve the maximum output tokens to use based on precedence rules.
+
+        Precedence order (highest to lowest):
+        1. Explicit max_tokens parameter passed to generate_response()
+        2. Instance max_tokens set during client initialization
+        3. Model-specific maximum tokens from the GEMINI_MODEL_MAX_TOKENS mapping
+        4. DEFAULT_GEMINI_MAX_TOKENS as the final fallback
+
+        Args:
+            requested_max_tokens: The max_tokens parameter passed to generate_response()
+            model: The model name to look up model-specific limits
+
+        Returns:
+            int: The resolved maximum tokens to use
+        """
+        # 1. Use explicit parameter if provided
+        if requested_max_tokens is not None:
+            return requested_max_tokens
+
+        # 2. Use instance max_tokens if set during initialization
+        if self.max_tokens is not None:
+            return self.max_tokens
+
+        # 3. Use the model-specific maximum, falling back to DEFAULT_GEMINI_MAX_TOKENS
+        return self._get_max_tokens_for_model(model)
+
     def salvage_json(self, raw_output: str) -> dict[str, typing.Any] | None:
         """
         Attempt to salvage a JSON object if the raw output is truncated.
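
For context on the salvage step mentioned above, here is a hedged sketch of one way truncated JSON can be salvaged (retrying the parse on progressively shorter prefixes that end at a closing brace); this is illustrative only and not necessarily how GeminiClient.salvage_json is implemented:

import json

def salvage_truncated_json(raw: str) -> dict | None:
    # Retry json.loads on progressively shorter prefixes ending at a '}'.
    end = raw.rfind('}')
    while end != -1:
        try:
            parsed = json.loads(raw[: end + 1])
            return parsed if isinstance(parsed, dict) else None
        except json.JSONDecodeError:
            end = raw.rfind('}', 0, end)
    return None

# Trailing junk after a complete object is dropped.
assert salvage_truncated_json('{"name": "alice"} extra tokens') == {'name': 'alice'}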
@@ -184,7 +235,7 @@
         self,
         messages: list[Message],
         response_model: type[BaseModel] | None = None,
-        max_tokens: int = DEFAULT_MAX_TOKENS,
+        max_tokens: int | None = None,
         model_size: ModelSize = ModelSize.medium,
     ) -> dict[str, typing.Any]:
         """
@@ -193,7 +244,7 @@
         Args:
             messages (list[Message]): A list of messages to send to the language model.
             response_model (type[BaseModel] | None): An optional Pydantic model to parse the response into.
-            max_tokens (int): The maximum number of tokens to generate in the response.
+            max_tokens (int | None): The maximum number of tokens to generate in the response. If None, the limit is resolved via the precedence rules in _resolve_max_tokens().
             model_size (ModelSize): The size of the model to use (small or medium).
 
         Returns:
@@ -233,10 +284,13 @@
         # Get the appropriate model for the requested size
         model = self._get_model_for_size(model_size)
 
+        # Resolve max_tokens using precedence rules (see _resolve_max_tokens for details)
+        resolved_max_tokens = self._resolve_max_tokens(max_tokens, model)
+
         # Create generation config
         generation_config = types.GenerateContentConfig(
             temperature=self.temperature,
-            max_output_tokens=max_tokens or self.max_tokens,
+            max_output_tokens=resolved_max_tokens,
             response_mime_type='application/json' if response_model else None,
             response_schema=response_model if response_model else None,
             system_instruction=system_prompt,
@@ -315,9 +369,6 @@
         Returns:
             dict[str, typing.Any]: The response from the language model.
         """
-        if max_tokens is None:
-            max_tokens = self.max_tokens
-
         retry_count = 0
         last_error = None
         last_output = None
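
Assuming the repository's existing module layout (the import paths below are an assumption, not taken from this diff), a caller can now omit max_tokens entirely and rely on the model-specific limits:

# Hypothetical caller-side sketch; constructor arguments match the diff above,
# but the import paths are assumed.
from graphiti_core.llm_client.config import LLMConfig
from graphiti_core.llm_client.gemini_client import GeminiClient

config = LLMConfig(api_key='your-api-key', model='gemini-2.5-flash')

client = GeminiClient(config=config)                    # no max_tokens: model mapping gives 65536
capped = GeminiClient(config=config, max_tokens=2000)   # instance default gives 2000
# An explicit generate_response(..., max_tokens=500) call still overrides both.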

@@ -369,7 +369,7 @@ class TestGeminiClientGenerateResponse:
     @pytest.mark.asyncio
     async def test_custom_max_tokens(self, gemini_client, mock_gemini_client):
-        """Test response generation with custom max tokens."""
+        """Test that explicit max_tokens parameter takes precedence over all other values."""
         # Setup mock response
         mock_response = MagicMock()
         mock_response.text = 'Test response'
@@ -377,15 +377,54 @@ class TestGeminiClientGenerateResponse:
         mock_response.prompt_feedback = None
         mock_gemini_client.aio.models.generate_content.return_value = mock_response
 
-        # Call method with custom max tokens
+        # Call method with custom max tokens (should take precedence)
         messages = [Message(role='user', content='Test message')]
         await gemini_client.generate_response(messages, max_tokens=500)
 
-        # Verify max tokens is passed in config
+        # Verify explicit max_tokens parameter takes precedence
         call_args = mock_gemini_client.aio.models.generate_content.call_args
         config = call_args[1]['config']
+        # Explicit parameter should override everything else
         assert config.max_output_tokens == 500
 
+    @pytest.mark.asyncio
+    async def test_max_tokens_precedence_fallback(self, mock_gemini_client):
+        """Test max_tokens precedence when no explicit parameter is provided."""
+        # Setup mock response
+        mock_response = MagicMock()
+        mock_response.text = 'Test response'
+        mock_response.candidates = []
+        mock_response.prompt_feedback = None
+        mock_gemini_client.aio.models.generate_content.return_value = mock_response
+
+        # Test case 1: No explicit max_tokens, has instance max_tokens
+        config = LLMConfig(
+            api_key='test_api_key', model='test-model', temperature=0.5, max_tokens=1000
+        )
+        client = GeminiClient(
+            config=config, cache=False, max_tokens=2000, client=mock_gemini_client
+        )
+
+        messages = [Message(role='user', content='Test message')]
+        await client.generate_response(messages)
+
+        call_args = mock_gemini_client.aio.models.generate_content.call_args
+        config = call_args[1]['config']
+        # Instance max_tokens should be used
+        assert config.max_output_tokens == 2000
+
+        # Test case 2: No explicit max_tokens, no instance max_tokens, uses model mapping
+        config = LLMConfig(api_key='test_api_key', model='gemini-2.5-flash', temperature=0.5)
+        client = GeminiClient(config=config, cache=False, client=mock_gemini_client)
+
+        messages = [Message(role='user', content='Test message')]
+        await client.generate_response(messages)
+
+        call_args = mock_gemini_client.aio.models.generate_content.call_args
+        config = call_args[1]['config']
+        # Model mapping should be used
+        assert config.max_output_tokens == 65536
+
     @pytest.mark.asyncio
     async def test_model_size_selection(self, gemini_client, mock_gemini_client):
         """Test that the correct model is selected based on model size."""
@@ -404,6 +443,44 @@
         call_args = mock_gemini_client.aio.models.generate_content.call_args
         assert call_args[1]['model'] == DEFAULT_SMALL_MODEL
 
+    @pytest.mark.asyncio
+    async def test_gemini_model_max_tokens_mapping(self, mock_gemini_client):
+        """Test that different Gemini models use their correct max tokens."""
+        # Setup mock response
+        mock_response = MagicMock()
+        mock_response.text = 'Test response'
+        mock_response.candidates = []
+        mock_response.prompt_feedback = None
+        mock_gemini_client.aio.models.generate_content.return_value = mock_response
+
+        # Test data: (model_name, expected_max_tokens)
+        test_cases = [
+            ('gemini-2.5-flash', 65536),
+            ('gemini-2.5-pro', 65536),
+            ('gemini-2.5-flash-lite', 64000),
+            ('models/gemini-2.5-flash-lite-preview-06-17', 64000),
+            ('gemini-2.0-flash', 8192),
+            ('gemini-1.5-pro', 8192),
+            ('gemini-1.5-flash', 8192),
+            ('unknown-model', 8192),  # Fallback case
+        ]
+
+        for model_name, expected_max_tokens in test_cases:
+            # Create client with specific model, no explicit max_tokens to test mapping
+            config = LLMConfig(api_key='test_api_key', model=model_name, temperature=0.5)
+            client = GeminiClient(config=config, cache=False, client=mock_gemini_client)
+
+            # Call method without explicit max_tokens to test model mapping fallback
+            messages = [Message(role='user', content='Test message')]
+            await client.generate_response(messages)
+
+            # Verify correct max tokens is used from model mapping
+            call_args = mock_gemini_client.aio.models.generate_content.call_args
+            config = call_args[1]['config']
+            assert config.max_output_tokens == expected_max_tokens, (
+                f'Model {model_name} should use {expected_max_tokens} tokens'
+            )
+
 
 if __name__ == '__main__':
     pytest.main(['-v', 'test_gemini_client.py'])