Add dynamic max_tokens configuration for Anthropic models (#1043)

* Add dynamic max_tokens configuration for Anthropic models

Implements model-specific max output token limits for AnthropicClient,
following the same pattern as GeminiClient. This replaces the previous
hardcoded min() cap that was preventing models from using their full
output capacity.

Changes:
- Added ANTHROPIC_MODEL_MAX_TOKENS mapping with limits for all supported
  Claude models (ranging from 4K to 64K tokens)
- Implemented _get_max_tokens_for_model() to look up model-specific limits
- Implemented _resolve_max_tokens() with clear precedence rules:
  1. Explicit max_tokens parameter
  2. Instance max_tokens from initialization
  3. Model-specific limit from mapping
  4. Default fallback (8192 tokens)

This allows edge_operations.py to request 16384 tokens for edge extraction
without being artificially capped, while ensuring cheaper models with lower
limits are still properly handled.
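
For reference, a minimal self-contained sketch of the resolution order (resolve_max_tokens and instance_max are illustrative stand-ins; the real implementation is the _resolve_max_tokens method in the diff below, and the mapping values shown are a subset):

    ANTHROPIC_MODEL_MAX_TOKENS = {
        'claude-3-7-sonnet-latest': 65536,
        'claude-3-haiku-20240307': 4096,
    }
    DEFAULT_ANTHROPIC_MAX_TOKENS = 8192

    def resolve_max_tokens(requested: int | None, instance_max: int | None, model: str) -> int:
        if requested is not None:     # 1. explicit parameter wins
            return requested
        if instance_max is not None:  # 2. then the instance setting
            return instance_max
        # 3./4. then the model-specific limit, else the default fallback
        return ANTHROPIC_MODEL_MAX_TOKENS.get(model, DEFAULT_ANTHROPIC_MAX_TOKENS)

    assert resolve_max_tokens(16384, None, 'claude-3-7-sonnet-latest') == 16384
    assert resolve_max_tokens(None, 2048, 'claude-3-7-sonnet-latest') == 2048
    assert resolve_max_tokens(None, None, 'claude-3-haiku-20240307') == 4096
    assert resolve_max_tokens(None, None, 'unlisted-model') == 8192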

Resolves TODO in anthropic_client.py:207-208.

* Clarify that max_tokens mapping represents standard limits

Updated comments to explicitly state that ANTHROPIC_MODEL_MAX_TOKENS
represents standard limits without beta headers. This prevents confusion
about extended limits (e.g., Claude 3.7's 128K with a beta header), which
are not currently implemented in this mapping.
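
For context, the extended limits are opt-in at the API level. A sketch of what requesting the 128K beta limit could look like with the anthropic Python SDK (not part of this change; the extra_headers usage is an assumption about the SDK, and the header value comes from the code comment below):

    import anthropic

    client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment

    # Hypothetical opt-in to the extended 128K output limit for Claude 3.7.
    # This path is NOT reflected in ANTHROPIC_MODEL_MAX_TOKENS by this commit.
    message = client.messages.create(
        model='claude-3-7-sonnet-20250219',
        max_tokens=128000,
        messages=[{'role': 'user', 'content': 'Write a very long report.'}],
        extra_headers={'anthropic-beta': 'output-128k-2025-02-19'},
    )
    print(message.content[0].text)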
Matthew Mo authored 2025-11-14 08:34:56 -08:00, committed by GitHub
commit 50bcb74502 (parent 55ef6acb16)


@@ -64,6 +64,34 @@ AnthropicModel = Literal[
 DEFAULT_MODEL: AnthropicModel = 'claude-3-7-sonnet-latest'
 
+# Maximum output tokens for different Anthropic models
+# Based on official Anthropic documentation (as of 2025)
+# Note: These represent standard limits without beta headers.
+# Some models support higher limits with additional configuration (e.g., Claude 3.7 supports
+# 128K with 'anthropic-beta: output-128k-2025-02-19' header, but this is not currently implemented).
+ANTHROPIC_MODEL_MAX_TOKENS = {
+    # Claude 3.7 models - standard 64K tokens
+    'claude-3-7-sonnet-latest': 65536,
+    'claude-3-7-sonnet-20250219': 65536,
+    # Claude 3.5 models
+    'claude-3-5-haiku-latest': 8192,
+    'claude-3-5-haiku-20241022': 8192,
+    'claude-3-5-sonnet-latest': 8192,
+    'claude-3-5-sonnet-20241022': 8192,
+    'claude-3-5-sonnet-20240620': 8192,
+    # Claude 3 models - 4K tokens
+    'claude-3-opus-latest': 4096,
+    'claude-3-opus-20240229': 4096,
+    'claude-3-sonnet-20240229': 4096,
+    'claude-3-haiku-20240307': 4096,
+    # Claude 2 models - 4K tokens
+    'claude-2.1': 4096,
+    'claude-2.0': 4096,
+}
+
+# Default max tokens for models not in the mapping
+DEFAULT_ANTHROPIC_MAX_TOKENS = 8192
 
 
 class AnthropicClient(LLMClient):
     """
@@ -177,6 +205,45 @@ class AnthropicClient(LLMClient):
         tool_choice_cast = typing.cast(ToolChoiceParam, tool_choice)
         return tool_list_cast, tool_choice_cast
 
+    def _get_max_tokens_for_model(self, model: str) -> int:
+        """Get the maximum output tokens for a specific Anthropic model.
+
+        Args:
+            model: The model name to look up
+
+        Returns:
+            int: The maximum output tokens for the model
+        """
+        return ANTHROPIC_MODEL_MAX_TOKENS.get(model, DEFAULT_ANTHROPIC_MAX_TOKENS)
+
+    def _resolve_max_tokens(self, requested_max_tokens: int | None, model: str) -> int:
+        """
+        Resolve the maximum output tokens to use based on precedence rules.
+
+        Precedence order (highest to lowest):
+        1. Explicit max_tokens parameter passed to generate_response()
+        2. Instance max_tokens set during client initialization
+        3. Model-specific maximum tokens from ANTHROPIC_MODEL_MAX_TOKENS mapping
+        4. DEFAULT_ANTHROPIC_MAX_TOKENS as final fallback
+
+        Args:
+            requested_max_tokens: The max_tokens parameter passed to generate_response()
+            model: The model name to look up model-specific limits
+
+        Returns:
+            int: The resolved maximum tokens to use
+        """
+        # 1. Use explicit parameter if provided
+        if requested_max_tokens is not None:
+            return requested_max_tokens
+
+        # 2. Use instance max_tokens if set during initialization
+        if self.max_tokens is not None:
+            return self.max_tokens
+
+        # 3. Use model-specific maximum or return DEFAULT_ANTHROPIC_MAX_TOKENS
+        return self._get_max_tokens_for_model(model)
+
     async def _generate_response(
         self,
         messages: list[Message],
@@ -204,12 +271,9 @@ class AnthropicClient(LLMClient):
         user_messages = [{'role': m.role, 'content': m.content} for m in messages[1:]]
         user_messages_cast = typing.cast(list[MessageParam], user_messages)
 
-        # TODO: Replace hacky min finding solution after fixing hardcoded EXTRACT_EDGES_MAX_TOKENS = 16384 in
-        # edge_operations.py. Throws errors with cheaper models that lower max_tokens.
-        max_creation_tokens: int = min(
-            max_tokens if max_tokens is not None else self.config.max_tokens,
-            DEFAULT_MAX_TOKENS,
-        )
+        # Resolve max_tokens dynamically based on the model's capabilities
+        # This allows different models to use their full output capacity
+        max_creation_tokens: int = self._resolve_max_tokens(max_tokens, self.model)
 
         try:
             # Create the appropriate tool based on whether response_model is provided
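
To make the behavioral change concrete, a small before/after sketch of the capping logic replaced above (OLD_DEFAULT_MAX_TOKENS stands in for the module's previous DEFAULT_MAX_TOKENS constant, whose actual value is not shown in this diff):

    OLD_DEFAULT_MAX_TOKENS = 8192  # assumed value, for illustration only

    # Old behavior: every request was clamped by min(), so edge extraction's
    # explicit 16384 was silently reduced.
    old = min(16384, OLD_DEFAULT_MAX_TOKENS)  # -> 8192

    # New behavior: an explicit request passes through _resolve_max_tokens
    # unchanged; only unspecified requests fall back to per-model limits.
    new = 16384  # used as-is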