Add dynamic max_tokens configuration for Anthropic models

Implements model-specific max output token limits for AnthropicClient,
following the same pattern as GeminiClient. This replaces the previous
hardcoded min() cap that was preventing models from using their full
output capacity.

Changes:
- Added ANTHROPIC_MODEL_MAX_TOKENS mapping with limits for all supported
  Claude models (ranging from 4K to 64K tokens)
- Implemented _get_max_tokens_for_model() to look up model-specific limits
- Implemented _resolve_max_tokens() with clear precedence rules:
  1. Explicit max_tokens parameter
  2. Instance max_tokens from initialization
  3. Model-specific limit from mapping
  4. Default fallback (8192 tokens)

This allows edge_operations.py to request 16384 tokens for edge extraction
without being artificially capped, while ensuring cheaper models with lower
limits are still properly handled.

Resolves TODO in anthropic_client.py:207-208.
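
For illustration only (not part of the diff below): a minimal standalone sketch of the precedence logic, where the free function resolve_max_tokens and the trimmed-down mapping are hypothetical stand-ins for the new _resolve_max_tokens() method and ANTHROPIC_MODEL_MAX_TOKENS.

    # Illustrative sketch; values mirror the mapping added in this commit.
    ANTHROPIC_MODEL_MAX_TOKENS = {
        'claude-3-7-sonnet-latest': 65536,
        'claude-3-opus-latest': 4096,
    }
    DEFAULT_ANTHROPIC_MAX_TOKENS = 8192

    def resolve_max_tokens(requested: int | None, instance_default: int | None, model: str) -> int:
        """Hypothetical free-function version of AnthropicClient._resolve_max_tokens()."""
        if requested is not None:          # 1. explicit parameter wins
            return requested
        if instance_default is not None:   # 2. then the instance-level setting
            return instance_default
        # 3. then the model-specific limit, 4. else the global default
        return ANTHROPIC_MODEL_MAX_TOKENS.get(model, DEFAULT_ANTHROPIC_MAX_TOKENS)

    assert resolve_max_tokens(16384, None, 'claude-3-7-sonnet-latest') == 16384  # edge extraction case
    assert resolve_max_tokens(None, None, 'claude-3-opus-latest') == 4096        # lower-limit model
    assert resolve_max_tokens(None, None, 'some-future-model') == 8192           # fallback
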
commit 2be658435a
parent 8d99984204
Author: supmo668
Date:   2025-11-03 17:33:32 -08:00

@@ -64,6 +64,31 @@ AnthropicModel = Literal[
 DEFAULT_MODEL: AnthropicModel = 'claude-3-7-sonnet-latest'
 
+# Maximum output tokens for different Anthropic models
+# Based on official Anthropic documentation (as of 2025)
+ANTHROPIC_MODEL_MAX_TOKENS = {
+    # Claude 3.7 models - 64K tokens (128K with beta header)
+    'claude-3-7-sonnet-latest': 65536,
+    'claude-3-7-sonnet-20250219': 65536,
+    # Claude 3.5 models
+    'claude-3-5-haiku-latest': 8192,
+    'claude-3-5-haiku-20241022': 8192,
+    'claude-3-5-sonnet-latest': 8192,
+    'claude-3-5-sonnet-20241022': 8192,
+    'claude-3-5-sonnet-20240620': 8192,
+    # Claude 3 models - 4K tokens
+    'claude-3-opus-latest': 4096,
+    'claude-3-opus-20240229': 4096,
+    'claude-3-sonnet-20240229': 4096,
+    'claude-3-haiku-20240307': 4096,
+    # Claude 2 models - 4K tokens
+    'claude-2.1': 4096,
+    'claude-2.0': 4096,
+}
+
+# Default max tokens for models not in the mapping
+DEFAULT_ANTHROPIC_MAX_TOKENS = 8192
 
 class AnthropicClient(LLMClient):
     """
@@ -177,6 +202,45 @@ class AnthropicClient(LLMClient):
         tool_choice_cast = typing.cast(ToolChoiceParam, tool_choice)
         return tool_list_cast, tool_choice_cast
 
+    def _get_max_tokens_for_model(self, model: str) -> int:
+        """Get the maximum output tokens for a specific Anthropic model.
+
+        Args:
+            model: The model name to look up
+
+        Returns:
+            int: The maximum output tokens for the model
+        """
+        return ANTHROPIC_MODEL_MAX_TOKENS.get(model, DEFAULT_ANTHROPIC_MAX_TOKENS)
+
+    def _resolve_max_tokens(self, requested_max_tokens: int | None, model: str) -> int:
+        """
+        Resolve the maximum output tokens to use based on precedence rules.
+
+        Precedence order (highest to lowest):
+        1. Explicit max_tokens parameter passed to generate_response()
+        2. Instance max_tokens set during client initialization
+        3. Model-specific maximum tokens from ANTHROPIC_MODEL_MAX_TOKENS mapping
+        4. DEFAULT_ANTHROPIC_MAX_TOKENS as final fallback
+
+        Args:
+            requested_max_tokens: The max_tokens parameter passed to generate_response()
+            model: The model name to look up model-specific limits
+
+        Returns:
+            int: The resolved maximum tokens to use
+        """
+        # 1. Use explicit parameter if provided
+        if requested_max_tokens is not None:
+            return requested_max_tokens
+
+        # 2. Use instance max_tokens if set during initialization
+        if self.max_tokens is not None:
+            return self.max_tokens
+
+        # 3. Use model-specific maximum or return DEFAULT_ANTHROPIC_MAX_TOKENS
+        return self._get_max_tokens_for_model(model)
+
     async def _generate_response(
         self,
         messages: list[Message],
@@ -204,12 +268,9 @@ class AnthropicClient(LLMClient):
         user_messages = [{'role': m.role, 'content': m.content} for m in messages[1:]]
         user_messages_cast = typing.cast(list[MessageParam], user_messages)
 
-        # TODO: Replace hacky min finding solution after fixing hardcoded EXTRACT_EDGES_MAX_TOKENS = 16384 in
-        # edge_operations.py. Throws errors with cheaper models that lower max_tokens.
-        max_creation_tokens: int = min(
-            max_tokens if max_tokens is not None else self.config.max_tokens,
-            DEFAULT_MAX_TOKENS,
-        )
+        # Resolve max_tokens dynamically based on the model's capabilities
+        # This allows different models to use their full output capacity
+        max_creation_tokens: int = self._resolve_max_tokens(max_tokens, self.model)
 
         try:
             # Create the appropriate tool based on whether response_model is provided