Mirror of https://github.com/OpenHands/OpenHands.git (synced 2025-12-26 05:48:36 +08:00)
perf(gemini): Apply Gemini 2.5 Pro performance optimizations from PR 9913 (#9925)
Co-authored-by: OpenHands-Claude <openhands@all-hands.dev>
Parent: 03c8312f5f
Commit: a32a623078
@@ -43,7 +43,7 @@ class LLMConfig(BaseModel):
log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
custom_tokenizer: A custom tokenizer to use for token counting.
native_tool_calling: Whether to use native tool calling if supported by the model. Can be True, False, or not set.
reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Exclusive for o1 models.
reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Can apply to all reasoning models.
seed: The seed to use for the LLM.
safety_settings: Safety settings for models that support them (like Mistral AI and Gemini).
"""
@@ -85,7 +85,7 @@ class LLMConfig(BaseModel):
log_completions_folder: str = Field(default=os.path.join(LOG_DIR, 'completions'))
custom_tokenizer: str | None = Field(default=None)
native_tool_calling: bool | None = Field(default=None)
reasoning_effort: str | None = Field(default='high')
reasoning_effort: str | None = Field(default=None)
seed: int | None = Field(default=None)
safety_settings: list[dict[str, str]] | None = Field(
    default=None,
@@ -171,6 +171,14 @@ class LLMConfig(BaseModel):
if self.openrouter_app_name:
    os.environ['OR_APP_NAME'] = self.openrouter_app_name

# Set reasoning_effort to 'high' by default for non-Gemini models
# Gemini models use optimized thinking budget when reasoning_effort is None
logger.debug(
    f'Setting reasoning_effort for model {self.model} with reasoning_effort {self.reasoning_effort}'
)
if self.reasoning_effort is None and 'gemini-2.5-pro' not in self.model:
    self.reasoning_effort = 'high'

# Set an API version by default for Azure models
# Required for newer models.
# Azure issue: https://github.com/All-Hands-AI/OpenHands/issues/7755
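
A minimal sketch of the default-resolution rule this hunk introduces (resolve_reasoning_effort is a hypothetical helper for illustration only, not code from this diff):

# Hypothetical helper mirroring the post-init rule above: gemini-2.5-pro keeps
# reasoning_effort=None so the LLM layer can substitute an optimized thinking
# budget; other models fall back to 'high' when nothing was set explicitly.
def resolve_reasoning_effort(model: str, reasoning_effort: str | None) -> str | None:
    if reasoning_effort is None and 'gemini-2.5-pro' not in model:
        return 'high'
    return reasoning_effort


assert resolve_reasoning_effort('gemini-2.5-pro', None) is None
assert resolve_reasoning_effort('gpt-4o', None) == 'high'
assert resolve_reasoning_effort('gemini-2.5-pro', 'medium') == 'medium'
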
@@ -194,7 +194,24 @@ class LLM(RetryMixin, DebugMixin):
    self.config.model.lower() in REASONING_EFFORT_SUPPORTED_MODELS
    or self.config.model.split('/')[-1] in REASONING_EFFORT_SUPPORTED_MODELS
):
    kwargs['reasoning_effort'] = self.config.reasoning_effort
    # For Gemini models, only map 'low' to optimized thinking budget
    # Let other reasoning_effort values pass through to API as-is
    if 'gemini-2.5-pro' in self.config.model:
        logger.debug(
            f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort}'
        )
        if self.config.reasoning_effort in {None, 'low', 'none'}:
            kwargs['thinking'] = {'budget_tokens': 128}
            kwargs['allowed_openai_params'] = ['thinking']
            kwargs.pop('reasoning_effort', None)
        else:
            kwargs['reasoning_effort'] = self.config.reasoning_effort
        logger.debug(
            f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort} mapped to thinking {kwargs.get("thinking")}'
        )

    else:
        kwargs['reasoning_effort'] = self.config.reasoning_effort
    kwargs.pop(
        'temperature'
    ) # temperature is not supported for reasoning models
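
A minimal sketch of how the branch above shapes the kwargs handed to litellm (gemini_thinking_kwargs is a hypothetical helper for illustration only, not code from this diff):

# Hypothetical helper approximating the mapping above (not repo code).
def gemini_thinking_kwargs(model: str, reasoning_effort: str | None) -> dict:
    kwargs: dict = {'reasoning_effort': reasoning_effort}
    if 'gemini-2.5-pro' in model and reasoning_effort in {None, 'low', 'none'}:
        # Low or absent effort collapses to a small fixed thinking budget.
        kwargs.pop('reasoning_effort', None)
        kwargs['thinking'] = {'budget_tokens': 128}
        kwargs['allowed_openai_params'] = ['thinking']
    return kwargs


assert gemini_thinking_kwargs('gemini-2.5-pro', 'low') == {
    'thinking': {'budget_tokens': 128},
    'allowed_openai_params': ['thinking'],
}
assert gemini_thinking_kwargs('gemini-2.5-pro', 'medium') == {'reasoning_effort': 'medium'}
assert gemini_thinking_kwargs('o1', 'high') == {'reasoning_effort': 'high'}
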
@@ -242,9 +242,6 @@ class ConversationMemory:

# Add the LLM message (assistant) that initiated the tool calls
# (overwrites any previous message with the same response_id)
logger.debug(
    f'Tool calls type: {type(assistant_msg.tool_calls)}, value: {assistant_msg.tool_calls}'
)
pending_tool_call_action_messages[llm_response.id] = Message(
    role=getattr(assistant_msg, 'role', 'assistant'),
    # tool call content SHOULD BE a string
@@ -376,7 +376,6 @@ class CLIRuntime(Runtime):
if ready_to_read:
    line = process.stdout.readline()
    if line:
        logger.debug(f'LINE: {line}')
        output_lines.append(line)
        if self._shell_stream_callback:
            self._shell_stream_callback(line)
@@ -387,7 +386,6 @@ class CLIRuntime(Runtime):
while line:
    line = process.stdout.readline()
    if line:
        logger.debug(f'LINE: {line}')
        output_lines.append(line)
        if self._shell_stream_callback:
            self._shell_stream_callback(line)
@@ -733,34 +733,31 @@ def test_completion_with_litellm_mock(mock_litellm_completion, default_config):


@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_two_positional_args(mock_litellm_completion, default_config):
    mock_response = {
        'choices': [{'message': {'content': 'Response to positional args.'}}]
def test_llm_gemini_thinking_parameter(mock_litellm_completion, default_config):
    """
    Test that the 'thinking' parameter is correctly passed to litellm_completion
    when a Gemini model is used with 'low' reasoning_effort.
    """
    # Configure for Gemini model with low reasoning effort
    gemini_config = copy.deepcopy(default_config)
    gemini_config.model = 'gemini-2.5-pro'
    gemini_config.reasoning_effort = 'low'

    # Mock the response from litellm
    mock_litellm_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}]
    }
    mock_litellm_completion.return_value = mock_response

    test_llm = LLM(config=default_config)
    response = test_llm.completion(
        'some-model-to-be-ignored',
        [{'role': 'user', 'content': 'Hello from positional args!'}],
        stream=False,
    )
    # Initialize LLM and call completion
    llm = LLM(config=gemini_config)
    llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # Assertions
    assert (
        response['choices'][0]['message']['content'] == 'Response to positional args.'
    )
    # Verify that litellm_completion was called with the 'thinking' parameter
    mock_litellm_completion.assert_called_once()

    # Check if the correct arguments were passed to litellm_completion
    call_args, call_kwargs = mock_litellm_completion.call_args
    assert (
        call_kwargs['model'] == default_config.model
    ) # Should use the model from config, not the first arg
    assert call_kwargs['messages'] == [
        {'role': 'user', 'content': 'Hello from positional args!'}
    ]
    assert not call_kwargs['stream']
    assert 'thinking' in call_kwargs
    assert call_kwargs['thinking'] == {'budget_tokens': 128}
    assert 'reasoning_effort' not in call_kwargs

    # Ensure the first positional argument (model) was ignored
    assert (
@@ -1111,3 +1108,203 @@ def test_azure_model_default_max_tokens():

    # Verify the config has the default max_output_tokens value
    assert llm.config.max_output_tokens is None # Default value


# Gemini Performance Optimization Tests


def test_gemini_model_keeps_none_reasoning_effort():
    """Test that Gemini models keep reasoning_effort=None for optimization."""
    config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
    # reasoning_effort should remain None for Gemini models
    assert config.reasoning_effort is None


def test_non_gemini_model_gets_high_reasoning_effort():
    """Test that non-Gemini models get reasoning_effort='high' by default."""
    config = LLMConfig(model='gpt-4o', api_key='test_key')
    # Non-Gemini models should get reasoning_effort='high'
    assert config.reasoning_effort == 'high'


def test_explicit_reasoning_effort_preserved():
    """Test that explicitly set reasoning_effort is preserved."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
    )
    # Explicitly set reasoning_effort should be preserved
    assert config.reasoning_effort == 'medium'


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_none_reasoning_effort_uses_thinking_budget(mock_completion):
    """Test that Gemini with reasoning_effort=None uses thinking budget."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort=None
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config)
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that thinking budget was set and reasoning_effort was None
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' in call_kwargs
    assert call_kwargs['thinking'] == {'budget_tokens': 128}
    assert call_kwargs.get('reasoning_effort') is None


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_low_reasoning_effort_uses_thinking_budget(mock_completion):
    """Test that Gemini with reasoning_effort='low' uses thinking budget."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='low'
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config)
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that thinking budget was set and reasoning_effort was None
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' in call_kwargs
    assert call_kwargs['thinking'] == {'budget_tokens': 128}
    assert call_kwargs.get('reasoning_effort') is None


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
    """Test that Gemini with reasoning_effort='medium' passes through to litellm."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config)
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that reasoning_effort was passed through and thinking budget was not set
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' not in call_kwargs
    assert call_kwargs.get('reasoning_effort') == 'medium'


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_high_reasoning_effort_passes_through(mock_completion):
    """Test that Gemini with reasoning_effort='high' passes through to litellm."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='high'
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config)
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that reasoning_effort was passed through and thinking budget was not set
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' not in call_kwargs
    assert call_kwargs.get('reasoning_effort') == 'high'


@patch('openhands.llm.llm.litellm_completion')
def test_non_gemini_uses_reasoning_effort(mock_completion):
    """Test that non-Gemini models use reasoning_effort instead of thinking budget."""
    config = LLMConfig(model='o1', api_key='test_key', reasoning_effort='high')

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config)
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that reasoning_effort was used and thinking budget was not set
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' not in call_kwargs
    assert call_kwargs.get('reasoning_effort') == 'high'


@patch('openhands.llm.llm.litellm_completion')
def test_non_reasoning_model_no_optimization(mock_completion):
    """Test that non-reasoning models don't get optimization parameters."""
    config = LLMConfig(
        model='gpt-3.5-turbo', # Not in REASONING_EFFORT_SUPPORTED_MODELS
        api_key='test_key',
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config)
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that neither thinking budget nor reasoning_effort were set
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' not in call_kwargs
    assert 'reasoning_effort' not in call_kwargs


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_performance_optimization_end_to_end(mock_completion):
    """Test the complete Gemini performance optimization flow end-to-end."""
    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Optimized response'}}],
        'usage': {'prompt_tokens': 50, 'completion_tokens': 25},
    }

    # Create Gemini configuration
    config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')

    # Verify config has optimized defaults
    assert config.reasoning_effort is None

    # Create LLM and make completion
    llm = LLM(config)
    messages = [{'role': 'user', 'content': 'Solve this complex problem'}]

    response = llm.completion(messages=messages)

    # Verify response was generated
    assert response['choices'][0]['message']['content'] == 'Optimized response'

    # Verify optimization parameters were applied
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' in call_kwargs
    assert call_kwargs['thinking'] == {'budget_tokens': 128}
    assert call_kwargs.get('reasoning_effort') is None

    # Verify temperature and top_p were removed for reasoning models
    assert 'temperature' not in call_kwargs
    assert 'top_p' not in call_kwargs
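
To exercise only the new Gemini cases locally, a pytest keyword filter along these lines should work (the tests/unit/test_llm.py path is an assumption, not taken from this diff):

pytest tests/unit/test_llm.py -k "gemini or reasoning_effort"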