perf(gemini): Apply Gemini 2.5 Pro performance optimizations from PR 9913 (#9925)

Co-authored-by: OpenHands-Claude <openhands@all-hands.dev>
Engel Nyst 2025-07-30 01:28:50 +02:00 committed by GitHub
parent 03c8312f5f
commit a32a623078
5 changed files with 248 additions and 31 deletions


@@ -43,7 +43,7 @@ class LLMConfig(BaseModel):
log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
custom_tokenizer: A custom tokenizer to use for token counting.
native_tool_calling: Whether to use native tool calling if supported by the model. Can be True, False, or not set.
reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Exclusive for o1 models.
reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Can apply to all reasoning models.
seed: The seed to use for the LLM.
safety_settings: Safety settings for models that support them (like Mistral AI and Gemini).
"""
@@ -85,7 +85,7 @@ class LLMConfig(BaseModel):
log_completions_folder: str = Field(default=os.path.join(LOG_DIR, 'completions'))
custom_tokenizer: str | None = Field(default=None)
native_tool_calling: bool | None = Field(default=None)
reasoning_effort: str | None = Field(default='high')
reasoning_effort: str | None = Field(default=None)
seed: int | None = Field(default=None)
safety_settings: list[dict[str, str]] | None = Field(
default=None,
@@ -171,6 +171,14 @@ class LLMConfig(BaseModel):
if self.openrouter_app_name:
os.environ['OR_APP_NAME'] = self.openrouter_app_name
# Set reasoning_effort to 'high' by default for models other than Gemini 2.5 Pro
# Gemini 2.5 Pro uses an optimized thinking budget when reasoning_effort is None
logger.debug(
f'Setting reasoning_effort for model {self.model} with reasoning_effort {self.reasoning_effort}'
)
if self.reasoning_effort is None and 'gemini-2.5-pro' not in self.model:
self.reasoning_effort = 'high'
# Set an API version by default for Azure models
# Required for newer models.
# Azure issue: https://github.com/All-Hands-AI/OpenHands/issues/7755
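
The net effect of the config change above: reasoning_effort now defaults to None and is only promoted to 'high' in the post-init logic for models other than Gemini 2.5 Pro, so Gemini 2.5 Pro can take the thinking-budget path in the completion layer below. A minimal sketch of the resulting defaults (the import path is an assumption, not shown in this diff):

# Sketch only: the import path is assumed, not taken from this diff.
from openhands.core.config import LLMConfig

gemini_cfg = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
other_cfg = LLMConfig(model='gpt-4o', api_key='test_key')

# Gemini 2.5 Pro keeps None so the completion layer can apply its thinking budget;
# other models fall back to 'high' in the post-init hook shown above.
assert gemini_cfg.reasoning_effort is None
assert other_cfg.reasoning_effort == 'high'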


@@ -194,7 +194,24 @@ class LLM(RetryMixin, DebugMixin):
self.config.model.lower() in REASONING_EFFORT_SUPPORTED_MODELS
or self.config.model.split('/')[-1] in REASONING_EFFORT_SUPPORTED_MODELS
):
kwargs['reasoning_effort'] = self.config.reasoning_effort
# For Gemini 2.5 Pro, map None, 'low', and 'none' to an optimized thinking budget
# Let other reasoning_effort values pass through to the API as-is
if 'gemini-2.5-pro' in self.config.model:
logger.debug(
f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort}'
)
if self.config.reasoning_effort in {None, 'low', 'none'}:
kwargs['thinking'] = {'budget_tokens': 128}
kwargs['allowed_openai_params'] = ['thinking']
kwargs.pop('reasoning_effort', None)
else:
kwargs['reasoning_effort'] = self.config.reasoning_effort
logger.debug(
f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort} mapped to thinking {kwargs.get("thinking")}'
)
else:
kwargs['reasoning_effort'] = self.config.reasoning_effort
kwargs.pop(
'temperature'
) # temperature is not supported for reasoning models
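
With that default in place, the completion path above sends Gemini 2.5 Pro a fixed 128-token thinking budget (via litellm's 'thinking' and 'allowed_openai_params' kwargs) whenever reasoning_effort is None, 'low', or 'none', and forwards reasoning_effort unchanged in every other case. A standalone sketch of just that mapping (the helper name is illustrative, not from the repo):

def reasoning_kwargs(model: str, reasoning_effort: str | None) -> dict:
    # Mirror of the branch above: Gemini 2.5 Pro gets a capped thinking budget
    # for low/none/unset effort; everything else forwards reasoning_effort.
    if 'gemini-2.5-pro' in model and reasoning_effort in {None, 'low', 'none'}:
        return {
            'thinking': {'budget_tokens': 128},
            'allowed_openai_params': ['thinking'],
        }
    return {'reasoning_effort': reasoning_effort}

assert reasoning_kwargs('gemini-2.5-pro', 'low') == {
    'thinking': {'budget_tokens': 128},
    'allowed_openai_params': ['thinking'],
}
assert reasoning_kwargs('gemini-2.5-pro', 'medium') == {'reasoning_effort': 'medium'}
assert reasoning_kwargs('o1', 'high') == {'reasoning_effort': 'high'}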


@@ -242,9 +242,6 @@ class ConversationMemory:
# Add the LLM message (assistant) that initiated the tool calls
# (overwrites any previous message with the same response_id)
logger.debug(
f'Tool calls type: {type(assistant_msg.tool_calls)}, value: {assistant_msg.tool_calls}'
)
pending_tool_call_action_messages[llm_response.id] = Message(
role=getattr(assistant_msg, 'role', 'assistant'),
# tool call content SHOULD BE a string


@@ -376,7 +376,6 @@ class CLIRuntime(Runtime):
if ready_to_read:
line = process.stdout.readline()
if line:
logger.debug(f'LINE: {line}')
output_lines.append(line)
if self._shell_stream_callback:
self._shell_stream_callback(line)
@@ -387,7 +386,6 @@
while line:
line = process.stdout.readline()
if line:
logger.debug(f'LINE: {line}')
output_lines.append(line)
if self._shell_stream_callback:
self._shell_stream_callback(line)
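
The two CLIRuntime hunks above only delete per-line debug logging from the shell output streaming loop; note that logger.debug(f'LINE: {line}') formats the f-string for every line even when DEBUG output is disabled. If line-level tracing is ever wanted again, a guarded form keeps that cost near zero (a sketch, not code from this commit; the helper name is hypothetical):

import logging

logger = logging.getLogger(__name__)

def stream_line(line: str, output_lines: list, callback=None) -> None:
    # Only pay for formatting when DEBUG is actually enabled, and use lazy
    # %-style args so the logger can skip the work otherwise.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('LINE: %s', line.rstrip())
    output_lines.append(line)
    if callback:
        callback(line)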


@@ -733,34 +733,31 @@ def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_two_positional_args(mock_litellm_completion, default_config):
mock_response = {
'choices': [{'message': {'content': 'Response to positional args.'}}]
def test_llm_gemini_thinking_parameter(mock_litellm_completion, default_config):
"""
Test that the 'thinking' parameter is correctly passed to litellm_completion
when a Gemini model is used with 'low' reasoning_effort.
"""
# Configure for Gemini model with low reasoning effort
gemini_config = copy.deepcopy(default_config)
gemini_config.model = 'gemini-2.5-pro'
gemini_config.reasoning_effort = 'low'
# Mock the response from litellm
mock_litellm_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}]
}
mock_litellm_completion.return_value = mock_response
test_llm = LLM(config=default_config)
response = test_llm.completion(
'some-model-to-be-ignored',
[{'role': 'user', 'content': 'Hello from positional args!'}],
stream=False,
)
# Initialize LLM and call completion
llm = LLM(config=gemini_config)
llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])
# Assertions
assert (
response['choices'][0]['message']['content'] == 'Response to positional args.'
)
# Verify that litellm_completion was called with the 'thinking' parameter
mock_litellm_completion.assert_called_once()
# Check if the correct arguments were passed to litellm_completion
call_args, call_kwargs = mock_litellm_completion.call_args
assert (
call_kwargs['model'] == default_config.model
) # Should use the model from config, not the first arg
assert call_kwargs['messages'] == [
{'role': 'user', 'content': 'Hello from positional args!'}
]
assert not call_kwargs['stream']
assert 'thinking' in call_kwargs
assert call_kwargs['thinking'] == {'budget_tokens': 128}
assert 'reasoning_effort' not in call_kwargs
# Ensure the first positional argument (model) was ignored
assert (
@@ -1111,3 +1108,203 @@ def test_azure_model_default_max_tokens():
# Verify the config has the default max_output_tokens value
assert llm.config.max_output_tokens is None # Default value
# Gemini Performance Optimization Tests
def test_gemini_model_keeps_none_reasoning_effort():
"""Test that Gemini models keep reasoning_effort=None for optimization."""
config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
# reasoning_effort should remain None for Gemini models
assert config.reasoning_effort is None
def test_non_gemini_model_gets_high_reasoning_effort():
"""Test that non-Gemini models get reasoning_effort='high' by default."""
config = LLMConfig(model='gpt-4o', api_key='test_key')
# Non-Gemini models should get reasoning_effort='high'
assert config.reasoning_effort == 'high'
def test_explicit_reasoning_effort_preserved():
"""Test that explicitly set reasoning_effort is preserved."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
)
# Explicitly set reasoning_effort should be preserved
assert config.reasoning_effort == 'medium'
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_none_reasoning_effort_uses_thinking_budget(mock_completion):
"""Test that Gemini with reasoning_effort=None uses thinking budget."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort=None
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that thinking budget was set and reasoning_effort was None
call_kwargs = mock_completion.call_args[1]
assert 'thinking' in call_kwargs
assert call_kwargs['thinking'] == {'budget_tokens': 128}
assert call_kwargs.get('reasoning_effort') is None
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_low_reasoning_effort_uses_thinking_budget(mock_completion):
"""Test that Gemini with reasoning_effort='low' uses thinking budget."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort='low'
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that thinking budget was set and reasoning_effort was None
call_kwargs = mock_completion.call_args[1]
assert 'thinking' in call_kwargs
assert call_kwargs['thinking'] == {'budget_tokens': 128}
assert call_kwargs.get('reasoning_effort') is None
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
"""Test that Gemini with reasoning_effort='medium' passes through to litellm."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that reasoning_effort was passed through and thinking budget was not set
call_kwargs = mock_completion.call_args[1]
assert 'thinking' not in call_kwargs
assert call_kwargs.get('reasoning_effort') == 'medium'
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_high_reasoning_effort_passes_through(mock_completion):
"""Test that Gemini with reasoning_effort='high' passes through to litellm."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort='high'
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that reasoning_effort was passed through and thinking budget was not set
call_kwargs = mock_completion.call_args[1]
assert 'thinking' not in call_kwargs
assert call_kwargs.get('reasoning_effort') == 'high'
@patch('openhands.llm.llm.litellm_completion')
def test_non_gemini_uses_reasoning_effort(mock_completion):
"""Test that non-Gemini models use reasoning_effort instead of thinking budget."""
config = LLMConfig(model='o1', api_key='test_key', reasoning_effort='high')
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that reasoning_effort was used and thinking budget was not set
call_kwargs = mock_completion.call_args[1]
assert 'thinking' not in call_kwargs
assert call_kwargs.get('reasoning_effort') == 'high'
@patch('openhands.llm.llm.litellm_completion')
def test_non_reasoning_model_no_optimization(mock_completion):
"""Test that non-reasoning models don't get optimization parameters."""
config = LLMConfig(
model='gpt-3.5-turbo', # Not in REASONING_EFFORT_SUPPORTED_MODELS
api_key='test_key',
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that neither thinking budget nor reasoning_effort were set
call_kwargs = mock_completion.call_args[1]
assert 'thinking' not in call_kwargs
assert 'reasoning_effort' not in call_kwargs
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_performance_optimization_end_to_end(mock_completion):
"""Test the complete Gemini performance optimization flow end-to-end."""
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Optimized response'}}],
'usage': {'prompt_tokens': 50, 'completion_tokens': 25},
}
# Create Gemini configuration
config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
# Verify config has optimized defaults
assert config.reasoning_effort is None
# Create LLM and make completion
llm = LLM(config)
messages = [{'role': 'user', 'content': 'Solve this complex problem'}]
response = llm.completion(messages=messages)
# Verify response was generated
assert response['choices'][0]['message']['content'] == 'Optimized response'
# Verify optimization parameters were applied
call_kwargs = mock_completion.call_args[1]
assert 'thinking' in call_kwargs
assert call_kwargs['thinking'] == {'budget_tokens': 128}
assert call_kwargs.get('reasoning_effort') is None
# Verify temperature and top_p were removed for reasoning models
assert 'temperature' not in call_kwargs
assert 'top_p' not in call_kwargs
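
Taken together, the tests above pin down the new behaviour: the default Gemini 2.5 Pro configuration gets the 128-token thinking budget, while an explicitly set reasoning_effort opts back into plain pass-through. A minimal sketch of that opt-out (import path assumed, as above):

from openhands.core.config import LLMConfig  # assumed import path

cfg = LLMConfig(model='gemini-2.5-pro', api_key='test_key', reasoning_effort='high')

# An explicit value survives post-init and is forwarded to litellm as
# reasoning_effort, so no 'thinking' budget is applied for this config
# (see test_gemini_high_reasoning_effort_passes_through above).
assert cfg.reasoning_effort == 'high'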