From a32a6230782be168002cc4050efa71c9aabccc9c Mon Sep 17 00:00:00 2001
From: Engel Nyst
Date: Wed, 30 Jul 2025 01:28:50 +0200
Subject: [PATCH] perf(gemini): Apply Gemini 2.5 Pro performance optimizations from PR 9913 (#9925)

Co-authored-by: OpenHands-Claude
---
 openhands/core/config/llm_config.py       |  12 +-
 openhands/llm/llm.py                      |  19 +-
 openhands/memory/conversation_memory.py   |   3 -
 openhands/runtime/impl/cli/cli_runtime.py |   2 -
 tests/unit/test_llm.py                    | 243 ++++++++++++++++++++--
 5 files changed, 248 insertions(+), 31 deletions(-)

diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index d05037bde4..2a11d95213 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -43,7 +43,7 @@ class LLMConfig(BaseModel):
         log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
         custom_tokenizer: A custom tokenizer to use for token counting.
         native_tool_calling: Whether to use native tool calling if supported by the model. Can be True, False, or not set.
-        reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Exclusive for o1 models.
+        reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Can apply to all reasoning models.
         seed: The seed to use for the LLM.
         safety_settings: Safety settings for models that support them (like Mistral AI and Gemini).
     """
@@ -85,7 +85,7 @@ class LLMConfig(BaseModel):
     log_completions_folder: str = Field(default=os.path.join(LOG_DIR, 'completions'))
     custom_tokenizer: str | None = Field(default=None)
     native_tool_calling: bool | None = Field(default=None)
-    reasoning_effort: str | None = Field(default='high')
+    reasoning_effort: str | None = Field(default=None)
     seed: int | None = Field(default=None)
     safety_settings: list[dict[str, str]] | None = Field(
         default=None,
@@ -171,6 +171,14 @@ class LLMConfig(BaseModel):
         if self.openrouter_app_name:
             os.environ['OR_APP_NAME'] = self.openrouter_app_name
 
+        # Set reasoning_effort to 'high' by default for non-Gemini models
+        # Gemini models use optimized thinking budget when reasoning_effort is None
+        logger.debug(
+            f'Setting reasoning_effort for model {self.model} with reasoning_effort {self.reasoning_effort}'
+        )
+        if self.reasoning_effort is None and 'gemini-2.5-pro' not in self.model:
+            self.reasoning_effort = 'high'
+
         # Set an API version by default for Azure models
         # Required for newer models.
         # Azure issue: https://github.com/All-Hands-AI/OpenHands/issues/7755
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 1db12ee32e..617ad64950 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -194,7 +194,24 @@ class LLM(RetryMixin, DebugMixin):
                 self.config.model.lower() in REASONING_EFFORT_SUPPORTED_MODELS
                 or self.config.model.split('/')[-1] in REASONING_EFFORT_SUPPORTED_MODELS
             ):
-                kwargs['reasoning_effort'] = self.config.reasoning_effort
+                # For Gemini models, only map 'low' to optimized thinking budget
+                # Let other reasoning_effort values pass through to API as-is
+                if 'gemini-2.5-pro' in self.config.model:
+                    logger.debug(
+                        f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort}'
+                    )
+                    if self.config.reasoning_effort in {None, 'low', 'none'}:
+                        kwargs['thinking'] = {'budget_tokens': 128}
+                        kwargs['allowed_openai_params'] = ['thinking']
+                        kwargs.pop('reasoning_effort', None)
+                    else:
+                        kwargs['reasoning_effort'] = self.config.reasoning_effort
+                    logger.debug(
+                        f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort} mapped to thinking {kwargs.get("thinking")}'
+                    )
+
+                else:
+                    kwargs['reasoning_effort'] = self.config.reasoning_effort
                 kwargs.pop(
                     'temperature'
                 )  # temperature is not supported for reasoning models
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index a6ced05ae7..61ceaaa327 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -242,9 +242,6 @@ class ConversationMemory:
 
             # Add the LLM message (assistant) that initiated the tool calls
             # (overwrites any previous message with the same response_id)
-            logger.debug(
-                f'Tool calls type: {type(assistant_msg.tool_calls)}, value: {assistant_msg.tool_calls}'
-            )
             pending_tool_call_action_messages[llm_response.id] = Message(
                 role=getattr(assistant_msg, 'role', 'assistant'),
                 # tool call content SHOULD BE a string
diff --git a/openhands/runtime/impl/cli/cli_runtime.py b/openhands/runtime/impl/cli/cli_runtime.py
index 105b066e18..6e1e635a69 100644
--- a/openhands/runtime/impl/cli/cli_runtime.py
+++ b/openhands/runtime/impl/cli/cli_runtime.py
@@ -376,7 +376,6 @@ class CLIRuntime(Runtime):
             if ready_to_read:
                 line = process.stdout.readline()
                 if line:
-                    logger.debug(f'LINE: {line}')
                     output_lines.append(line)
                     if self._shell_stream_callback:
                         self._shell_stream_callback(line)
@@ -387,7 +386,6 @@ class CLIRuntime(Runtime):
         while line:
             line = process.stdout.readline()
             if line:
-                logger.debug(f'LINE: {line}')
                 output_lines.append(line)
                 if self._shell_stream_callback:
                     self._shell_stream_callback(line)
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index c5a1c7f48a..0a644fdc8f 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -733,34 +733,31 @@ def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
 
 
 @patch('openhands.llm.llm.litellm_completion')
-def test_completion_with_two_positional_args(mock_litellm_completion, default_config):
-    mock_response = {
-        'choices': [{'message': {'content': 'Response to positional args.'}}]
+def test_llm_gemini_thinking_parameter(mock_litellm_completion, default_config):
+    """
+    Test that the 'thinking' parameter is correctly passed to litellm_completion
+    when a Gemini model is used with 'low' reasoning_effort.
+    """
+    # Configure for Gemini model with low reasoning effort
+    gemini_config = copy.deepcopy(default_config)
+    gemini_config.model = 'gemini-2.5-pro'
+    gemini_config.reasoning_effort = 'low'
+
+    # Mock the response from litellm
+    mock_litellm_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}]
     }
-    mock_litellm_completion.return_value = mock_response
-    test_llm = LLM(config=default_config)
-    response = test_llm.completion(
-        'some-model-to-be-ignored',
-        [{'role': 'user', 'content': 'Hello from positional args!'}],
-        stream=False,
-    )
+    # Initialize LLM and call completion
+    llm = LLM(config=gemini_config)
+    llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])
 
-    # Assertions
-    assert (
-        response['choices'][0]['message']['content'] == 'Response to positional args.'
-    )
+    # Verify that litellm_completion was called with the 'thinking' parameter
     mock_litellm_completion.assert_called_once()
-
-    # Check if the correct arguments were passed to litellm_completion
     call_args, call_kwargs = mock_litellm_completion.call_args
-    assert (
-        call_kwargs['model'] == default_config.model
-    )  # Should use the model from config, not the first arg
-    assert call_kwargs['messages'] == [
-        {'role': 'user', 'content': 'Hello from positional args!'}
-    ]
-    assert not call_kwargs['stream']
+    assert 'thinking' in call_kwargs
+    assert call_kwargs['thinking'] == {'budget_tokens': 128}
+    assert 'reasoning_effort' not in call_kwargs
 
     # Ensure the first positional argument (model) was ignored
     assert (
@@ -1111,3 +1108,203 @@ def test_azure_model_default_max_tokens():
 
     # Verify the config has the default max_output_tokens value
     assert llm.config.max_output_tokens is None  # Default value
+
+
+# Gemini Performance Optimization Tests
+
+
+def test_gemini_model_keeps_none_reasoning_effort():
+    """Test that Gemini models keep reasoning_effort=None for optimization."""
+    config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
+    # reasoning_effort should remain None for Gemini models
+    assert config.reasoning_effort is None
+
+
+def test_non_gemini_model_gets_high_reasoning_effort():
+    """Test that non-Gemini models get reasoning_effort='high' by default."""
+    config = LLMConfig(model='gpt-4o', api_key='test_key')
+    # Non-Gemini models should get reasoning_effort='high'
+    assert config.reasoning_effort == 'high'
+
+
+def test_explicit_reasoning_effort_preserved():
+    """Test that explicitly set reasoning_effort is preserved."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
+    )
+    # Explicitly set reasoning_effort should be preserved
+    assert config.reasoning_effort == 'medium'
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_none_reasoning_effort_uses_thinking_budget(mock_completion):
+    """Test that Gemini with reasoning_effort=None uses thinking budget."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort=None
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that thinking budget was set and reasoning_effort was None
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' in call_kwargs
+    assert call_kwargs['thinking'] == {'budget_tokens': 128}
+    assert call_kwargs.get('reasoning_effort') is None
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_low_reasoning_effort_uses_thinking_budget(mock_completion):
+    """Test that Gemini with reasoning_effort='low' uses thinking budget."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='low'
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that thinking budget was set and reasoning_effort was None
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' in call_kwargs
+    assert call_kwargs['thinking'] == {'budget_tokens': 128}
+    assert call_kwargs.get('reasoning_effort') is None
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
+    """Test that Gemini with reasoning_effort='medium' passes through to litellm."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that reasoning_effort was passed through and thinking budget was not set
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' not in call_kwargs
+    assert call_kwargs.get('reasoning_effort') == 'medium'
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_high_reasoning_effort_passes_through(mock_completion):
+    """Test that Gemini with reasoning_effort='high' passes through to litellm."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='high'
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that reasoning_effort was passed through and thinking budget was not set
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' not in call_kwargs
+    assert call_kwargs.get('reasoning_effort') == 'high'
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_non_gemini_uses_reasoning_effort(mock_completion):
+    """Test that non-Gemini models use reasoning_effort instead of thinking budget."""
+    config = LLMConfig(model='o1', api_key='test_key', reasoning_effort='high')
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that reasoning_effort was used and thinking budget was not set
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' not in call_kwargs
+    assert call_kwargs.get('reasoning_effort') == 'high'
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_non_reasoning_model_no_optimization(mock_completion):
+    """Test that non-reasoning models don't get optimization parameters."""
+    config = LLMConfig(
+        model='gpt-3.5-turbo',  # Not in REASONING_EFFORT_SUPPORTED_MODELS
+        api_key='test_key',
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that neither thinking budget nor reasoning_effort were set
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' not in call_kwargs
+    assert 'reasoning_effort' not in call_kwargs
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_performance_optimization_end_to_end(mock_completion):
+    """Test the complete Gemini performance optimization flow end-to-end."""
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Optimized response'}}],
+        'usage': {'prompt_tokens': 50, 'completion_tokens': 25},
+    }
+
+    # Create Gemini configuration
+    config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
+
+    # Verify config has optimized defaults
+    assert config.reasoning_effort is None
+
+    # Create LLM and make completion
+    llm = LLM(config)
+    messages = [{'role': 'user', 'content': 'Solve this complex problem'}]
+
+    response = llm.completion(messages=messages)
+
+    # Verify response was generated
+    assert response['choices'][0]['message']['content'] == 'Optimized response'
+
+    # Verify optimization parameters were applied
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' in call_kwargs
+    assert call_kwargs['thinking'] == {'budget_tokens': 128}
+    assert call_kwargs.get('reasoning_effort') is None
+
+    # Verify temperature and top_p were removed for reasoning models
+    assert 'temperature' not in call_kwargs
+    assert 'top_p' not in call_kwargs
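
Below is a minimal, self-contained sketch (not part of the patch) of the reasoning_effort -> litellm kwarg mapping that the patched openhands/llm/llm.py is expected to produce. The helper name expected_llm_kwargs and the REASONING_EFFORT_SUPPORTED set are illustrative stand-ins, not OpenHands APIs; the 128-token budget mirrors the value asserted in the tests above.

# sketch.py -- assumed behavior mirrored from the diff, runnable on its own
from typing import Any

# Illustrative subset; the real constant is REASONING_EFFORT_SUPPORTED_MODELS in openhands.llm.llm
REASONING_EFFORT_SUPPORTED = {'o1', 'o3', 'gemini-2.5-pro'}


def expected_llm_kwargs(model: str, reasoning_effort: str | None) -> dict[str, Any]:
    """Return the reasoning-related kwargs the patched wrapper should pass to litellm."""
    kwargs: dict[str, Any] = {}
    if model.split('/')[-1] not in REASONING_EFFORT_SUPPORTED:
        # Non-reasoning models get neither 'thinking' nor 'reasoning_effort'
        return kwargs
    if 'gemini-2.5-pro' in model:
        if reasoning_effort in {None, 'low', 'none'}:
            # Optimized path: fixed 128-token thinking budget, no reasoning_effort kwarg
            kwargs['thinking'] = {'budget_tokens': 128}
            kwargs['allowed_openai_params'] = ['thinking']
        else:
            # 'medium' and 'high' pass through to the API unchanged
            kwargs['reasoning_effort'] = reasoning_effort
    else:
        # Non-Gemini reasoning models keep the plain reasoning_effort parameter
        kwargs['reasoning_effort'] = reasoning_effort
    return kwargs


if __name__ == '__main__':
    assert expected_llm_kwargs('gemini-2.5-pro', None) == {
        'thinking': {'budget_tokens': 128},
        'allowed_openai_params': ['thinking'],
    }
    assert expected_llm_kwargs('gemini-2.5-pro', 'medium') == {'reasoning_effort': 'medium'}
    assert expected_llm_kwargs('o1', 'high') == {'reasoning_effort': 'high'}
    assert expected_llm_kwargs('gpt-3.5-turbo', None) == {}
    print('kwarg mapping matches the behavior asserted in tests/unit/test_llm.py')

Note that in the patched LLMConfig, reasoning_effort=None is resolved to 'high' for every model except gemini-2.5-pro before the mapping above runs, so the sketch takes the already-resolved config value as input.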