perf(gemini): Apply Gemini 2.5 Pro performance optimizations from PR 9913 (#9925)

Co-authored-by: OpenHands-Claude <openhands@all-hands.dev>
Engel Nyst 2025-07-30 01:28:50 +02:00 committed by GitHub
parent 03c8312f5f
commit a32a623078
5 changed files with 248 additions and 31 deletions


@@ -43,7 +43,7 @@ class LLMConfig(BaseModel):
log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
custom_tokenizer: A custom tokenizer to use for token counting.
native_tool_calling: Whether to use native tool calling if supported by the model. Can be True, False, or not set.
reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Exclusive for o1 models.
reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Can apply to all reasoning models.
seed: The seed to use for the LLM.
safety_settings: Safety settings for models that support them (like Mistral AI and Gemini).
"""
@@ -85,7 +85,7 @@ class LLMConfig(BaseModel):
log_completions_folder: str = Field(default=os.path.join(LOG_DIR, 'completions'))
custom_tokenizer: str | None = Field(default=None)
native_tool_calling: bool | None = Field(default=None)
reasoning_effort: str | None = Field(default='high')
reasoning_effort: str | None = Field(default=None)
seed: int | None = Field(default=None)
safety_settings: list[dict[str, str]] | None = Field(
default=None,
@@ -171,6 +171,14 @@ class LLMConfig(BaseModel):
if self.openrouter_app_name:
os.environ['OR_APP_NAME'] = self.openrouter_app_name
# Set reasoning_effort to 'high' by default for models other than Gemini 2.5 Pro
# Gemini 2.5 Pro uses an optimized thinking budget when reasoning_effort is None
logger.debug(
f'Setting reasoning_effort for model {self.model} with reasoning_effort {self.reasoning_effort}'
)
if self.reasoning_effort is None and 'gemini-2.5-pro' not in self.model:
self.reasoning_effort = 'high'
# Set an API version by default for Azure models
# Required for newer models.
# Azure issue: https://github.com/All-Hands-AI/OpenHands/issues/7755
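
The net effect of the config change above: reasoning_effort now defaults to None and is only promoted to 'high' in the post-init logic for models other than Gemini 2.5 Pro, so Gemini 2.5 Pro can take the thinking-budget path in the completion layer below. A minimal sketch of the resulting defaults (the import path is an assumption, not shown in this diff):

# Sketch only: the import path is assumed, not taken from this diff.
from openhands.core.config import LLMConfig

gemini_cfg = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
other_cfg = LLMConfig(model='gpt-4o', api_key='test_key')

# Gemini 2.5 Pro keeps None so the completion layer can apply its thinking budget;
# other models fall back to 'high' in the post-init hook shown above.
assert gemini_cfg.reasoning_effort is None
assert other_cfg.reasoning_effort == 'high'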


@@ -194,7 +194,24 @@ class LLM(RetryMixin, DebugMixin):
self.config.model.lower() in REASONING_EFFORT_SUPPORTED_MODELS
or self.config.model.split('/')[-1] in REASONING_EFFORT_SUPPORTED_MODELS
):
kwargs['reasoning_effort'] = self.config.reasoning_effort
# For Gemini 2.5 Pro, map None, 'low', and 'none' to an optimized thinking budget
# Let other reasoning_effort values pass through to the API as-is
if 'gemini-2.5-pro' in self.config.model:
logger.debug(
f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort}'
)
if self.config.reasoning_effort in {None, 'low', 'none'}:
kwargs['thinking'] = {'budget_tokens': 128}
kwargs['allowed_openai_params'] = ['thinking']
kwargs.pop('reasoning_effort', None)
else:
kwargs['reasoning_effort'] = self.config.reasoning_effort
logger.debug(
f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort} mapped to thinking {kwargs.get("thinking")}'
)
else:
kwargs['reasoning_effort'] = self.config.reasoning_effort
kwargs.pop(
'temperature'
) # temperature is not supported for reasoning models
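
With that default in place, the completion path above sends Gemini 2.5 Pro a fixed 128-token thinking budget (via litellm's 'thinking' and 'allowed_openai_params' kwargs) whenever reasoning_effort is None, 'low', or 'none', and forwards reasoning_effort unchanged in every other case. A standalone sketch of just that mapping (the helper name is illustrative, not from the repo):

def reasoning_kwargs(model: str, reasoning_effort: str | None) -> dict:
    # Mirror of the branch above: Gemini 2.5 Pro gets a capped thinking budget
    # for low/none/unset effort; everything else forwards reasoning_effort.
    if 'gemini-2.5-pro' in model and reasoning_effort in {None, 'low', 'none'}:
        return {
            'thinking': {'budget_tokens': 128},
            'allowed_openai_params': ['thinking'],
        }
    return {'reasoning_effort': reasoning_effort}

assert reasoning_kwargs('gemini-2.5-pro', 'low') == {
    'thinking': {'budget_tokens': 128},
    'allowed_openai_params': ['thinking'],
}
assert reasoning_kwargs('gemini-2.5-pro', 'medium') == {'reasoning_effort': 'medium'}
assert reasoning_kwargs('o1', 'high') == {'reasoning_effort': 'high'}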


@@ -242,9 +242,6 @@ class ConversationMemory:
# Add the LLM message (assistant) that initiated the tool calls
# (overwrites any previous message with the same response_id)
logger.debug(
f'Tool calls type: {type(assistant_msg.tool_calls)}, value: {assistant_msg.tool_calls}'
)
pending_tool_call_action_messages[llm_response.id] = Message(
role=getattr(assistant_msg, 'role', 'assistant'),
# tool call content SHOULD BE a string


@@ -376,7 +376,6 @@ class CLIRuntime(Runtime):
if ready_to_read:
line = process.stdout.readline()
if line:
logger.debug(f'LINE: {line}')
output_lines.append(line)
if self._shell_stream_callback:
self._shell_stream_callback(line)
@@ -387,7 +386,6 @@
while line:
line = process.stdout.readline()
if line:
logger.debug(f'LINE: {line}')
output_lines.append(line)
if self._shell_stream_callback:
self._shell_stream_callback(line)
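
The two CLIRuntime hunks above only delete per-line debug logging from the shell output streaming loop; note that logger.debug(f'LINE: {line}') formats the f-string for every line even when DEBUG output is disabled. If line-level tracing is ever wanted again, a guarded form keeps that cost near zero (a sketch, not code from this commit; the helper name is hypothetical):

import logging

logger = logging.getLogger(__name__)

def stream_line(line: str, output_lines: list, callback=None) -> None:
    # Only pay for formatting when DEBUG is actually enabled, and use lazy
    # %-style args so the logger can skip the work otherwise.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('LINE: %s', line.rstrip())
    output_lines.append(line)
    if callback:
        callback(line)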


@@ -733,34 +733,31 @@ def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_two_positional_args(mock_litellm_completion, default_config):
mock_response = {
'choices': [{'message': {'content': 'Response to positional args.'}}]
def test_llm_gemini_thinking_parameter(mock_litellm_completion, default_config):
"""
Test that the 'thinking' parameter is correctly passed to litellm_completion
when a Gemini model is used with 'low' reasoning_effort.
"""
# Configure for Gemini model with low reasoning effort
gemini_config = copy.deepcopy(default_config)
gemini_config.model = 'gemini-2.5-pro'
gemini_config.reasoning_effort = 'low'
# Mock the response from litellm
mock_litellm_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}]
}
mock_litellm_completion.return_value = mock_response
test_llm = LLM(config=default_config)
response = test_llm.completion(
'some-model-to-be-ignored',
[{'role': 'user', 'content': 'Hello from positional args!'}],
stream=False,
)
# Initialize LLM and call completion
llm = LLM(config=gemini_config)
llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])
# Assertions
assert (
response['choices'][0]['message']['content'] == 'Response to positional args.'
)
# Verify that litellm_completion was called with the 'thinking' parameter
mock_litellm_completion.assert_called_once()
# Check if the correct arguments were passed to litellm_completion
call_args, call_kwargs = mock_litellm_completion.call_args
assert (
call_kwargs['model'] == default_config.model
) # Should use the model from config, not the first arg
assert call_kwargs['messages'] == [
{'role': 'user', 'content': 'Hello from positional args!'}
]
assert not call_kwargs['stream']
assert 'thinking' in call_kwargs
assert call_kwargs['thinking'] == {'budget_tokens': 128}
assert 'reasoning_effort' not in call_kwargs
# Ensure the first positional argument (model) was ignored
assert (
@@ -1111,3 +1108,203 @@ def test_azure_model_default_max_tokens():
# Verify the config has the default max_output_tokens value
assert llm.config.max_output_tokens is None # Default value
# Gemini Performance Optimization Tests
def test_gemini_model_keeps_none_reasoning_effort():
"""Test that Gemini models keep reasoning_effort=None for optimization."""
config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
# reasoning_effort should remain None for Gemini models
assert config.reasoning_effort is None
def test_non_gemini_model_gets_high_reasoning_effort():
"""Test that non-Gemini models get reasoning_effort='high' by default."""
config = LLMConfig(model='gpt-4o', api_key='test_key')
# Non-Gemini models should get reasoning_effort='high'
assert config.reasoning_effort == 'high'
def test_explicit_reasoning_effort_preserved():
"""Test that explicitly set reasoning_effort is preserved."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
)
# Explicitly set reasoning_effort should be preserved
assert config.reasoning_effort == 'medium'
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_none_reasoning_effort_uses_thinking_budget(mock_completion):
"""Test that Gemini with reasoning_effort=None uses thinking budget."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort=None
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that thinking budget was set and reasoning_effort was None
call_kwargs = mock_completion.call_args[1]
assert 'thinking' in call_kwargs
assert call_kwargs['thinking'] == {'budget_tokens': 128}
assert call_kwargs.get('reasoning_effort') is None
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_low_reasoning_effort_uses_thinking_budget(mock_completion):
"""Test that Gemini with reasoning_effort='low' uses thinking budget."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort='low'
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that thinking budget was set and reasoning_effort was None
call_kwargs = mock_completion.call_args[1]
assert 'thinking' in call_kwargs
assert call_kwargs['thinking'] == {'budget_tokens': 128}
assert call_kwargs.get('reasoning_effort') is None
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
"""Test that Gemini with reasoning_effort='medium' passes through to litellm."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that reasoning_effort was passed through and thinking budget was not set
call_kwargs = mock_completion.call_args[1]
assert 'thinking' not in call_kwargs
assert call_kwargs.get('reasoning_effort') == 'medium'
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_high_reasoning_effort_passes_through(mock_completion):
"""Test that Gemini with reasoning_effort='high' passes through to litellm."""
config = LLMConfig(
model='gemini-2.5-pro', api_key='test_key', reasoning_effort='high'
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that reasoning_effort was passed through and thinking budget was not set
call_kwargs = mock_completion.call_args[1]
assert 'thinking' not in call_kwargs
assert call_kwargs.get('reasoning_effort') == 'high'
@patch('openhands.llm.llm.litellm_completion')
def test_non_gemini_uses_reasoning_effort(mock_completion):
"""Test that non-Gemini models use reasoning_effort instead of thinking budget."""
config = LLMConfig(model='o1', api_key='test_key', reasoning_effort='high')
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that reasoning_effort was used and thinking budget was not set
call_kwargs = mock_completion.call_args[1]
assert 'thinking' not in call_kwargs
assert call_kwargs.get('reasoning_effort') == 'high'
@patch('openhands.llm.llm.litellm_completion')
def test_non_reasoning_model_no_optimization(mock_completion):
"""Test that non-reasoning models don't get optimization parameters."""
config = LLMConfig(
model='gpt-3.5-turbo', # Not in REASONING_EFFORT_SUPPORTED_MODELS
api_key='test_key',
)
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Test response'}}],
'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
}
llm = LLM(config)
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that neither thinking budget nor reasoning_effort were set
call_kwargs = mock_completion.call_args[1]
assert 'thinking' not in call_kwargs
assert 'reasoning_effort' not in call_kwargs
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_performance_optimization_end_to_end(mock_completion):
"""Test the complete Gemini performance optimization flow end-to-end."""
# Mock the completion response
mock_completion.return_value = {
'choices': [{'message': {'content': 'Optimized response'}}],
'usage': {'prompt_tokens': 50, 'completion_tokens': 25},
}
# Create Gemini configuration
config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
# Verify config has optimized defaults
assert config.reasoning_effort is None
# Create LLM and make completion
llm = LLM(config)
messages = [{'role': 'user', 'content': 'Solve this complex problem'}]
response = llm.completion(messages=messages)
# Verify response was generated
assert response['choices'][0]['message']['content'] == 'Optimized response'
# Verify optimization parameters were applied
call_kwargs = mock_completion.call_args[1]
assert 'thinking' in call_kwargs
assert call_kwargs['thinking'] == {'budget_tokens': 128}
assert call_kwargs.get('reasoning_effort') is None
# Verify temperature and top_p were removed for reasoning models
assert 'temperature' not in call_kwargs
assert 'top_p' not in call_kwargs
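
Taken together, the tests above pin down the new behaviour: the default Gemini 2.5 Pro configuration gets the 128-token thinking budget, while an explicitly set reasoning_effort opts back into plain pass-through. A minimal sketch of that opt-out (import path assumed, as above):

from openhands.core.config import LLMConfig  # assumed import path

cfg = LLMConfig(model='gemini-2.5-pro', api_key='test_key', reasoning_effort='high')

# An explicit value survives post-init and is forwarded to litellm as
# reasoning_effort, so no 'thinking' budget is applied for this config
# (see test_gemini_high_reasoning_effort_passes_through above).
assert cfg.reasoning_effort == 'high'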