From a32a6230782be168002cc4050efa71c9aabccc9c Mon Sep 17 00:00:00 2001
From: Engel Nyst
Date: Wed, 30 Jul 2025 01:28:50 +0200
Subject: [PATCH] perf(gemini): Apply Gemini 2.5 Pro performance optimizations from PR 9913 (#9925)

Co-authored-by: OpenHands-Claude
---
 openhands/core/config/llm_config.py       |  12 +-
 openhands/llm/llm.py                      |  19 +-
 openhands/memory/conversation_memory.py   |   3 -
 openhands/runtime/impl/cli/cli_runtime.py |   2 -
 tests/unit/test_llm.py                    | 243 ++++++++++++++++++++--
 5 files changed, 248 insertions(+), 31 deletions(-)

diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index d05037bde4..2a11d95213 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -43,7 +43,7 @@ class LLMConfig(BaseModel):
         log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
         custom_tokenizer: A custom tokenizer to use for token counting.
         native_tool_calling: Whether to use native tool calling if supported by the model. Can be True, False, or not set.
-        reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Exclusive for o1 models.
+        reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Can apply to all reasoning models.
         seed: The seed to use for the LLM.
         safety_settings: Safety settings for models that support them (like Mistral AI and Gemini).
     """
@@ -85,7 +85,7 @@ class LLMConfig(BaseModel):
     log_completions_folder: str = Field(default=os.path.join(LOG_DIR, 'completions'))
     custom_tokenizer: str | None = Field(default=None)
     native_tool_calling: bool | None = Field(default=None)
-    reasoning_effort: str | None = Field(default='high')
+    reasoning_effort: str | None = Field(default=None)
     seed: int | None = Field(default=None)
     safety_settings: list[dict[str, str]] | None = Field(
         default=None,
@@ -171,6 +171,14 @@ class LLMConfig(BaseModel):
         if self.openrouter_app_name:
             os.environ['OR_APP_NAME'] = self.openrouter_app_name
 
+        # Set reasoning_effort to 'high' by default for non-Gemini models
+        # Gemini models use optimized thinking budget when reasoning_effort is None
+        logger.debug(
+            f'Setting reasoning_effort for model {self.model} with reasoning_effort {self.reasoning_effort}'
+        )
+        if self.reasoning_effort is None and 'gemini-2.5-pro' not in self.model:
+            self.reasoning_effort = 'high'
+
         # Set an API version by default for Azure models
         # Required for newer models.
         # Azure issue: https://github.com/All-Hands-AI/OpenHands/issues/7755
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 1db12ee32e..617ad64950 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -194,7 +194,24 @@ class LLM(RetryMixin, DebugMixin):
                 self.config.model.lower() in REASONING_EFFORT_SUPPORTED_MODELS
                 or self.config.model.split('/')[-1] in REASONING_EFFORT_SUPPORTED_MODELS
             ):
-                kwargs['reasoning_effort'] = self.config.reasoning_effort
+                # For Gemini models, only map 'low' to optimized thinking budget
+                # Let other reasoning_effort values pass through to API as-is
+                if 'gemini-2.5-pro' in self.config.model:
+                    logger.debug(
+                        f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort}'
+                    )
+                    if self.config.reasoning_effort in {None, 'low', 'none'}:
+                        kwargs['thinking'] = {'budget_tokens': 128}
+                        kwargs['allowed_openai_params'] = ['thinking']
+                        kwargs.pop('reasoning_effort', None)
+                    else:
+                        kwargs['reasoning_effort'] = self.config.reasoning_effort
+                    logger.debug(
+                        f'Gemini model {self.config.model} with reasoning_effort {self.config.reasoning_effort} mapped to thinking {kwargs.get("thinking")}'
+                    )
+
+                else:
+                    kwargs['reasoning_effort'] = self.config.reasoning_effort
                 kwargs.pop(
                     'temperature'
                 )  # temperature is not supported for reasoning models
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index a6ced05ae7..61ceaaa327 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -242,9 +242,6 @@ class ConversationMemory:
 
             # Add the LLM message (assistant) that initiated the tool calls
             # (overwrites any previous message with the same response_id)
-            logger.debug(
-                f'Tool calls type: {type(assistant_msg.tool_calls)}, value: {assistant_msg.tool_calls}'
-            )
             pending_tool_call_action_messages[llm_response.id] = Message(
                 role=getattr(assistant_msg, 'role', 'assistant'),
                 # tool call content SHOULD BE a string
diff --git a/openhands/runtime/impl/cli/cli_runtime.py b/openhands/runtime/impl/cli/cli_runtime.py
index 105b066e18..6e1e635a69 100644
--- a/openhands/runtime/impl/cli/cli_runtime.py
+++ b/openhands/runtime/impl/cli/cli_runtime.py
@@ -376,7 +376,6 @@ class CLIRuntime(Runtime):
             if ready_to_read:
                 line = process.stdout.readline()
                 if line:
-                    logger.debug(f'LINE: {line}')
                     output_lines.append(line)
                     if self._shell_stream_callback:
                         self._shell_stream_callback(line)
@@ -387,7 +386,6 @@ class CLIRuntime(Runtime):
         while line:
             line = process.stdout.readline()
             if line:
-                logger.debug(f'LINE: {line}')
                 output_lines.append(line)
                 if self._shell_stream_callback:
                     self._shell_stream_callback(line)
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index c5a1c7f48a..0a644fdc8f 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -733,34 +733,31 @@ def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
 
 
 @patch('openhands.llm.llm.litellm_completion')
-def test_completion_with_two_positional_args(mock_litellm_completion, default_config):
-    mock_response = {
-        'choices': [{'message': {'content': 'Response to positional args.'}}]
+def test_llm_gemini_thinking_parameter(mock_litellm_completion, default_config):
+    """
+    Test that the 'thinking' parameter is correctly passed to litellm_completion
+    when a Gemini model is used with 'low' reasoning_effort.
+    """
+    # Configure for Gemini model with low reasoning effort
+    gemini_config = copy.deepcopy(default_config)
+    gemini_config.model = 'gemini-2.5-pro'
+    gemini_config.reasoning_effort = 'low'
+
+    # Mock the response from litellm
+    mock_litellm_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}]
     }
-    mock_litellm_completion.return_value = mock_response
-    test_llm = LLM(config=default_config)
-    response = test_llm.completion(
-        'some-model-to-be-ignored',
-        [{'role': 'user', 'content': 'Hello from positional args!'}],
-        stream=False,
-    )
+    # Initialize LLM and call completion
+    llm = LLM(config=gemini_config)
+    llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])
 
-    # Assertions
-    assert (
-        response['choices'][0]['message']['content'] == 'Response to positional args.'
-    )
+    # Verify that litellm_completion was called with the 'thinking' parameter
     mock_litellm_completion.assert_called_once()
-
-    # Check if the correct arguments were passed to litellm_completion
     call_args, call_kwargs = mock_litellm_completion.call_args
-    assert (
-        call_kwargs['model'] == default_config.model
-    )  # Should use the model from config, not the first arg
-    assert call_kwargs['messages'] == [
-        {'role': 'user', 'content': 'Hello from positional args!'}
-    ]
-    assert not call_kwargs['stream']
+    assert 'thinking' in call_kwargs
+    assert call_kwargs['thinking'] == {'budget_tokens': 128}
+    assert 'reasoning_effort' not in call_kwargs
 
     # Ensure the first positional argument (model) was ignored
     assert (
@@ -1111,3 +1108,203 @@ def test_azure_model_default_max_tokens():
 
     # Verify the config has the default max_output_tokens value
     assert llm.config.max_output_tokens is None  # Default value
+
+
+# Gemini Performance Optimization Tests
+
+
+def test_gemini_model_keeps_none_reasoning_effort():
+    """Test that Gemini models keep reasoning_effort=None for optimization."""
+    config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
+    # reasoning_effort should remain None for Gemini models
+    assert config.reasoning_effort is None
+
+
+def test_non_gemini_model_gets_high_reasoning_effort():
+    """Test that non-Gemini models get reasoning_effort='high' by default."""
+    config = LLMConfig(model='gpt-4o', api_key='test_key')
+    # Non-Gemini models should get reasoning_effort='high'
+    assert config.reasoning_effort == 'high'
+
+
+def test_explicit_reasoning_effort_preserved():
+    """Test that explicitly set reasoning_effort is preserved."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
+    )
+    # Explicitly set reasoning_effort should be preserved
+    assert config.reasoning_effort == 'medium'
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_none_reasoning_effort_uses_thinking_budget(mock_completion):
+    """Test that Gemini with reasoning_effort=None uses thinking budget."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort=None
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that thinking budget was set and reasoning_effort was None
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' in call_kwargs
+    assert call_kwargs['thinking'] == {'budget_tokens': 128}
+    assert call_kwargs.get('reasoning_effort') is None
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_low_reasoning_effort_uses_thinking_budget(mock_completion):
+    """Test that Gemini with reasoning_effort='low' uses thinking budget."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='low'
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that thinking budget was set and reasoning_effort was None
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' in call_kwargs
+    assert call_kwargs['thinking'] == {'budget_tokens': 128}
+    assert call_kwargs.get('reasoning_effort') is None
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
+    """Test that Gemini with reasoning_effort='medium' passes through to litellm."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that reasoning_effort was passed through and thinking budget was not set
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' not in call_kwargs
+    assert call_kwargs.get('reasoning_effort') == 'medium'
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_high_reasoning_effort_passes_through(mock_completion):
+    """Test that Gemini with reasoning_effort='high' passes through to litellm."""
+    config = LLMConfig(
+        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='high'
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that reasoning_effort was passed through and thinking budget was not set
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' not in call_kwargs
+    assert call_kwargs.get('reasoning_effort') == 'high'
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_non_gemini_uses_reasoning_effort(mock_completion):
+    """Test that non-Gemini models use reasoning_effort instead of thinking budget."""
+    config = LLMConfig(model='o1', api_key='test_key', reasoning_effort='high')
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that reasoning_effort was used and thinking budget was not set
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' not in call_kwargs
+    assert call_kwargs.get('reasoning_effort') == 'high'
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_non_reasoning_model_no_optimization(mock_completion):
+    """Test that non-reasoning models don't get optimization parameters."""
+    config = LLMConfig(
+        model='gpt-3.5-turbo',  # Not in REASONING_EFFORT_SUPPORTED_MODELS
+        api_key='test_key',
+    )
+
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Test response'}}],
+        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
+    }
+
+    llm = LLM(config)
+    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
+    llm.completion(messages=sample_messages)
+
+    # Verify that neither thinking budget nor reasoning_effort were set
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' not in call_kwargs
+    assert 'reasoning_effort' not in call_kwargs
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_gemini_performance_optimization_end_to_end(mock_completion):
+    """Test the complete Gemini performance optimization flow end-to-end."""
+    # Mock the completion response
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'Optimized response'}}],
+        'usage': {'prompt_tokens': 50, 'completion_tokens': 25},
+    }
+
+    # Create Gemini configuration
+    config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
+
+    # Verify config has optimized defaults
+    assert config.reasoning_effort is None
+
+    # Create LLM and make completion
+    llm = LLM(config)
+    messages = [{'role': 'user', 'content': 'Solve this complex problem'}]
+
+    response = llm.completion(messages=messages)
+
+    # Verify response was generated
+    assert response['choices'][0]['message']['content'] == 'Optimized response'
+
+    # Verify optimization parameters were applied
+    call_kwargs = mock_completion.call_args[1]
+    assert 'thinking' in call_kwargs
+    assert call_kwargs['thinking'] == {'budget_tokens': 128}
+    assert call_kwargs.get('reasoning_effort') is None
+
+    # Verify temperature and top_p were removed for reasoning models
+    assert 'temperature' not in call_kwargs
+    assert 'top_p' not in call_kwargs
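
Below is a minimal, self-contained sketch (not part of the patch) of the reasoning_effort -> litellm kwarg mapping that the patched openhands/llm/llm.py is expected to produce. The helper name expected_llm_kwargs and the REASONING_EFFORT_SUPPORTED set are illustrative stand-ins, not OpenHands APIs; the 128-token budget mirrors the value asserted in the tests above.

# sketch.py -- assumed behavior mirrored from the diff, runnable on its own
from typing import Any

# Illustrative subset; the real constant is REASONING_EFFORT_SUPPORTED_MODELS in openhands.llm.llm
REASONING_EFFORT_SUPPORTED = {'o1', 'o3', 'gemini-2.5-pro'}


def expected_llm_kwargs(model: str, reasoning_effort: str | None) -> dict[str, Any]:
    """Return the reasoning-related kwargs the patched wrapper should pass to litellm."""
    kwargs: dict[str, Any] = {}
    if model.split('/')[-1] not in REASONING_EFFORT_SUPPORTED:
        # Non-reasoning models get neither 'thinking' nor 'reasoning_effort'
        return kwargs
    if 'gemini-2.5-pro' in model:
        if reasoning_effort in {None, 'low', 'none'}:
            # Optimized path: fixed 128-token thinking budget, no reasoning_effort kwarg
            kwargs['thinking'] = {'budget_tokens': 128}
            kwargs['allowed_openai_params'] = ['thinking']
        else:
            # 'medium' and 'high' pass through to the API unchanged
            kwargs['reasoning_effort'] = reasoning_effort
    else:
        # Non-Gemini reasoning models keep the plain reasoning_effort parameter
        kwargs['reasoning_effort'] = reasoning_effort
    return kwargs


if __name__ == '__main__':
    assert expected_llm_kwargs('gemini-2.5-pro', None) == {
        'thinking': {'budget_tokens': 128},
        'allowed_openai_params': ['thinking'],
    }
    assert expected_llm_kwargs('gemini-2.5-pro', 'medium') == {'reasoning_effort': 'medium'}
    assert expected_llm_kwargs('o1', 'high') == {'reasoning_effort': 'high'}
    assert expected_llm_kwargs('gpt-3.5-turbo', None) == {}
    print('kwarg mapping matches the behavior asserted in tests/unit/test_llm.py')

Note that in the patched LLMConfig, reasoning_effort=None is resolved to 'high' for every model except gemini-2.5-pro before the mapping above runs, so the sketch takes the already-resolved config value as input.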