fix(llm): retry on InternalServerError and Timeout; handle Gemini returning len(choices) < 1 (#7713)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
Xingyao Wang 2025-04-05 14:33:17 -07:00 committed by GitHub
parent b326433330
commit 0ab9d97f2d
4 changed files with 266 additions and 4 deletions

openhands/core/exceptions.py

@@ -88,6 +88,16 @@ class LLMResponseError(Exception):
        super().__init__(message)


# This exception should be retried
# Typically, after retry with a non-zero temperature, the LLM will return a response
class LLMNoResponseError(Exception):
    def __init__(
        self,
        message: str = 'LLM did not return a response. This is only seen in Gemini models so far.',
    ) -> None:
        super().__init__(message)


class UserCancelledError(Exception):
    def __init__(self, message: str = 'User cancelled the request') -> None:
        super().__init__(message)

openhands/llm/llm.py

@@ -23,6 +23,7 @@ from litellm.exceptions import (
from litellm.types.utils import CostPerToken, ModelResponse, Usage
from litellm.utils import create_pretrained_tokenizer
from openhands.core.exceptions import LLMNoResponseError
from openhands.core.logger import openhands_logger as logger
from openhands.core.message import Message
from openhands.llm.debug_mixin import DebugMixin
@@ -37,7 +38,12 @@ from openhands.llm.retry_mixin import RetryMixin
__all__ = ['LLM']
# tuple of exceptions to retry on
LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (RateLimitError,)
LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
    RateLimitError,
    litellm.Timeout,
    litellm.InternalServerError,
    LLMNoResponseError,
)
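The RetryMixin shown later in this diff turns this tuple into a tenacity decorator. A minimal sketch of that wiring, assuming tenacity's retry_if_exception_type and illustrative wait/stop settings (the helper name and numeric bounds are not the exact OpenHands parameters):

from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

def make_llm_retry(num_retries: int = 4):
    # retry_if_exception_type accepts a tuple of exception classes, so the
    # LLM_RETRY_EXCEPTIONS tuple above can be passed through unchanged.
    return retry(
        retry=retry_if_exception_type(LLM_RETRY_EXCEPTIONS),
        stop=stop_after_attempt(num_retries),
        wait=wait_exponential(multiplier=1, min=4, max=64),
        reraise=True,
    )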
# cache prompt supporting models
# remove this when gemini and deepseek are supported
@@ -63,6 +69,7 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [
    'o1-2024-12-17',
    'o3-mini-2025-01-31',
    'o3-mini',
    'gemini-2.5-pro',
]

REASONING_EFFORT_SUPPORTED_MODELS = [
@@ -267,8 +274,12 @@ class LLM(RetryMixin, DebugMixin):
            # if we mocked function calling, and we have tools, convert the response back to function calling format
            if mock_function_calling and mock_fncall_tools is not None:
                logger.debug(f'Response choices: {len(resp.choices)}')
                assert len(resp.choices) >= 1
                if len(resp.choices) < 1:
                    raise LLMNoResponseError(
                        'Response choices is less than 1 - This is only seen in Gemini models so far. Response: '
                        + str(resp)
                    )
                non_fncall_response_message = resp.choices[0].message
                fn_call_messages_with_response = (
                    convert_non_fncall_messages_to_fncall_messages(
@@ -282,6 +293,13 @@
                )
                resp.choices[0].message = fn_call_response_message

            # Check if resp has 'choices' key with at least one item
            if not resp.get('choices') or len(resp['choices']) < 1:
                raise LLMNoResponseError(
                    'Response choices is less than 1 - This is only seen in Gemini models so far. Response: '
                    + str(resp)
                )

            message_back: str = resp['choices'][0]['message']['content'] or ''
            tool_calls: list[ChatCompletionMessageToolCall] = resp['choices'][0][
                'message'

openhands/llm/retry_mixin.py

@@ -5,6 +5,7 @@ from tenacity import (
    wait_exponential,
)
from openhands.core.exceptions import LLMNoResponseError
from openhands.core.logger import openhands_logger as logger
from openhands.utils.tenacity_stop import stop_if_should_exit
@@ -35,6 +36,22 @@ class RetryMixin:
            if retry_listener:
                retry_listener(retry_state.attempt_number, num_retries)

            # Check if the exception is LLMNoResponseError
            exception = retry_state.outcome.exception()
            if isinstance(exception, LLMNoResponseError):
                if hasattr(retry_state, 'kwargs'):
                    # Only change temperature if it's zero or not set
                    current_temp = retry_state.kwargs.get('temperature', 0)
                    if current_temp == 0:
                        retry_state.kwargs['temperature'] = 1.0
                        logger.warning(
                            'LLMNoResponseError detected with temperature=0, setting temperature to 1.0 for next attempt.'
                        )
                    else:
                        logger.warning(
                            f'LLMNoResponseError detected with temperature={current_temp}, keeping original temperature'
                        )

        return retry(
            before_sleep=before_sleep,
            stop=stop_after_attempt(num_retries) | stop_if_should_exit(),
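The temperature bump above relies on a tenacity detail: retry_state.kwargs is the same dict object that gets unpacked into the wrapped call on the next attempt, so mutating it inside before_sleep changes the arguments of the retry. A self-contained sketch of that mechanism (the function names and values here are illustrative, not part of this change):

from tenacity import retry, stop_after_attempt

def bump_temperature(retry_state):
    # Mutating retry_state.kwargs changes what the next attempt receives,
    # because tenacity unpacks this same dict when it re-invokes the function.
    if retry_state.kwargs.get('temperature', 0) == 0:
        retry_state.kwargs['temperature'] = 1.0

@retry(stop=stop_after_attempt(2), before_sleep=bump_temperature, reraise=True)
def flaky_completion(*, temperature: float = 0) -> str:
    if temperature == 0:
        raise RuntimeError('no choices returned')
    return f'ok with temperature={temperature}'

print(flaky_completion(temperature=0))  # first attempt fails, retry succeeds with temperature=1.0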

tests/unit/test_llm.py

@@ -10,7 +10,7 @@ from litellm.exceptions import (
)
from openhands.core.config import LLMConfig
from openhands.core.exceptions import OperationCancelled
from openhands.core.exceptions import LLMNoResponseError, OperationCancelled
from openhands.core.message import Message, TextContent
from openhands.llm.llm import LLM
from openhands.llm.metrics import Metrics, TokenUsage
@@ -385,6 +385,223 @@ def test_completion_keyboard_interrupt_handler(mock_litellm_completion, default_
    _should_exit = False


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_zero_temp(
    mock_litellm_completion, default_config
):
    """
    Test that the retry decorator properly handles LLMNoResponseError by:
    1. First call to llm_completion uses temperature=0 and throws LLMNoResponseError
    2. Second call should have temperature=1.0 and return a successful response
    """

    # Define a side effect function that checks the temperature parameter
    # and returns different responses based on it
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)

        # First call with temperature=0 should raise LLMNoResponseError
        if temperature == 0:
            raise LLMNoResponseError('LLM did not return a response')
        else:
            return {
                'choices': [
                    {'message': {'content': f'Response with temperature={temperature}'}}
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call
    llm = LLM(config=default_config)
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0,  # Initial temperature is 0
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content'] == 'Response with temperature=1.0'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0

    # Check the temperature in the second call (should be 1.0)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 1.0


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_nonzero_temp(
    mock_litellm_completion, default_config
):
    """
    Test that the retry decorator works for LLMNoResponseError when initial temperature is non-zero,
    and keeps the original temperature on retry.

    This test verifies that when LLMNoResponseError is raised with a non-zero temperature:
    1. The retry mechanism is triggered
    2. The temperature remains unchanged (not set to 1.0)
    3. After all retries are exhausted, the exception is propagated
    """
    # Define a side effect function that always raises LLMNoResponseError
    mock_litellm_completion.side_effect = LLMNoResponseError(
        'LLM did not return a response'
    )

    # Create LLM instance and make a completion call with non-zero temperature
    llm = LLM(config=default_config)

    # We expect this to raise an exception after all retries are exhausted
    with pytest.raises(LLMNoResponseError):
        llm.completion(
            messages=[{'role': 'user', 'content': 'Hello!'}],
            stream=False,
            temperature=0.7,  # Initial temperature is non-zero
        )

    # Verify that litellm_completion was called multiple times (retries happened)
    # The default_config has num_retries=2, so it should be called 2 times
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0.7)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0.7

    # Check the temperature in the second call (should remain 0.7, not changed to 1.0)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 0.7


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_nonzero_temp_successful_retry(
    mock_litellm_completion, default_config
):
    """
    Test that the retry decorator works for LLMNoResponseError with non-zero temperature
    and successfully retries while preserving the original temperature.

    This test verifies that:
    1. First call to llm_completion with temperature=0.7 throws LLMNoResponseError
    2. Second call with the same temperature=0.7 returns a successful response
    """

    # Define a side effect function that raises LLMNoResponseError on first call
    # and returns a successful response on second call
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)
        if mock_litellm_completion.call_count == 1:
            # First call should raise LLMNoResponseError
            raise LLMNoResponseError('LLM did not return a response')
        else:
            # Second call should return a successful response
            return {
                'choices': [
                    {
                        'message': {
                            'content': f'Successful response with temperature={temperature}'
                        }
                    }
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call with non-zero temperature
    llm = LLM(config=default_config)
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0.7,  # Non-zero temperature
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content']
        == 'Successful response with temperature=0.7'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0.7)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0.7

    # Check the temperature in the second call (should still be 0.7)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 0.7


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_successful_retry(
    mock_litellm_completion, default_config
):
    """
    Test that the retry decorator works for LLMNoResponseError with zero temperature
    and successfully retries with temperature=1.0.

    This test verifies that:
    1. First call to llm_completion with temperature=0 throws LLMNoResponseError
    2. Second call with temperature=1.0 returns a successful response
    """

    # Define a side effect function that raises LLMNoResponseError on first call
    # and returns a successful response on second call
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)
        if mock_litellm_completion.call_count == 1:
            # First call should raise LLMNoResponseError
            raise LLMNoResponseError('LLM did not return a response')
        else:
            # Second call should return a successful response
            return {
                'choices': [
                    {
                        'message': {
                            'content': f'Successful response with temperature={temperature}'
                        }
                    }
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call with explicit temperature=0
    llm = LLM(config=default_config)
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0,  # Explicitly set temperature to 0
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content']
        == 'Successful response with temperature=1.0'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0

    # Check the temperature in the second call (should be 1.0)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 1.0


@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
    mock_response = {