fix(llm): retry on InternalServerError and Timeout; handle Gemini returning len(choices) < 1 (#7713)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
Xingyao Wang 2025-04-05 14:33:17 -07:00 committed by GitHub
parent b326433330
commit 0ab9d97f2d
4 changed files with 266 additions and 4 deletions

openhands/core/exceptions.py

@@ -88,6 +88,16 @@ class LLMResponseError(Exception):
        super().__init__(message)


# This exception should be retried
# Typically, after retry with a non-zero temperature, the LLM will return a response
class LLMNoResponseError(Exception):
    def __init__(
        self,
        message: str = 'LLM did not return a response. This is only seen in Gemini models so far.',
    ) -> None:
        super().__init__(message)


class UserCancelledError(Exception):
    def __init__(self, message: str = 'User cancelled the request') -> None:
        super().__init__(message)

openhands/llm/llm.py

@@ -23,6 +23,7 @@ from litellm.exceptions import (
from litellm.types.utils import CostPerToken, ModelResponse, Usage
from litellm.utils import create_pretrained_tokenizer
from openhands.core.exceptions import LLMNoResponseError
from openhands.core.logger import openhands_logger as logger
from openhands.core.message import Message
from openhands.llm.debug_mixin import DebugMixin
@@ -37,7 +38,12 @@ from openhands.llm.retry_mixin import RetryMixin
__all__ = ['LLM']
# tuple of exceptions to retry on
LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (RateLimitError,)
LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
    RateLimitError,
    litellm.Timeout,
    litellm.InternalServerError,
    LLMNoResponseError,
)
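The RetryMixin shown later in this diff turns this tuple into a tenacity decorator. A minimal sketch of that wiring, assuming tenacity's retry_if_exception_type and illustrative wait/stop settings (the helper name and numeric bounds are not the exact OpenHands parameters):

from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

def make_llm_retry(num_retries: int = 4):
    # retry_if_exception_type accepts a tuple of exception classes, so the
    # LLM_RETRY_EXCEPTIONS tuple above can be passed through unchanged.
    return retry(
        retry=retry_if_exception_type(LLM_RETRY_EXCEPTIONS),
        stop=stop_after_attempt(num_retries),
        wait=wait_exponential(multiplier=1, min=4, max=64),
        reraise=True,
    )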
# cache prompt supporting models
# remove this when gemini and deepseek are supported
@@ -63,6 +69,7 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [
    'o1-2024-12-17',
    'o3-mini-2025-01-31',
    'o3-mini',
    'gemini-2.5-pro',
]

REASONING_EFFORT_SUPPORTED_MODELS = [
@@ -267,8 +274,12 @@ class LLM(RetryMixin, DebugMixin):
            # if we mocked function calling, and we have tools, convert the response back to function calling format
            if mock_function_calling and mock_fncall_tools is not None:
                logger.debug(f'Response choices: {len(resp.choices)}')
                assert len(resp.choices) >= 1
                if len(resp.choices) < 1:
                    raise LLMNoResponseError(
                        'Response choices is less than 1 - This is only seen in Gemini models so far. Response: '
                        + str(resp)
                    )
                non_fncall_response_message = resp.choices[0].message
                fn_call_messages_with_response = (
                    convert_non_fncall_messages_to_fncall_messages(
@@ -282,6 +293,13 @@
                )
                resp.choices[0].message = fn_call_response_message

            # Check if resp has 'choices' key with at least one item
            if not resp.get('choices') or len(resp['choices']) < 1:
                raise LLMNoResponseError(
                    'Response choices is less than 1 - This is only seen in Gemini models so far. Response: '
                    + str(resp)
                )

            message_back: str = resp['choices'][0]['message']['content'] or ''
            tool_calls: list[ChatCompletionMessageToolCall] = resp['choices'][0][
                'message'

openhands/llm/retry_mixin.py

@@ -5,6 +5,7 @@ from tenacity import (
    wait_exponential,
)
from openhands.core.exceptions import LLMNoResponseError
from openhands.core.logger import openhands_logger as logger
from openhands.utils.tenacity_stop import stop_if_should_exit
@@ -35,6 +36,22 @@ class RetryMixin:
            if retry_listener:
                retry_listener(retry_state.attempt_number, num_retries)

            # Check if the exception is LLMNoResponseError
            exception = retry_state.outcome.exception()
            if isinstance(exception, LLMNoResponseError):
                if hasattr(retry_state, 'kwargs'):
                    # Only change temperature if it's zero or not set
                    current_temp = retry_state.kwargs.get('temperature', 0)
                    if current_temp == 0:
                        retry_state.kwargs['temperature'] = 1.0
                        logger.warning(
                            'LLMNoResponseError detected with temperature=0, setting temperature to 1.0 for next attempt.'
                        )
                    else:
                        logger.warning(
                            f'LLMNoResponseError detected with temperature={current_temp}, keeping original temperature'
                        )

        return retry(
            before_sleep=before_sleep,
            stop=stop_after_attempt(num_retries) | stop_if_should_exit(),
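The temperature bump above relies on a tenacity detail: retry_state.kwargs is the same dict object that gets unpacked into the wrapped call on the next attempt, so mutating it inside before_sleep changes the arguments of the retry. A self-contained sketch of that mechanism (the function names and values here are illustrative, not part of this change):

from tenacity import retry, stop_after_attempt

def bump_temperature(retry_state):
    # Mutating retry_state.kwargs changes what the next attempt receives,
    # because tenacity unpacks this same dict when it re-invokes the function.
    if retry_state.kwargs.get('temperature', 0) == 0:
        retry_state.kwargs['temperature'] = 1.0

@retry(stop=stop_after_attempt(2), before_sleep=bump_temperature, reraise=True)
def flaky_completion(*, temperature: float = 0) -> str:
    if temperature == 0:
        raise RuntimeError('no choices returned')
    return f'ok with temperature={temperature}'

print(flaky_completion(temperature=0))  # first attempt fails, retry succeeds with temperature=1.0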

tests/unit/test_llm.py

@@ -10,7 +10,7 @@ from litellm.exceptions import (
)
from openhands.core.config import LLMConfig
from openhands.core.exceptions import OperationCancelled
from openhands.core.exceptions import LLMNoResponseError, OperationCancelled
from openhands.core.message import Message, TextContent
from openhands.llm.llm import LLM
from openhands.llm.metrics import Metrics, TokenUsage
@@ -385,6 +385,223 @@ def test_completion_keyboard_interrupt_handler(mock_litellm_completion, default_
    _should_exit = False


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_zero_temp(
    mock_litellm_completion, default_config
):
    """
    Test that the retry decorator properly handles LLMNoResponseError by:
    1. First call to llm_completion uses temperature=0 and throws LLMNoResponseError
    2. Second call should have temperature=1.0 and return a successful response
    """

    # Define a side effect function that checks the temperature parameter
    # and returns different responses based on it
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)

        # First call with temperature=0 should raise LLMNoResponseError
        if temperature == 0:
            raise LLMNoResponseError('LLM did not return a response')
        else:
            return {
                'choices': [
                    {'message': {'content': f'Response with temperature={temperature}'}}
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call
    llm = LLM(config=default_config)
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0,  # Initial temperature is 0
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content'] == 'Response with temperature=1.0'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0

    # Check the temperature in the second call (should be 1.0)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 1.0


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_nonzero_temp(
    mock_litellm_completion, default_config
):
    """
    Test that the retry decorator works for LLMNoResponseError when initial temperature is non-zero,
    and keeps the original temperature on retry.

    This test verifies that when LLMNoResponseError is raised with a non-zero temperature:
    1. The retry mechanism is triggered
    2. The temperature remains unchanged (not set to 1.0)
    3. After all retries are exhausted, the exception is propagated
    """
    # Define a side effect function that always raises LLMNoResponseError
    mock_litellm_completion.side_effect = LLMNoResponseError(
        'LLM did not return a response'
    )

    # Create LLM instance and make a completion call with non-zero temperature
    llm = LLM(config=default_config)

    # We expect this to raise an exception after all retries are exhausted
    with pytest.raises(LLMNoResponseError):
        llm.completion(
            messages=[{'role': 'user', 'content': 'Hello!'}],
            stream=False,
            temperature=0.7,  # Initial temperature is non-zero
        )

    # Verify that litellm_completion was called multiple times (retries happened)
    # The default_config has num_retries=2, so it should be called 2 times
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0.7)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0.7

    # Check the temperature in the second call (should remain 0.7, not changed to 1.0)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 0.7


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_nonzero_temp_successful_retry(
    mock_litellm_completion, default_config
):
    """
    Test that the retry decorator works for LLMNoResponseError with non-zero temperature
    and successfully retries while preserving the original temperature.

    This test verifies that:
    1. First call to llm_completion with temperature=0.7 throws LLMNoResponseError
    2. Second call with the same temperature=0.7 returns a successful response
    """

    # Define a side effect function that raises LLMNoResponseError on first call
    # and returns a successful response on second call
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)
        if mock_litellm_completion.call_count == 1:
            # First call should raise LLMNoResponseError
            raise LLMNoResponseError('LLM did not return a response')
        else:
            # Second call should return a successful response
            return {
                'choices': [
                    {
                        'message': {
                            'content': f'Successful response with temperature={temperature}'
                        }
                    }
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call with non-zero temperature
    llm = LLM(config=default_config)
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0.7,  # Non-zero temperature
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content']
        == 'Successful response with temperature=0.7'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0.7)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0.7

    # Check the temperature in the second call (should still be 0.7)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 0.7


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_successful_retry(
    mock_litellm_completion, default_config
):
    """
    Test that the retry decorator works for LLMNoResponseError with zero temperature
    and successfully retries with temperature=1.0.

    This test verifies that:
    1. First call to llm_completion with temperature=0 throws LLMNoResponseError
    2. Second call with temperature=1.0 returns a successful response
    """

    # Define a side effect function that raises LLMNoResponseError on first call
    # and returns a successful response on second call
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)
        if mock_litellm_completion.call_count == 1:
            # First call should raise LLMNoResponseError
            raise LLMNoResponseError('LLM did not return a response')
        else:
            # Second call should return a successful response
            return {
                'choices': [
                    {
                        'message': {
                            'content': f'Successful response with temperature={temperature}'
                        }
                    }
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call with explicit temperature=0
    llm = LLM(config=default_config)
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0,  # Explicitly set temperature to 0
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content']
        == 'Successful response with temperature=1.0'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0

    # Check the temperature in the second call (should be 1.0)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 1.0


@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
    mock_response = {