Mirror of https://github.com/OpenHands/OpenHands.git
fix(llm): retry on InternalServerError and Timeout; handle Gemini returning len(choices) < 1 (#7713)
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
parent b326433330
commit 0ab9d97f2d
@@ -88,6 +88,16 @@ class LLMResponseError(Exception):
         super().__init__(message)
 
 
+# This exception should be retried
+# Typically, after retry with a non-zero temperature, the LLM will return a response
+class LLMNoResponseError(Exception):
+    def __init__(
+        self,
+        message: str = 'LLM did not return a response. This is only seen in Gemini models so far.',
+    ) -> None:
+        super().__init__(message)
+
+
 class UserCancelledError(Exception):
     def __init__(self, message: str = 'User cancelled the request') -> None:
         super().__init__(message)
@@ -23,6 +23,7 @@ from litellm.exceptions import (
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
 from litellm.utils import create_pretrained_tokenizer
 
+from openhands.core.exceptions import LLMNoResponseError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message
 from openhands.llm.debug_mixin import DebugMixin
@@ -37,7 +38,12 @@ from openhands.llm.retry_mixin import RetryMixin
 __all__ = ['LLM']
 
 # tuple of exceptions to retry on
-LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (RateLimitError,)
+LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
+    RateLimitError,
+    litellm.Timeout,
+    litellm.InternalServerError,
+    LLMNoResponseError,
+)
 
 # cache prompt supporting models
 # remove this when we gemini and deepseek are supported
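Note on how this tuple is consumed: the RetryMixin (see its hunks further down) builds a tenacity retry decorator, and a tuple such as LLM_RETRY_EXCEPTIONS is exactly the kind of argument tenacity's retry_if_exception_type accepts. The following is a minimal sketch under that assumption, not the project's actual wiring; RETRYABLE and call_llm are illustrative names, and the built-in exception types stand in for the litellm ones listed above.

from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

# Stand-ins for the exception types in LLM_RETRY_EXCEPTIONS.
RETRYABLE: tuple[type[Exception], ...] = (TimeoutError, ConnectionError)

@retry(
    retry=retry_if_exception_type(RETRYABLE),  # only these exception types trigger a retry
    stop=stop_after_attempt(3),                # give up after three attempts
    wait=wait_exponential(multiplier=1, min=1, max=10),
    reraise=True,                              # surface the last exception once retries run out
)
def call_llm() -> str:
    # A flaky call; any exception type outside RETRYABLE propagates immediately.
    raise TimeoutError('provider timed out')

Adding an exception class to the tuple is therefore enough to make it retryable, which is what this commit does for litellm.Timeout, litellm.InternalServerError, and LLMNoResponseError.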
@@ -63,6 +69,7 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [
     'o1-2024-12-17',
     'o3-mini-2025-01-31',
     'o3-mini',
+    'gemini-2.5-pro',
 ]
 
 REASONING_EFFORT_SUPPORTED_MODELS = [
@@ -267,8 +274,12 @@ class LLM(RetryMixin, DebugMixin):
             # if we mocked function calling, and we have tools, convert the response back to function calling format
             if mock_function_calling and mock_fncall_tools is not None:
                 logger.debug(f'Response choices: {len(resp.choices)}')
-                assert len(resp.choices) >= 1
+                if len(resp.choices) < 1:
+                    raise LLMNoResponseError(
+                        'Response choices is less than 1 - This is only seen in Gemini models so far. Response: '
+                        + str(resp)
+                    )
 
                 non_fncall_response_message = resp.choices[0].message
                 fn_call_messages_with_response = (
                     convert_non_fncall_messages_to_fncall_messages(
@@ -282,6 +293,13 @@ class LLM(RetryMixin, DebugMixin):
                 )
                 resp.choices[0].message = fn_call_response_message
 
+            # Check if resp has 'choices' key with at least one item
+            if not resp.get('choices') or len(resp['choices']) < 1:
+                raise LLMNoResponseError(
+                    'Response choices is less than 1 - This is only seen in Gemini models so far. Response: '
+                    + str(resp)
+                )
+
             message_back: str = resp['choices'][0]['message']['content'] or ''
             tool_calls: list[ChatCompletionMessageToolCall] = resp['choices'][0][
                 'message'
@@ -5,6 +5,7 @@ from tenacity import (
     wait_exponential,
 )
 
+from openhands.core.exceptions import LLMNoResponseError
 from openhands.core.logger import openhands_logger as logger
 from openhands.utils.tenacity_stop import stop_if_should_exit
 
@@ -35,6 +36,22 @@ class RetryMixin:
             if retry_listener:
                 retry_listener(retry_state.attempt_number, num_retries)
 
+            # Check if the exception is LLMNoResponseError
+            exception = retry_state.outcome.exception()
+            if isinstance(exception, LLMNoResponseError):
+                if hasattr(retry_state, 'kwargs'):
+                    # Only change temperature if it's zero or not set
+                    current_temp = retry_state.kwargs.get('temperature', 0)
+                    if current_temp == 0:
+                        retry_state.kwargs['temperature'] = 1.0
+                        logger.warning(
+                            'LLMNoResponseError detected with temperature=0, setting temperature to 1.0 for next attempt.'
+                        )
+                    else:
+                        logger.warning(
+                            f'LLMNoResponseError detected with temperature={current_temp}, keeping original temperature'
+                        )
+
         return retry(
             before_sleep=before_sleep,
             stop=stop_after_attempt(num_retries) | stop_if_should_exit(),
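The before_sleep hook above raises the sampling temperature to 1.0 by mutating retry_state.kwargs. This only has an effect because tenacity hands the same kwargs dict to every attempt of the wrapped call, so the next attempt sees the updated value. A minimal, self-contained sketch of that mechanism under that assumption (NoResponse, bump_temperature, and fake_completion are hypothetical stand-ins, not OpenHands code):

from tenacity import Retrying, retry_if_exception_type, stop_after_attempt, wait_fixed


class NoResponse(Exception):  # stand-in for LLMNoResponseError
    pass


def bump_temperature(retry_state):
    # Runs between attempts; rewrites the kwargs the next attempt will receive.
    if isinstance(retry_state.outcome.exception(), NoResponse):
        if retry_state.kwargs.get('temperature', 0) == 0:
            retry_state.kwargs['temperature'] = 1.0


def fake_completion(temperature=0):
    if temperature == 0:
        raise NoResponse('empty choices')
    return f'ok at temperature={temperature}'


retryer = Retrying(
    retry=retry_if_exception_type(NoResponse),
    stop=stop_after_attempt(2),
    wait=wait_fixed(0),
    before_sleep=bump_temperature,
    reraise=True,
)
print(retryer(fake_completion, temperature=0))  # -> 'ok at temperature=1.0'

With stop_after_attempt(2), the first attempt raises, bump_temperature rewrites the shared kwargs, and the second attempt succeeds at temperature=1.0; a call that starts with a non-zero temperature is retried unchanged.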
@@ -10,7 +10,7 @@ from litellm.exceptions import (
 )
 
 from openhands.core.config import LLMConfig
-from openhands.core.exceptions import OperationCancelled
+from openhands.core.exceptions import LLMNoResponseError, OperationCancelled
 from openhands.core.message import Message, TextContent
 from openhands.llm.llm import LLM
 from openhands.llm.metrics import Metrics, TokenUsage
@@ -385,6 +385,223 @@ def test_completion_keyboard_interrupt_handler(mock_litellm_completion, default_
     _should_exit = False
 
 
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_retry_with_llm_no_response_error_zero_temp(
+    mock_litellm_completion, default_config
+):
+    """
+    Test that the retry decorator properly handles LLMNoResponseError by:
+    1. First call to llm_completion uses temperature=0 and throws LLMNoResponseError
+    2. Second call should have temperature=1.0 and return a successful response
+    """
+
+    # Define a side effect function that checks the temperature parameter
+    # and returns different responses based on it
+    def side_effect(*args, **kwargs):
+        temperature = kwargs.get('temperature', 0)
+
+        # First call with temperature=0 should raise LLMNoResponseError
+        if temperature == 0:
+            raise LLMNoResponseError('LLM did not return a response')
+        else:
+            return {
+                'choices': [
+                    {'message': {'content': f'Response with temperature={temperature}'}}
+                ]
+            }
+
+    mock_litellm_completion.side_effect = side_effect
+
+    # Create LLM instance and make a completion call
+    llm = LLM(config=default_config)
+    response = llm.completion(
+        messages=[{'role': 'user', 'content': 'Hello!'}],
+        stream=False,
+        temperature=0,  # Initial temperature is 0
+    )
+
+    # Verify the response after retry
+    assert (
+        response['choices'][0]['message']['content'] == 'Response with temperature=1.0'
+    )
+
+    # Verify that litellm_completion was called twice
+    assert mock_litellm_completion.call_count == 2
+
+    # Check the temperature in the first call (should be 0)
+    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
+    assert first_call_kwargs.get('temperature') == 0
+
+    # Check the temperature in the second call (should be 1.0)
+    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
+    assert second_call_kwargs.get('temperature') == 1.0
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_retry_with_llm_no_response_error_nonzero_temp(
+    mock_litellm_completion, default_config
+):
+    """
+    Test that the retry decorator works for LLMNoResponseError when the initial temperature is non-zero,
+    and keeps the original temperature on retry.
+
+    This test verifies that when LLMNoResponseError is raised with a non-zero temperature:
+    1. The retry mechanism is triggered
+    2. The temperature remains unchanged (not changed to 1.0)
+    3. After all retries are exhausted, the exception is propagated
+    """
+    # Set a side effect that always raises LLMNoResponseError
+    mock_litellm_completion.side_effect = LLMNoResponseError(
+        'LLM did not return a response'
+    )
+
+    # Create LLM instance and make a completion call with non-zero temperature
+    llm = LLM(config=default_config)
+
+    # We expect this to raise an exception after all retries are exhausted
+    with pytest.raises(LLMNoResponseError):
+        llm.completion(
+            messages=[{'role': 'user', 'content': 'Hello!'}],
+            stream=False,
+            temperature=0.7,  # Initial temperature is non-zero
+        )
+
+    # Verify that litellm_completion was called multiple times (retries happened)
+    # The default_config has num_retries=2, so it should be called 2 times
+    assert mock_litellm_completion.call_count == 2
+
+    # Check the temperature in the first call (should be 0.7)
+    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
+    assert first_call_kwargs.get('temperature') == 0.7
+
+    # Check the temperature in the second call (should remain 0.7, not changed to 1.0)
+    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
+    assert second_call_kwargs.get('temperature') == 0.7
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_retry_with_llm_no_response_error_nonzero_temp_successful_retry(
+    mock_litellm_completion, default_config
+):
+    """
+    Test that the retry decorator works for LLMNoResponseError with non-zero temperature
+    and successfully retries while preserving the original temperature.
+
+    This test verifies that:
+    1. First call to llm_completion with temperature=0.7 throws LLMNoResponseError
+    2. Second call with the same temperature=0.7 returns a successful response
+    """
+
+    # Define a side effect function that raises LLMNoResponseError on first call
+    # and returns a successful response on second call
+    def side_effect(*args, **kwargs):
+        temperature = kwargs.get('temperature', 0)
+
+        if mock_litellm_completion.call_count == 1:
+            # First call should raise LLMNoResponseError
+            raise LLMNoResponseError('LLM did not return a response')
+        else:
+            # Second call should return a successful response
+            return {
+                'choices': [
+                    {
+                        'message': {
+                            'content': f'Successful response with temperature={temperature}'
+                        }
+                    }
+                ]
+            }
+
+    mock_litellm_completion.side_effect = side_effect
+
+    # Create LLM instance and make a completion call with non-zero temperature
+    llm = LLM(config=default_config)
+    response = llm.completion(
+        messages=[{'role': 'user', 'content': 'Hello!'}],
+        stream=False,
+        temperature=0.7,  # Non-zero temperature
+    )
+
+    # Verify the response after retry
+    assert (
+        response['choices'][0]['message']['content']
+        == 'Successful response with temperature=0.7'
+    )
+
+    # Verify that litellm_completion was called twice
+    assert mock_litellm_completion.call_count == 2
+
+    # Check the temperature in the first call (should be 0.7)
+    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
+    assert first_call_kwargs.get('temperature') == 0.7
+
+    # Check the temperature in the second call (should still be 0.7)
+    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
+    assert second_call_kwargs.get('temperature') == 0.7
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_retry_with_llm_no_response_error_successful_retry(
+    mock_litellm_completion, default_config
+):
+    """
+    Test that the retry decorator works for LLMNoResponseError with zero temperature
+    and successfully retries with temperature=1.0.
+
+    This test verifies that:
+    1. First call to llm_completion with temperature=0 throws LLMNoResponseError
+    2. Second call with temperature=1.0 returns a successful response
+    """
+
+    # Define a side effect function that raises LLMNoResponseError on first call
+    # and returns a successful response on second call
+    def side_effect(*args, **kwargs):
+        temperature = kwargs.get('temperature', 0)
+
+        if mock_litellm_completion.call_count == 1:
+            # First call should raise LLMNoResponseError
+            raise LLMNoResponseError('LLM did not return a response')
+        else:
+            # Second call should return a successful response
+            return {
+                'choices': [
+                    {
+                        'message': {
+                            'content': f'Successful response with temperature={temperature}'
+                        }
+                    }
+                ]
+            }
+
+    mock_litellm_completion.side_effect = side_effect
+
+    # Create LLM instance and make a completion call with explicit temperature=0
+    llm = LLM(config=default_config)
+    response = llm.completion(
+        messages=[{'role': 'user', 'content': 'Hello!'}],
+        stream=False,
+        temperature=0,  # Explicitly set temperature to 0
+    )
+
+    # Verify the response after retry
+    assert (
+        response['choices'][0]['message']['content']
+        == 'Successful response with temperature=1.0'
+    )
+
+    # Verify that litellm_completion was called twice
+    assert mock_litellm_completion.call_count == 2
+
+    # Check the temperature in the first call (should be 0)
+    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
+    assert first_call_kwargs.get('temperature') == 0
+
+    # Check the temperature in the second call (should be 1.0)
+    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
+    assert second_call_kwargs.get('temperature') == 1.0
+
+
 @patch('openhands.llm.llm.litellm_completion')
 def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
     mock_response = {