From 411b63159f2d12726b70537c542e1ffa0d8f4fb9 Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Sun, 5 Jan 2025 08:13:18 +0900 Subject: [PATCH] fix: Use _send_action_server_request in send_action_for_execution (#5951) Co-authored-by: openhands --- .../action_execution_client.py | 3 +- .../runtime/impl/remote/remote_runtime.py | 29 ++++--- tests/unit/test_runtime_reboot.py | 87 +++++++++++++++++++ 3 files changed, 106 insertions(+), 13 deletions(-) create mode 100644 tests/unit/test_runtime_reboot.py diff --git a/openhands/runtime/impl/action_execution/action_execution_client.py b/openhands/runtime/impl/action_execution/action_execution_client.py index 80f71ffe8a..4c9d5d52fa 100644 --- a/openhands/runtime/impl/action_execution/action_execution_client.py +++ b/openhands/runtime/impl/action_execution/action_execution_client.py @@ -249,8 +249,7 @@ class ActionExecutionClient(Runtime): assert action.timeout is not None try: - with send_request( - self.session, + with self._send_action_server_request( 'POST', f'{self._get_action_execution_server_host()}/execute_action', json={'action': event_to_dict(action)}, diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index 010ec8b0e9..a8c23bbde5 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -329,9 +329,14 @@ class RemoteRuntime(ActionExecutionClient): elif pod_status in ('failed', 'unknown', 'crashloopbackoff'): # clean up the runtime self.close() - raise AgentRuntimeUnavailableError( - f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}. Pod Logs:\n{runtime_data.get("pod_logs", "N/A")}' - ) + if pod_status == 'crashloopbackoff': + raise AgentRuntimeUnavailableError( + 'Runtime crashed and is being restarted, potentially due to memory usage. Please try again.' + ) + else: + raise AgentRuntimeUnavailableError( + f'Runtime is unavailable (status: {pod_status}). Please try again.' + ) else: # Maybe this should be a hard failure, but passing through in case the API changes self.log('warning', f'Unknown pod status: {pod_status}') @@ -371,15 +376,17 @@ class RemoteRuntime(ActionExecutionClient): f'No response received within the timeout period for url: {url}', ) raise + except requests.HTTPError as e: - if e.response.status_code == 404: - raise AgentRuntimeNotFoundError( - 'Runtime unavailable: System resources may be exhausted due to running commands. This may be fixed by retrying.' - ) from e - elif e.response.status_code == 502: - raise AgentRuntimeDisconnectedError( - 'Runtime disconnected: System resources may be exhausted due to running commands. This may be fixed by retrying.' - ) from e + if e.response.status_code in (404, 502): + if e.response.status_code == 404: + raise AgentRuntimeDisconnectedError( + 'Runtime is not responding. This may be temporary, please try again.' + ) from e + else: # 502 + raise AgentRuntimeDisconnectedError( + 'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again.' + ) from e elif e.response.status_code == 503: self.log('warning', 'Runtime appears to be paused. Resuming...') self._resume_runtime() diff --git a/tests/unit/test_runtime_reboot.py b/tests/unit/test_runtime_reboot.py new file mode 100644 index 0000000000..e3ae31815a --- /dev/null +++ b/tests/unit/test_runtime_reboot.py @@ -0,0 +1,87 @@ +from unittest.mock import Mock + +import pytest +import requests + +from openhands.core.exceptions import ( + AgentRuntimeDisconnectedError, + AgentRuntimeTimeoutError, +) +from openhands.events.action import CmdRunAction +from openhands.runtime.base import Runtime + + +@pytest.fixture +def mock_session(): + return Mock() + + +@pytest.fixture +def runtime(mock_session): + runtime = Mock(spec=Runtime) + runtime.session = mock_session + runtime.send_action_for_execution = Mock() + return runtime + + +def test_runtime_timeout_error(runtime, mock_session): + # Create a command action + action = CmdRunAction(command='test command') + action.timeout = 120 + + # Mock the runtime to raise a timeout error + runtime.send_action_for_execution.side_effect = AgentRuntimeTimeoutError( + 'Runtime failed to return execute_action before the requested timeout of 120s' + ) + + # Verify that the error message indicates a timeout + with pytest.raises(AgentRuntimeTimeoutError) as exc_info: + runtime.send_action_for_execution(action) + + assert ( + str(exc_info.value) + == 'Runtime failed to return execute_action before the requested timeout of 120s' + ) + + +@pytest.mark.parametrize( + 'status_code,expected_message', + [ + (404, 'Runtime is not responding. This may be temporary, please try again.'), + ( + 502, + 'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again.', + ), + ], +) +def test_runtime_disconnected_error( + runtime, mock_session, status_code, expected_message +): + # Mock the request to return the specified status code + mock_response = Mock() + mock_response.status_code = status_code + mock_response.raise_for_status = Mock( + side_effect=requests.HTTPError(response=mock_response) + ) + mock_response.json = Mock( + return_value={ + 'observation': 'run', + 'content': 'test', + 'extras': {'command_id': 'test_id', 'command': 'test command'}, + } + ) + + # Mock the runtime to raise the error + runtime.send_action_for_execution.side_effect = AgentRuntimeDisconnectedError( + expected_message + ) + + # Create a command action + action = CmdRunAction(command='test command') + action.timeout = 120 + + # Verify that the error message is correct + with pytest.raises(AgentRuntimeDisconnectedError) as exc_info: + runtime.send_action_for_execution(action) + + assert str(exc_info.value) == expected_message