From 411b63159f2d12726b70537c542e1ffa0d8f4fb9 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Sun, 5 Jan 2025 08:13:18 +0900
Subject: [PATCH] fix: Use _send_action_server_request in
 send_action_for_execution (#5951)

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../action_execution_client.py                |  3 +-
 .../runtime/impl/remote/remote_runtime.py     | 29 ++++---
 tests/unit/test_runtime_reboot.py             | 87 +++++++++++++++++++
 3 files changed, 106 insertions(+), 13 deletions(-)
 create mode 100644 tests/unit/test_runtime_reboot.py

diff --git a/openhands/runtime/impl/action_execution/action_execution_client.py b/openhands/runtime/impl/action_execution/action_execution_client.py
index 80f71ffe8a..4c9d5d52fa 100644
--- a/openhands/runtime/impl/action_execution/action_execution_client.py
+++ b/openhands/runtime/impl/action_execution/action_execution_client.py
@@ -249,8 +249,7 @@ class ActionExecutionClient(Runtime):
             assert action.timeout is not None
 
             try:
-                with send_request(
-                    self.session,
+                with self._send_action_server_request(
                     'POST',
                     f'{self._get_action_execution_server_host()}/execute_action',
                     json={'action': event_to_dict(action)},
diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py
index 010ec8b0e9..a8c23bbde5 100644
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -329,9 +329,14 @@ class RemoteRuntime(ActionExecutionClient):
         elif pod_status in ('failed', 'unknown', 'crashloopbackoff'):
             # clean up the runtime
             self.close()
-            raise AgentRuntimeUnavailableError(
-                f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}. Pod Logs:\n{runtime_data.get("pod_logs", "N/A")}'
-            )
+            if pod_status == 'crashloopbackoff':
+                raise AgentRuntimeUnavailableError(
+                    'Runtime crashed and is being restarted, potentially due to memory usage. Please try again.'
+                )
+            else:
+                raise AgentRuntimeUnavailableError(
+                    f'Runtime is unavailable (status: {pod_status}). Please try again.'
+                )
         else:
             # Maybe this should be a hard failure, but passing through in case the API changes
             self.log('warning', f'Unknown pod status: {pod_status}')
@@ -371,15 +376,17 @@ class RemoteRuntime(ActionExecutionClient):
                 f'No response received within the timeout period for url: {url}',
             )
             raise
+
         except requests.HTTPError as e:
-            if e.response.status_code == 404:
-                raise AgentRuntimeNotFoundError(
-                    'Runtime unavailable: System resources may be exhausted due to running commands. This may be fixed by retrying.'
-                ) from e
-            elif e.response.status_code == 502:
-                raise AgentRuntimeDisconnectedError(
-                    'Runtime disconnected: System resources may be exhausted due to running commands. This may be fixed by retrying.'
-                ) from e
+            if e.response.status_code in (404, 502):
+                if e.response.status_code == 404:
+                    raise AgentRuntimeDisconnectedError(
+                        'Runtime is not responding. This may be temporary, please try again.'
+                    ) from e
+                else:  # 502
+                    raise AgentRuntimeDisconnectedError(
+                        'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again.'
+                    ) from e
             elif e.response.status_code == 503:
                 self.log('warning', 'Runtime appears to be paused. Resuming...')
                 self._resume_runtime()
diff --git a/tests/unit/test_runtime_reboot.py b/tests/unit/test_runtime_reboot.py
new file mode 100644
index 0000000000..e3ae31815a
--- /dev/null
+++ b/tests/unit/test_runtime_reboot.py
@@ -0,0 +1,87 @@
+from unittest.mock import Mock
+
+import pytest
+import requests
+
+from openhands.core.exceptions import (
+    AgentRuntimeDisconnectedError,
+    AgentRuntimeTimeoutError,
+)
+from openhands.events.action import CmdRunAction
+from openhands.runtime.base import Runtime
+
+
+@pytest.fixture
+def mock_session():
+    return Mock()
+
+
+@pytest.fixture
+def runtime(mock_session):
+    runtime = Mock(spec=Runtime)
+    runtime.session = mock_session
+    runtime.send_action_for_execution = Mock()
+    return runtime
+
+
+def test_runtime_timeout_error(runtime, mock_session):
+    # Create a command action
+    action = CmdRunAction(command='test command')
+    action.timeout = 120
+
+    # Mock the runtime to raise a timeout error
+    runtime.send_action_for_execution.side_effect = AgentRuntimeTimeoutError(
+        'Runtime failed to return execute_action before the requested timeout of 120s'
+    )
+
+    # Verify that the error message indicates a timeout
+    with pytest.raises(AgentRuntimeTimeoutError) as exc_info:
+        runtime.send_action_for_execution(action)
+
+    assert (
+        str(exc_info.value)
+        == 'Runtime failed to return execute_action before the requested timeout of 120s'
+    )
+
+
+@pytest.mark.parametrize(
+    'status_code,expected_message',
+    [
+        (404, 'Runtime is not responding. This may be temporary, please try again.'),
+        (
+            502,
+            'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again.',
+        ),
+    ],
+)
+def test_runtime_disconnected_error(
+    runtime, mock_session, status_code, expected_message
+):
+    # Mock the request to return the specified status code
+    mock_response = Mock()
+    mock_response.status_code = status_code
+    mock_response.raise_for_status = Mock(
+        side_effect=requests.HTTPError(response=mock_response)
+    )
+    mock_response.json = Mock(
+        return_value={
+            'observation': 'run',
+            'content': 'test',
+            'extras': {'command_id': 'test_id', 'command': 'test command'},
+        }
+    )
+
+    # Mock the runtime to raise the error
+    runtime.send_action_for_execution.side_effect = AgentRuntimeDisconnectedError(
+        expected_message
+    )
+
+    # Create a command action
+    action = CmdRunAction(command='test command')
+    action.timeout = 120
+
+    # Verify that the error message is correct
+    with pytest.raises(AgentRuntimeDisconnectedError) as exc_info:
+        runtime.send_action_for_execution(action)
+
+    assert str(exc_info.value) == expected_message