From 21af52560af7db52fe3169f5dc39b78357820869 Mon Sep 17 00:00:00 2001
From: Robert Brennan <accounts@rbren.io>
Date: Mon, 2 Jun 2025 13:12:24 -0400
Subject: [PATCH] Add detailed logging to remote runtime resume process (#8819)

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../runtime/impl/remote/remote_runtime.py     | 108 +++++++++++++++---
 1 file changed, 90 insertions(+), 18 deletions(-)

diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py
index c3c20d58cc..0164118816 100644
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -100,7 +100,6 @@ class RemoteRuntime(ActionExecutionClient):
         self.available_hosts: dict[str, int] = {}
 
     def log(self, level: str, message: str, exc_info: bool | None = None) -> None:
-        message = f'[runtime session_id={self.sid} runtime_id={self.runtime_id or "unknown"}] {message}'
         getattr(logger, level)(
             message,
             stacklevel=2,
@@ -128,14 +127,17 @@ class RemoteRuntime(ActionExecutionClient):
         self._runtime_initialized = True
 
     def _start_or_attach_to_runtime(self) -> None:
+        self.log('info', 'Starting or attaching to runtime')
         existing_runtime = self._check_existing_runtime()
         if existing_runtime:
-            self.log('debug', f'Using existing runtime with ID: {self.runtime_id}')
+            self.log('info', f'Using existing runtime with ID: {self.runtime_id}')
         elif self.attach_to_existing:
+            self.log('info', f'Failed to find existing runtime for SID: {self.sid}')
             raise AgentRuntimeNotFoundError(
                 f'Could not find existing runtime for SID: {self.sid}'
             )
         else:
+            self.log('info', 'No existing runtime found, starting a new one')
             self.send_status_message('STATUS$STARTING_CONTAINER')
             if self.config.sandbox.runtime_container_image is None:
                 self.log(
@@ -165,6 +167,7 @@ class RemoteRuntime(ActionExecutionClient):
         self.send_status_message(' ')
 
     def _check_existing_runtime(self) -> bool:
+        self.log('info', f'Checking for existing runtime with session ID: {self.sid}')
         try:
             response = self._send_runtime_api_request(
                 'GET',
@@ -172,12 +175,16 @@ class RemoteRuntime(ActionExecutionClient):
             )
             data = response.json()
             status = data.get('status')
+            self.log('info', f'Found runtime with status: {status}')
             if status == 'running' or status == 'paused':
                 self._parse_runtime_response(response)
         except httpx.HTTPError as e:
             if e.response.status_code == 404:
+                self.log(
+                    'info', f'No existing runtime found for session ID: {self.sid}'
+                )
                 return False
-            self.log('debug', f'Error while looking for remote runtime: {e}')
+            self.log('error', f'Error while looking for remote runtime: {e}')
             raise
         except json.decoder.JSONDecodeError as e:
             self.log(
@@ -187,14 +194,25 @@ class RemoteRuntime(ActionExecutionClient):
             raise
 
         if status == 'running':
+            self.log('info', 'Found existing runtime in running state')
             return True
         elif status == 'stopped':
-            self.log('debug', 'Found existing remote runtime, but it is stopped')
+            self.log('info', 'Found existing runtime, but it is stopped')
             return False
         elif status == 'paused':
-            self.log('debug', 'Found existing remote runtime, but it is paused')
-            self._resume_runtime()
-            return True
+            self.log(
+                'info', 'Found existing runtime in paused state, attempting to resume'
+            )
+            try:
+                self._resume_runtime()
+                self.log('info', 'Successfully resumed paused runtime')
+                return True
+            except Exception as e:
+                self.log(
+                    'error', f'Failed to resume paused runtime: {e}', exc_info=True
+                )
+                # Return false to indicate we couldn't use the existing runtime
+                return False
         else:
             self.log('error', f'Invalid response from runtime API: {data}')
             return False
@@ -287,16 +305,48 @@ class RemoteRuntime(ActionExecutionClient):
         3. Poll for the runtime to be ready
         4. Update env vars
         """
+        self.log('info', f'Attempting to resume runtime with ID: {self.runtime_id}')
         self.send_status_message('STATUS$STARTING_RUNTIME')
-        self._send_runtime_api_request(
-            'POST',
-            f'{self.config.sandbox.remote_runtime_api_url}/resume',
-            json={'runtime_id': self.runtime_id},
+        try:
+            response = self._send_runtime_api_request(
+                'POST',
+                f'{self.config.sandbox.remote_runtime_api_url}/resume',
+                json={'runtime_id': self.runtime_id},
+            )
+            self.log(
+                'info',
+                f'Resume API call successful with status code: {response.status_code}',
+            )
+        except Exception as e:
+            self.log('error', f'Failed to call /resume API: {e}', exc_info=True)
+            raise
+
+        self.log(
+            'info', 'Runtime resume API call completed, waiting for it to be alive...'
         )
-        self.log('info', 'Runtime resumed, waiting for it to be alive...')
-        self._wait_until_alive()
-        self.setup_initial_env()
-        self.log('info', 'Runtime resumed and alive.')
+        try:
+            self._wait_until_alive()
+            self.log('info', 'Runtime is now alive after resume')
+        except Exception as e:
+            self.log(
+                'error',
+                f'Runtime failed to become alive after resume: {e}',
+                exc_info=True,
+            )
+            raise
+
+        try:
+            self.setup_initial_env()
+            self.log('info', 'Successfully set up initial environment after resume')
+        except Exception as e:
+            self.log(
+                'error',
+                f'Failed to set up initial environment after resume: {e}',
+                exc_info=True,
+            )
+            raise
+
+        self.log('info', 'Runtime successfully resumed and alive.')
 
     def _parse_runtime_response(self, response: httpx.Response) -> None:
         start_response = response.json()
@@ -491,10 +541,32 @@ class RemoteRuntime(ActionExecutionClient):
                     ) from e
             elif hasattr(e, 'response') and e.response.status_code == 503:
                 if self.config.sandbox.keep_runtime_alive:
-                    self.log('warning', 'Runtime appears to be paused. Resuming...')
-                    self._resume_runtime()
-                    return super()._send_action_server_request(method, url, **kwargs)
+                    self.log(
+                        'info',
+                        f'Runtime appears to be paused (503 response). Runtime ID: {self.runtime_id}, URL: {url}',
+                    )
+                    try:
+                        self._resume_runtime()
+                        self.log(
+                            'info', 'Successfully resumed runtime after 503 response'
+                        )
+                        return super()._send_action_server_request(
+                            method, url, **kwargs
+                        )
+                    except Exception as resume_error:
+                        self.log(
+                            'error',
+                            f'Failed to resume runtime after 503 response: {resume_error}',
+                            exc_info=True,
+                        )
+                        raise AgentRuntimeDisconnectedError(
+                            f'Runtime is paused and could not be resumed. Original error: {e}, Resume error: {resume_error}'
+                        ) from resume_error
                 else:
+                    self.log(
+                        'info',
+                        'Runtime appears to be paused (503 response) but keep_runtime_alive is False',
+                    )
                     raise AgentRuntimeDisconnectedError(
                         f'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again. Original error: {e}'
                     ) from e