From 21af52560af7db52fe3169f5dc39b78357820869 Mon Sep 17 00:00:00 2001 From: Robert Brennan Date: Mon, 2 Jun 2025 13:12:24 -0400 Subject: [PATCH] Add detailed logging to remote runtime resume process (#8819) Co-authored-by: openhands --- .../runtime/impl/remote/remote_runtime.py | 108 +++++++++++++++--- 1 file changed, 90 insertions(+), 18 deletions(-) diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index c3c20d58cc..0164118816 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -100,7 +100,6 @@ class RemoteRuntime(ActionExecutionClient): self.available_hosts: dict[str, int] = {} def log(self, level: str, message: str, exc_info: bool | None = None) -> None: - message = f'[runtime session_id={self.sid} runtime_id={self.runtime_id or "unknown"}] {message}' getattr(logger, level)( message, stacklevel=2, @@ -128,14 +127,17 @@ class RemoteRuntime(ActionExecutionClient): self._runtime_initialized = True def _start_or_attach_to_runtime(self) -> None: + self.log('info', 'Starting or attaching to runtime') existing_runtime = self._check_existing_runtime() if existing_runtime: - self.log('debug', f'Using existing runtime with ID: {self.runtime_id}') + self.log('info', f'Using existing runtime with ID: {self.runtime_id}') elif self.attach_to_existing: + self.log('info', f'Failed to find existing runtime for SID: {self.sid}') raise AgentRuntimeNotFoundError( f'Could not find existing runtime for SID: {self.sid}' ) else: + self.log('info', 'No existing runtime found, starting a new one') self.send_status_message('STATUS$STARTING_CONTAINER') if self.config.sandbox.runtime_container_image is None: self.log( @@ -165,6 +167,7 @@ class RemoteRuntime(ActionExecutionClient): self.send_status_message(' ') def _check_existing_runtime(self) -> bool: + self.log('info', f'Checking for existing runtime with session ID: {self.sid}') try: response = self._send_runtime_api_request( 'GET', @@ -172,12 +175,16 @@ class RemoteRuntime(ActionExecutionClient): ) data = response.json() status = data.get('status') + self.log('info', f'Found runtime with status: {status}') if status == 'running' or status == 'paused': self._parse_runtime_response(response) except httpx.HTTPError as e: if e.response.status_code == 404: + self.log( + 'info', f'No existing runtime found for session ID: {self.sid}' + ) return False - self.log('debug', f'Error while looking for remote runtime: {e}') + self.log('error', f'Error while looking for remote runtime: {e}') raise except json.decoder.JSONDecodeError as e: self.log( @@ -187,14 +194,25 @@ class RemoteRuntime(ActionExecutionClient): raise if status == 'running': + self.log('info', 'Found existing runtime in running state') return True elif status == 'stopped': - self.log('debug', 'Found existing remote runtime, but it is stopped') + self.log('info', 'Found existing runtime, but it is stopped') return False elif status == 'paused': - self.log('debug', 'Found existing remote runtime, but it is paused') - self._resume_runtime() - return True + self.log( + 'info', 'Found existing runtime in paused state, attempting to resume' + ) + try: + self._resume_runtime() + self.log('info', 'Successfully resumed paused runtime') + return True + except Exception as e: + self.log( + 'error', f'Failed to resume paused runtime: {e}', exc_info=True + ) + # Return false to indicate we couldn't use the existing runtime + return False else: self.log('error', f'Invalid response from runtime API: {data}') return False @@ -287,16 +305,48 @@ class RemoteRuntime(ActionExecutionClient): 3. Poll for the runtime to be ready 4. Update env vars """ + self.log('info', f'Attempting to resume runtime with ID: {self.runtime_id}') self.send_status_message('STATUS$STARTING_RUNTIME') - self._send_runtime_api_request( - 'POST', - f'{self.config.sandbox.remote_runtime_api_url}/resume', - json={'runtime_id': self.runtime_id}, + try: + response = self._send_runtime_api_request( + 'POST', + f'{self.config.sandbox.remote_runtime_api_url}/resume', + json={'runtime_id': self.runtime_id}, + ) + self.log( + 'info', + f'Resume API call successful with status code: {response.status_code}', + ) + except Exception as e: + self.log('error', f'Failed to call /resume API: {e}', exc_info=True) + raise + + self.log( + 'info', 'Runtime resume API call completed, waiting for it to be alive...' ) - self.log('info', 'Runtime resumed, waiting for it to be alive...') - self._wait_until_alive() - self.setup_initial_env() - self.log('info', 'Runtime resumed and alive.') + try: + self._wait_until_alive() + self.log('info', 'Runtime is now alive after resume') + except Exception as e: + self.log( + 'error', + f'Runtime failed to become alive after resume: {e}', + exc_info=True, + ) + raise + + try: + self.setup_initial_env() + self.log('info', 'Successfully set up initial environment after resume') + except Exception as e: + self.log( + 'error', + f'Failed to set up initial environment after resume: {e}', + exc_info=True, + ) + raise + + self.log('info', 'Runtime successfully resumed and alive.') def _parse_runtime_response(self, response: httpx.Response) -> None: start_response = response.json() @@ -491,10 +541,32 @@ class RemoteRuntime(ActionExecutionClient): ) from e elif hasattr(e, 'response') and e.response.status_code == 503: if self.config.sandbox.keep_runtime_alive: - self.log('warning', 'Runtime appears to be paused. Resuming...') - self._resume_runtime() - return super()._send_action_server_request(method, url, **kwargs) + self.log( + 'info', + f'Runtime appears to be paused (503 response). Runtime ID: {self.runtime_id}, URL: {url}', + ) + try: + self._resume_runtime() + self.log( + 'info', 'Successfully resumed runtime after 503 response' + ) + return super()._send_action_server_request( + method, url, **kwargs + ) + except Exception as resume_error: + self.log( + 'error', + f'Failed to resume runtime after 503 response: {resume_error}', + exc_info=True, + ) + raise AgentRuntimeDisconnectedError( + f'Runtime is paused and could not be resumed. Original error: {e}, Resume error: {resume_error}' + ) from resume_error else: + self.log( + 'info', + 'Runtime appears to be paused (503 response) but keep_runtime_alive is False', + ) raise AgentRuntimeDisconnectedError( f'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again. Original error: {e}' ) from e