diff --git a/openhands/runtime/client/client.py b/openhands/runtime/client/client.py index 8fe51ecffc..c480c961c9 100644 --- a/openhands/runtime/client/client.py +++ b/openhands/runtime/client/client.py @@ -300,7 +300,7 @@ class RuntimeClient: def _execute_bash( self, command: str, - timeout: int | None, + timeout: int, keep_prompt: bool = True, kill_on_timeout: bool = True, ) -> tuple[str, int]: @@ -310,19 +310,85 @@ class RuntimeClient: timeout=timeout, keep_prompt=keep_prompt, kill_on_timeout=kill_on_timeout ) - def _interrupt_bash(self, timeout: int | None = None) -> tuple[str, int]: + def _interrupt_bash( + self, + action_timeout: int | None, + interrupt_timeout: int | None = None, + max_retries: int = 2, + ) -> tuple[str, int]: self.shell.sendintr() # send SIGINT to the shell - self.shell.expect(self.__bash_expect_regex, timeout=timeout) + logger.debug('Sent SIGINT to bash. Waiting for output...') + + interrupt_timeout = interrupt_timeout or 1 # default timeout for SIGINT + # try to interrupt the bash shell use SIGINT + while max_retries > 0: + try: + self.shell.expect(self.__bash_expect_regex, timeout=interrupt_timeout) + output = self.shell.before + logger.debug(f'Received output after SIGINT: {output}') + exit_code = 130 # SIGINT + + _additional_msg = '' + if action_timeout is not None: + _additional_msg = ( + f'Command timed out after {action_timeout} seconds. ' + ) + output += ( + '\r\n\r\n' + + f'[{_additional_msg}SIGINT was sent to interrupt the command.]' + ) + return output, exit_code + except pexpect.TIMEOUT as e: + logger.warning(f'Bash pexpect.TIMEOUT while waiting for SIGINT: {e}') + max_retries -= 1 + + # fall back to send control-z + logger.error( + 'Failed to get output after SIGINT. Max retries reached. Sending control-z...' + ) + self.shell.sendcontrol('z') + self.shell.expect(self.__bash_expect_regex) output = self.shell.before - exit_code = 130 # SIGINT + logger.debug(f'Received output after control-z: {output}') + # Try to kill the job + self.shell.sendline('kill -9 %1') + self.shell.expect(self.__bash_expect_regex) + logger.debug(f'Received output after killing job %1: {self.shell.before}') + output += self.shell.before + + _additional_msg = '' + if action_timeout is not None: + _additional_msg = f'Command timed out after {action_timeout} seconds. ' + output += ( + '\r\n\r\n' + + f'[{_additional_msg}SIGINT was sent to interrupt the command, but failed. The command was killed.]' + ) + + # Try to get the exit code again + self.shell.sendline('echo $?') + self.shell.expect(self.__bash_expect_regex) + _exit_code_output = self.shell.before + exit_code = self._parse_exit_code(_exit_code_output) + return output, exit_code + def _parse_exit_code(self, output: str) -> int: + try: + exit_code = int(output.strip().split()[0]) + except Exception: + logger.error('Error getting exit code from bash script') + # If we try to run an invalid shell script the output sometimes includes error text + # rather than the error code - we assume this is an error + exit_code = 2 + return exit_code + def _continue_bash( self, - timeout: int | None, + timeout: int, keep_prompt: bool = True, kill_on_timeout: bool = True, ) -> tuple[str, int]: + logger.debug(f'Continuing bash with timeout={timeout}') try: self.shell.expect(self.__bash_expect_regex, timeout=timeout) @@ -333,31 +399,18 @@ class RuntimeClient: logger.debug('Requesting exit code...') self.shell.expect(self.__bash_expect_regex, timeout=timeout) _exit_code_output = self.shell.before - try: - exit_code = int(_exit_code_output.strip().split()[0]) - except Exception: - logger.error('Error getting exit code from bash script') - # If we try to run an invalid shell script the output sometimes includes error text - # rather than the error code - we assume this is an error - exit_code = 2 - + exit_code = self._parse_exit_code(_exit_code_output) except pexpect.TIMEOUT as e: + logger.warning(f'Bash pexpect.TIMEOUT while executing bash command: {e}') if kill_on_timeout: - output, exit_code = self._interrupt_bash() - output += ( - '\r\n\r\n' - + f'[Command timed out after {timeout} seconds. SIGINT was sent to interrupt it.]' - ) - logger.error(f'Failed to execute command. Error: {e}') + output, exit_code = self._interrupt_bash(action_timeout=timeout) else: output = self.shell.before or '' exit_code = -1 - finally: bash_prompt = self._get_bash_prompt_and_update_pwd() if keep_prompt: output += '\r\n' + bash_prompt - # logger.debug(f'Command output:\n{output}') return output, exit_code async def run_action(self, action) -> Observation: @@ -383,7 +436,7 @@ class RuntimeClient: ) elif command.lower() == 'ctrl+c': output, exit_code = self._interrupt_bash( - timeout=SOFT_TIMEOUT_SECONDS + action_timeout=None, # intentionally None ) else: output, exit_code = self._execute_bash( diff --git a/openhands/runtime/client/runtime.py b/openhands/runtime/client/runtime.py index c6bb301904..1f3ab8cc7b 100644 --- a/openhands/runtime/client/runtime.py +++ b/openhands/runtime/client/runtime.py @@ -35,6 +35,9 @@ from openhands.runtime.builder import DockerRuntimeBuilder from openhands.runtime.plugins import PluginRequirement from openhands.runtime.runtime import Runtime from openhands.runtime.utils import find_available_tcp_port +from openhands.runtime.utils.request import ( + send_request, +) from openhands.runtime.utils.runtime_build import build_runtime_image from openhands.utils.tenacity_stop import stop_if_should_exit @@ -333,7 +336,12 @@ class EventStreamRuntime(Runtime): if not (self.log_buffer and self.log_buffer.client_ready): raise RuntimeError('Runtime client is not ready.') - response = self.session.get(f'{self.api_url}/alive') + response = send_request( + self.session, + 'GET', + f'{self.api_url}/alive', + retry_exceptions=[ConnectionRefusedError], + ) if response.status_code == 200: return else: @@ -416,7 +424,9 @@ class EventStreamRuntime(Runtime): assert action.timeout is not None try: - response = self.session.post( + response = send_request( + self.session, + 'POST', f'{self.api_url}/execute_action', json={'action': event_to_dict(action)}, timeout=action.timeout, @@ -495,8 +505,12 @@ class EventStreamRuntime(Runtime): params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()} - response = self.session.post( - f'{self.api_url}/upload_file', files=upload_data, params=params + response = send_request( + self.session, + 'POST', + f'{self.api_url}/upload_file', + files=upload_data, + params=params, ) if response.status_code == 200: return @@ -525,7 +539,12 @@ class EventStreamRuntime(Runtime): if path is not None: data['path'] = path - response = self.session.post(f'{self.api_url}/list_files', json=data) + response = send_request( + self.session, + 'POST', + f'{self.api_url}/list_files', + json=data, + ) if response.status_code == 200: response_json = response.json() assert isinstance(response_json, list) diff --git a/openhands/runtime/utils/request.py b/openhands/runtime/utils/request.py index f67faf28f4..7d68df8d8e 100644 --- a/openhands/runtime/utils/request.py +++ b/openhands/runtime/utils/request.py @@ -49,7 +49,8 @@ def send_request( if retry_fns is not None: for fn in retry_fns: retry_condition |= retry_if_exception(fn) - kwargs['timeout'] = timeout + # wait a few more seconds to get the timeout error from client side + kwargs['timeout'] = timeout + 10 @retry( stop=stop_after_delay(timeout) | stop_if_should_exit(), diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py index c28957888d..db8919a924 100644 --- a/tests/runtime/test_bash.py +++ b/tests/runtime/test_bash.py @@ -57,6 +57,198 @@ def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands): _close_test_runtime(runtime) +def test_bash_timeout_and_keyboard_interrupt(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) + try: + action = CmdRunAction(command='python -c "import time; time.sleep(10)"') + action.timeout = 1 + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert isinstance(obs, CmdOutputObservation) + assert ( + '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]' + in obs.content + ) + assert 'KeyboardInterrupt' in obs.content + + # follow up command should not be affected + action = CmdRunAction(command='ls') + action.timeout = 1 + obs = runtime.run_action(action) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + # run it again! + action = CmdRunAction(command='python -c "import time; time.sleep(10)"') + action.timeout = 1 + obs = runtime.run_action(action) + assert isinstance(obs, CmdOutputObservation) + assert ( + '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]' + in obs.content + ) + assert 'KeyboardInterrupt' in obs.content + + # things should still work + action = CmdRunAction(command='ls') + action.timeout = 1 + obs = runtime.run_action(action) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + assert '/workspace' in obs.content + + finally: + _close_test_runtime(runtime) + + +def test_bash_pexcept_eof(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) + try: + action = CmdRunAction(command='python3 -m http.server 8080') + action.timeout = 1 + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 130 # script was killed by SIGINT + assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content + assert 'Keyboard interrupt received, exiting.' in obs.content + + action = CmdRunAction(command='ls') + action.timeout = 1 + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + assert '/workspace' in obs.content + + # run it again! + action = CmdRunAction(command='python3 -m http.server 8080') + action.timeout = 1 + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 130 # script was killed by SIGINT + assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content + assert 'Keyboard interrupt received, exiting.' in obs.content + + # things should still work + action = CmdRunAction(command='ls') + action.timeout = 1 + obs = runtime.run_action(action) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + assert '/workspace' in obs.content + finally: + _close_test_runtime(runtime) + + +def test_process_resistant_to_one_sigint(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) + try: + # Create a bash script that ignores SIGINT up to 1 times + script_content = """ +#!/bin/bash +trap_count=0 +trap 'echo "Caught SIGINT ($((++trap_count))/1), ignoring..."; [ $trap_count -ge 1 ] && trap - INT && exit' INT +while true; do + echo "Still running..." + sleep 1 +done + """.strip() + + with open(f'{temp_dir}/resistant_script.sh', 'w') as f: + f.write(script_content) + os.chmod(f'{temp_dir}/resistant_script.sh', 0o777) + + runtime.copy_to( + os.path.join(temp_dir, 'resistant_script.sh'), + runtime.config.workspace_mount_path_in_sandbox, + ) + + # Run the resistant script + action = CmdRunAction(command='sudo bash ./resistant_script.sh') + action.timeout = 5 + action.blocking = True + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 130 # script was killed by SIGINT + assert 'Still running...' in obs.content + assert 'Caught SIGINT (1/1), ignoring...' in obs.content + assert 'Stopped' not in obs.content + assert ( + '[Command timed out after 5 seconds. SIGINT was sent to interrupt the command.]' + in obs.content + ) + + # Normal command should still work + action = CmdRunAction(command='ls') + action.timeout = 10 + obs = runtime.run_action(action) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + assert '/workspace' in obs.content + assert 'resistant_script.sh' in obs.content + + finally: + _close_test_runtime(runtime) + + +def test_process_resistant_to_multiple_sigint(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) + try: + # Create a bash script that ignores SIGINT up to 2 times + script_content = """ +#!/bin/bash +trap_count=0 +trap 'echo "Caught SIGINT ($((++trap_count))/3), ignoring..."; [ $trap_count -ge 3 ] && trap - INT && exit' INT +while true; do + echo "Still running..." + sleep 1 +done + """.strip() + + with open(f'{temp_dir}/resistant_script.sh', 'w') as f: + f.write(script_content) + os.chmod(f'{temp_dir}/resistant_script.sh', 0o777) + + runtime.copy_to( + os.path.join(temp_dir, 'resistant_script.sh'), + runtime.config.workspace_mount_path_in_sandbox, + ) + + # Run the resistant script + action = CmdRunAction(command='sudo bash ./resistant_script.sh') + action.timeout = 2 + action.blocking = True + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + assert 'Still running...' in obs.content + assert 'Caught SIGINT (1/3), ignoring...' in obs.content + assert '[1]+' and 'Stopped' in obs.content + assert ( + '[Command timed out after 2 seconds. SIGINT was sent to interrupt the command, but failed. The command was killed.]' + in obs.content + ) + + # Normal command should still work + action = CmdRunAction(command='ls') + action.timeout = 10 + obs = runtime.run_action(action) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + assert '/workspace' in obs.content + assert 'resistant_script.sh' in obs.content + + finally: + _close_test_runtime(runtime) + + def test_multiline_commands(temp_dir, box_class): runtime = _load_runtime(temp_dir, box_class) try: