fix(runtime): fix bash interrupt on program that cannot be stopped via ctrl+c (#4161)

2025-12-26 05:48:36 +08:00 · 2024-10-03 17:48:44 -05:00 · 2024-10-03 17:48:44 -05:00 · 42649745bd
commit 42649745bd
parent 4678ae4ebd
4 changed files with 293 additions and 28 deletions
--- a/openhands/runtime/client/client.py
+++ b/openhands/runtime/client/client.py
@ -300,7 +300,7 @@ class RuntimeClient:
    def _execute_bash(
        self,
        command: str,
-        timeout: int | None,
+        timeout: int,
        keep_prompt: bool = True,
        kill_on_timeout: bool = True,
    ) -> tuple[str, int]:
@ -310,19 +310,85 @@ class RuntimeClient:
            timeout=timeout, keep_prompt=keep_prompt, kill_on_timeout=kill_on_timeout
        )

-    def _interrupt_bash(self, timeout: int | None = None) -> tuple[str, int]:
+    def _interrupt_bash(
+        self,
+        action_timeout: int | None,
+        interrupt_timeout: int | None = None,
+        max_retries: int = 2,
+    ) -> tuple[str, int]:
        self.shell.sendintr()  # send SIGINT to the shell
-        self.shell.expect(self.__bash_expect_regex, timeout=timeout)
+        logger.debug('Sent SIGINT to bash. Waiting for output...')
+
+        interrupt_timeout = interrupt_timeout or 1  # default timeout for SIGINT
+        # try to interrupt the bash shell use SIGINT
+        while max_retries > 0:
+            try:
+                self.shell.expect(self.__bash_expect_regex, timeout=interrupt_timeout)
+                output = self.shell.before
+                logger.debug(f'Received output after SIGINT: {output}')
+                exit_code = 130  # SIGINT
+
+                _additional_msg = ''
+                if action_timeout is not None:
+                    _additional_msg = (
+                        f'Command timed out after {action_timeout} seconds. '
+                    )
+                output += (
+                    '\r\n\r\n'
+                    + f'[{_additional_msg}SIGINT was sent to interrupt the command.]'
+                )
+                return output, exit_code
+            except pexpect.TIMEOUT as e:
+                logger.warning(f'Bash pexpect.TIMEOUT while waiting for SIGINT: {e}')
+                max_retries -= 1
+
+        # fall back to send control-z
+        logger.error(
+            'Failed to get output after SIGINT. Max retries reached. Sending control-z...'
+        )
+        self.shell.sendcontrol('z')
+        self.shell.expect(self.__bash_expect_regex)
        output = self.shell.before
-        exit_code = 130  # SIGINT
+        logger.debug(f'Received output after control-z: {output}')
+        # Try to kill the job
+        self.shell.sendline('kill -9 %1')
+        self.shell.expect(self.__bash_expect_regex)
+        logger.debug(f'Received output after killing job %1: {self.shell.before}')
+        output += self.shell.before
+
+        _additional_msg = ''
+        if action_timeout is not None:
+            _additional_msg = f'Command timed out after {action_timeout} seconds. '
+        output += (
+            '\r\n\r\n'
+            + f'[{_additional_msg}SIGINT was sent to interrupt the command, but failed. The command was killed.]'
+        )
+
+        # Try to get the exit code again
+        self.shell.sendline('echo $?')
+        self.shell.expect(self.__bash_expect_regex)
+        _exit_code_output = self.shell.before
+        exit_code = self._parse_exit_code(_exit_code_output)
+
        return output, exit_code

+    def _parse_exit_code(self, output: str) -> int:
+        try:
+            exit_code = int(output.strip().split()[0])
+        except Exception:
+            logger.error('Error getting exit code from bash script')
+            # If we try to run an invalid shell script the output sometimes includes error text
+            # rather than the error code - we assume this is an error
+            exit_code = 2
+        return exit_code
+
    def _continue_bash(
        self,
-        timeout: int | None,
+        timeout: int,
        keep_prompt: bool = True,
        kill_on_timeout: bool = True,
    ) -> tuple[str, int]:
+        logger.debug(f'Continuing bash with timeout={timeout}')
        try:
            self.shell.expect(self.__bash_expect_regex, timeout=timeout)

@ -333,31 +399,18 @@ class RuntimeClient:
            logger.debug('Requesting exit code...')
            self.shell.expect(self.__bash_expect_regex, timeout=timeout)
            _exit_code_output = self.shell.before
-            try:
-                exit_code = int(_exit_code_output.strip().split()[0])
-            except Exception:
-                logger.error('Error getting exit code from bash script')
-                # If we try to run an invalid shell script the output sometimes includes error text
-                # rather than the error code - we assume this is an error
-                exit_code = 2
-
+            exit_code = self._parse_exit_code(_exit_code_output)
        except pexpect.TIMEOUT as e:
+            logger.warning(f'Bash pexpect.TIMEOUT while executing bash command: {e}')
            if kill_on_timeout:
-                output, exit_code = self._interrupt_bash()
-                output += (
-                    '\r\n\r\n'
-                    + f'[Command timed out after {timeout} seconds. SIGINT was sent to interrupt it.]'
-                )
-                logger.error(f'Failed to execute command. Error: {e}')
+                output, exit_code = self._interrupt_bash(action_timeout=timeout)
            else:
                output = self.shell.before or ''
                exit_code = -1
-
        finally:
            bash_prompt = self._get_bash_prompt_and_update_pwd()
            if keep_prompt:
                output += '\r\n' + bash_prompt
-            # logger.debug(f'Command output:\n{output}')
        return output, exit_code

    async def run_action(self, action) -> Observation:
@ -383,7 +436,7 @@ class RuntimeClient:
                    )
                elif command.lower() == 'ctrl+c':
                    output, exit_code = self._interrupt_bash(
-                        timeout=SOFT_TIMEOUT_SECONDS
+                        action_timeout=None,  # intentionally None
                    )
                else:
                    output, exit_code = self._execute_bash(
--- a/openhands/runtime/client/runtime.py
+++ b/openhands/runtime/client/runtime.py
@ -35,6 +35,9 @@ from openhands.runtime.builder import DockerRuntimeBuilder
 from openhands.runtime.plugins import PluginRequirement
 from openhands.runtime.runtime import Runtime
 from openhands.runtime.utils import find_available_tcp_port
+from openhands.runtime.utils.request import (
+    send_request,
+)
 from openhands.runtime.utils.runtime_build import build_runtime_image
 from openhands.utils.tenacity_stop import stop_if_should_exit

@ -333,7 +336,12 @@ class EventStreamRuntime(Runtime):
        if not (self.log_buffer and self.log_buffer.client_ready):
            raise RuntimeError('Runtime client is not ready.')

-        response = self.session.get(f'{self.api_url}/alive')
+        response = send_request(
+            self.session,
+            'GET',
+            f'{self.api_url}/alive',
+            retry_exceptions=[ConnectionRefusedError],
+        )
        if response.status_code == 200:
            return
        else:
@ -416,7 +424,9 @@ class EventStreamRuntime(Runtime):
            assert action.timeout is not None

            try:
-                response = self.session.post(
+                response = send_request(
+                    self.session,
+                    'POST',
                    f'{self.api_url}/execute_action',
                    json={'action': event_to_dict(action)},
                    timeout=action.timeout,
@ -495,8 +505,12 @@ class EventStreamRuntime(Runtime):

            params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}

-            response = self.session.post(
-                f'{self.api_url}/upload_file', files=upload_data, params=params
+            response = send_request(
+                self.session,
+                'POST',
+                f'{self.api_url}/upload_file',
+                files=upload_data,
+                params=params,
            )
            if response.status_code == 200:
                return
@ -525,7 +539,12 @@ class EventStreamRuntime(Runtime):
            if path is not None:
                data['path'] = path

-            response = self.session.post(f'{self.api_url}/list_files', json=data)
+            response = send_request(
+                self.session,
+                'POST',
+                f'{self.api_url}/list_files',
+                json=data,
+            )
            if response.status_code == 200:
                response_json = response.json()
                assert isinstance(response_json, list)
--- a/openhands/runtime/utils/request.py
+++ b/openhands/runtime/utils/request.py
@ -49,7 +49,8 @@ def send_request(
    if retry_fns is not None:
        for fn in retry_fns:
            retry_condition |= retry_if_exception(fn)
-    kwargs['timeout'] = timeout
+    # wait a few more seconds to get the timeout error from client side
+    kwargs['timeout'] = timeout + 10

    @retry(
        stop=stop_after_delay(timeout) | stop_if_should_exit(),
--- a/tests/runtime/test_bash.py
+++ b/tests/runtime/test_bash.py
@ -57,6 +57,198 @@ def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands):
        _close_test_runtime(runtime)


+def test_bash_timeout_and_keyboard_interrupt(temp_dir, box_class, run_as_openhands):
+    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert (
+            '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
+            in obs.content
+        )
+        assert 'KeyboardInterrupt' in obs.content
+
+        # follow up command should not be affected
+        action = CmdRunAction(command='ls')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # run it again!
+        action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        assert (
+            '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
+            in obs.content
+        )
+        assert 'KeyboardInterrupt' in obs.content
+
+        # things should still work
+        action = CmdRunAction(command='ls')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert '/workspace' in obs.content
+
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_bash_pexcept_eof(temp_dir, box_class, run_as_openhands):
+    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        action = CmdRunAction(command='python3 -m http.server 8080')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 130  # script was killed by SIGINT
+        assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
+        assert 'Keyboard interrupt received, exiting.' in obs.content
+
+        action = CmdRunAction(command='ls')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert '/workspace' in obs.content
+
+        # run it again!
+        action = CmdRunAction(command='python3 -m http.server 8080')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 130  # script was killed by SIGINT
+        assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
+        assert 'Keyboard interrupt received, exiting.' in obs.content
+
+        # things should still work
+        action = CmdRunAction(command='ls')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert '/workspace' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_process_resistant_to_one_sigint(temp_dir, box_class, run_as_openhands):
+    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        # Create a bash script that ignores SIGINT up to 1 times
+        script_content = """
+#!/bin/bash
+trap_count=0
+trap 'echo "Caught SIGINT ($((++trap_count))/1), ignoring..."; [ $trap_count -ge 1 ] && trap - INT && exit' INT
+while true; do
+    echo "Still running..."
+    sleep 1
+done
+        """.strip()
+
+        with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
+            f.write(script_content)
+        os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
+
+        runtime.copy_to(
+            os.path.join(temp_dir, 'resistant_script.sh'),
+            runtime.config.workspace_mount_path_in_sandbox,
+        )
+
+        # Run the resistant script
+        action = CmdRunAction(command='sudo bash ./resistant_script.sh')
+        action.timeout = 5
+        action.blocking = True
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 130  # script was killed by SIGINT
+        assert 'Still running...' in obs.content
+        assert 'Caught SIGINT (1/1), ignoring...' in obs.content
+        assert 'Stopped' not in obs.content
+        assert (
+            '[Command timed out after 5 seconds. SIGINT was sent to interrupt the command.]'
+            in obs.content
+        )
+
+        # Normal command should still work
+        action = CmdRunAction(command='ls')
+        action.timeout = 10
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert '/workspace' in obs.content
+        assert 'resistant_script.sh' in obs.content
+
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_process_resistant_to_multiple_sigint(temp_dir, box_class, run_as_openhands):
+    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        # Create a bash script that ignores SIGINT up to 2 times
+        script_content = """
+#!/bin/bash
+trap_count=0
+trap 'echo "Caught SIGINT ($((++trap_count))/3), ignoring..."; [ $trap_count -ge 3 ] && trap - INT && exit' INT
+while true; do
+    echo "Still running..."
+    sleep 1
+done
+        """.strip()
+
+        with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
+            f.write(script_content)
+        os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
+
+        runtime.copy_to(
+            os.path.join(temp_dir, 'resistant_script.sh'),
+            runtime.config.workspace_mount_path_in_sandbox,
+        )
+
+        # Run the resistant script
+        action = CmdRunAction(command='sudo bash ./resistant_script.sh')
+        action.timeout = 2
+        action.blocking = True
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert 'Still running...' in obs.content
+        assert 'Caught SIGINT (1/3), ignoring...' in obs.content
+        assert '[1]+' and 'Stopped' in obs.content
+        assert (
+            '[Command timed out after 2 seconds. SIGINT was sent to interrupt the command, but failed. The command was killed.]'
+            in obs.content
+        )
+
+        # Normal command should still work
+        action = CmdRunAction(command='ls')
+        action.timeout = 10
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert '/workspace' in obs.content
+        assert 'resistant_script.sh' in obs.content
+
+    finally:
+        _close_test_runtime(runtime)
+
+
 def test_multiline_commands(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
    try: