mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
fix(runtime): fix bash interrupt on program that cannot be stopped via ctrl+c (#4161)
This commit is contained in:
parent
4678ae4ebd
commit
42649745bd
@ -300,7 +300,7 @@ class RuntimeClient:
|
||||
def _execute_bash(
|
||||
self,
|
||||
command: str,
|
||||
timeout: int | None,
|
||||
timeout: int,
|
||||
keep_prompt: bool = True,
|
||||
kill_on_timeout: bool = True,
|
||||
) -> tuple[str, int]:
|
||||
@ -310,19 +310,85 @@ class RuntimeClient:
|
||||
timeout=timeout, keep_prompt=keep_prompt, kill_on_timeout=kill_on_timeout
|
||||
)
|
||||
|
||||
def _interrupt_bash(self, timeout: int | None = None) -> tuple[str, int]:
|
||||
def _interrupt_bash(
|
||||
self,
|
||||
action_timeout: int | None,
|
||||
interrupt_timeout: int | None = None,
|
||||
max_retries: int = 2,
|
||||
) -> tuple[str, int]:
|
||||
self.shell.sendintr() # send SIGINT to the shell
|
||||
self.shell.expect(self.__bash_expect_regex, timeout=timeout)
|
||||
logger.debug('Sent SIGINT to bash. Waiting for output...')
|
||||
|
||||
interrupt_timeout = interrupt_timeout or 1 # default timeout for SIGINT
|
||||
# try to interrupt the bash shell use SIGINT
|
||||
while max_retries > 0:
|
||||
try:
|
||||
self.shell.expect(self.__bash_expect_regex, timeout=interrupt_timeout)
|
||||
output = self.shell.before
|
||||
logger.debug(f'Received output after SIGINT: {output}')
|
||||
exit_code = 130 # SIGINT
|
||||
|
||||
_additional_msg = ''
|
||||
if action_timeout is not None:
|
||||
_additional_msg = (
|
||||
f'Command timed out after {action_timeout} seconds. '
|
||||
)
|
||||
output += (
|
||||
'\r\n\r\n'
|
||||
+ f'[{_additional_msg}SIGINT was sent to interrupt the command.]'
|
||||
)
|
||||
return output, exit_code
|
||||
except pexpect.TIMEOUT as e:
|
||||
logger.warning(f'Bash pexpect.TIMEOUT while waiting for SIGINT: {e}')
|
||||
max_retries -= 1
|
||||
|
||||
# fall back to send control-z
|
||||
logger.error(
|
||||
'Failed to get output after SIGINT. Max retries reached. Sending control-z...'
|
||||
)
|
||||
self.shell.sendcontrol('z')
|
||||
self.shell.expect(self.__bash_expect_regex)
|
||||
output = self.shell.before
|
||||
exit_code = 130 # SIGINT
|
||||
logger.debug(f'Received output after control-z: {output}')
|
||||
# Try to kill the job
|
||||
self.shell.sendline('kill -9 %1')
|
||||
self.shell.expect(self.__bash_expect_regex)
|
||||
logger.debug(f'Received output after killing job %1: {self.shell.before}')
|
||||
output += self.shell.before
|
||||
|
||||
_additional_msg = ''
|
||||
if action_timeout is not None:
|
||||
_additional_msg = f'Command timed out after {action_timeout} seconds. '
|
||||
output += (
|
||||
'\r\n\r\n'
|
||||
+ f'[{_additional_msg}SIGINT was sent to interrupt the command, but failed. The command was killed.]'
|
||||
)
|
||||
|
||||
# Try to get the exit code again
|
||||
self.shell.sendline('echo $?')
|
||||
self.shell.expect(self.__bash_expect_regex)
|
||||
_exit_code_output = self.shell.before
|
||||
exit_code = self._parse_exit_code(_exit_code_output)
|
||||
|
||||
return output, exit_code
|
||||
|
||||
def _parse_exit_code(self, output: str) -> int:
|
||||
try:
|
||||
exit_code = int(output.strip().split()[0])
|
||||
except Exception:
|
||||
logger.error('Error getting exit code from bash script')
|
||||
# If we try to run an invalid shell script the output sometimes includes error text
|
||||
# rather than the error code - we assume this is an error
|
||||
exit_code = 2
|
||||
return exit_code
|
||||
|
||||
def _continue_bash(
|
||||
self,
|
||||
timeout: int | None,
|
||||
timeout: int,
|
||||
keep_prompt: bool = True,
|
||||
kill_on_timeout: bool = True,
|
||||
) -> tuple[str, int]:
|
||||
logger.debug(f'Continuing bash with timeout={timeout}')
|
||||
try:
|
||||
self.shell.expect(self.__bash_expect_regex, timeout=timeout)
|
||||
|
||||
@ -333,31 +399,18 @@ class RuntimeClient:
|
||||
logger.debug('Requesting exit code...')
|
||||
self.shell.expect(self.__bash_expect_regex, timeout=timeout)
|
||||
_exit_code_output = self.shell.before
|
||||
try:
|
||||
exit_code = int(_exit_code_output.strip().split()[0])
|
||||
except Exception:
|
||||
logger.error('Error getting exit code from bash script')
|
||||
# If we try to run an invalid shell script the output sometimes includes error text
|
||||
# rather than the error code - we assume this is an error
|
||||
exit_code = 2
|
||||
|
||||
exit_code = self._parse_exit_code(_exit_code_output)
|
||||
except pexpect.TIMEOUT as e:
|
||||
logger.warning(f'Bash pexpect.TIMEOUT while executing bash command: {e}')
|
||||
if kill_on_timeout:
|
||||
output, exit_code = self._interrupt_bash()
|
||||
output += (
|
||||
'\r\n\r\n'
|
||||
+ f'[Command timed out after {timeout} seconds. SIGINT was sent to interrupt it.]'
|
||||
)
|
||||
logger.error(f'Failed to execute command. Error: {e}')
|
||||
output, exit_code = self._interrupt_bash(action_timeout=timeout)
|
||||
else:
|
||||
output = self.shell.before or ''
|
||||
exit_code = -1
|
||||
|
||||
finally:
|
||||
bash_prompt = self._get_bash_prompt_and_update_pwd()
|
||||
if keep_prompt:
|
||||
output += '\r\n' + bash_prompt
|
||||
# logger.debug(f'Command output:\n{output}')
|
||||
return output, exit_code
|
||||
|
||||
async def run_action(self, action) -> Observation:
|
||||
@ -383,7 +436,7 @@ class RuntimeClient:
|
||||
)
|
||||
elif command.lower() == 'ctrl+c':
|
||||
output, exit_code = self._interrupt_bash(
|
||||
timeout=SOFT_TIMEOUT_SECONDS
|
||||
action_timeout=None, # intentionally None
|
||||
)
|
||||
else:
|
||||
output, exit_code = self._execute_bash(
|
||||
|
||||
@ -35,6 +35,9 @@ from openhands.runtime.builder import DockerRuntimeBuilder
|
||||
from openhands.runtime.plugins import PluginRequirement
|
||||
from openhands.runtime.runtime import Runtime
|
||||
from openhands.runtime.utils import find_available_tcp_port
|
||||
from openhands.runtime.utils.request import (
|
||||
send_request,
|
||||
)
|
||||
from openhands.runtime.utils.runtime_build import build_runtime_image
|
||||
from openhands.utils.tenacity_stop import stop_if_should_exit
|
||||
|
||||
@ -333,7 +336,12 @@ class EventStreamRuntime(Runtime):
|
||||
if not (self.log_buffer and self.log_buffer.client_ready):
|
||||
raise RuntimeError('Runtime client is not ready.')
|
||||
|
||||
response = self.session.get(f'{self.api_url}/alive')
|
||||
response = send_request(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.api_url}/alive',
|
||||
retry_exceptions=[ConnectionRefusedError],
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return
|
||||
else:
|
||||
@ -416,7 +424,9 @@ class EventStreamRuntime(Runtime):
|
||||
assert action.timeout is not None
|
||||
|
||||
try:
|
||||
response = self.session.post(
|
||||
response = send_request(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.api_url}/execute_action',
|
||||
json={'action': event_to_dict(action)},
|
||||
timeout=action.timeout,
|
||||
@ -495,8 +505,12 @@ class EventStreamRuntime(Runtime):
|
||||
|
||||
params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
|
||||
|
||||
response = self.session.post(
|
||||
f'{self.api_url}/upload_file', files=upload_data, params=params
|
||||
response = send_request(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.api_url}/upload_file',
|
||||
files=upload_data,
|
||||
params=params,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return
|
||||
@ -525,7 +539,12 @@ class EventStreamRuntime(Runtime):
|
||||
if path is not None:
|
||||
data['path'] = path
|
||||
|
||||
response = self.session.post(f'{self.api_url}/list_files', json=data)
|
||||
response = send_request(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.api_url}/list_files',
|
||||
json=data,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response_json = response.json()
|
||||
assert isinstance(response_json, list)
|
||||
|
||||
@ -49,7 +49,8 @@ def send_request(
|
||||
if retry_fns is not None:
|
||||
for fn in retry_fns:
|
||||
retry_condition |= retry_if_exception(fn)
|
||||
kwargs['timeout'] = timeout
|
||||
# wait a few more seconds to get the timeout error from client side
|
||||
kwargs['timeout'] = timeout + 10
|
||||
|
||||
@retry(
|
||||
stop=stop_after_delay(timeout) | stop_if_should_exit(),
|
||||
|
||||
@ -57,6 +57,198 @@ def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands):
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_bash_timeout_and_keyboard_interrupt(temp_dir, box_class, run_as_openhands):
|
||||
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
|
||||
try:
|
||||
action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert (
|
||||
'[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
|
||||
in obs.content
|
||||
)
|
||||
assert 'KeyboardInterrupt' in obs.content
|
||||
|
||||
# follow up command should not be affected
|
||||
action = CmdRunAction(command='ls')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
# run it again!
|
||||
action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert (
|
||||
'[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
|
||||
in obs.content
|
||||
)
|
||||
assert 'KeyboardInterrupt' in obs.content
|
||||
|
||||
# things should still work
|
||||
action = CmdRunAction(command='ls')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
assert '/workspace' in obs.content
|
||||
|
||||
finally:
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_bash_pexcept_eof(temp_dir, box_class, run_as_openhands):
|
||||
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
|
||||
try:
|
||||
action = CmdRunAction(command='python3 -m http.server 8080')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 130 # script was killed by SIGINT
|
||||
assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
|
||||
assert 'Keyboard interrupt received, exiting.' in obs.content
|
||||
|
||||
action = CmdRunAction(command='ls')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
assert '/workspace' in obs.content
|
||||
|
||||
# run it again!
|
||||
action = CmdRunAction(command='python3 -m http.server 8080')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 130 # script was killed by SIGINT
|
||||
assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
|
||||
assert 'Keyboard interrupt received, exiting.' in obs.content
|
||||
|
||||
# things should still work
|
||||
action = CmdRunAction(command='ls')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
assert '/workspace' in obs.content
|
||||
finally:
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_process_resistant_to_one_sigint(temp_dir, box_class, run_as_openhands):
|
||||
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
|
||||
try:
|
||||
# Create a bash script that ignores SIGINT up to 1 times
|
||||
script_content = """
|
||||
#!/bin/bash
|
||||
trap_count=0
|
||||
trap 'echo "Caught SIGINT ($((++trap_count))/1), ignoring..."; [ $trap_count -ge 1 ] && trap - INT && exit' INT
|
||||
while true; do
|
||||
echo "Still running..."
|
||||
sleep 1
|
||||
done
|
||||
""".strip()
|
||||
|
||||
with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
|
||||
f.write(script_content)
|
||||
os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
|
||||
|
||||
runtime.copy_to(
|
||||
os.path.join(temp_dir, 'resistant_script.sh'),
|
||||
runtime.config.workspace_mount_path_in_sandbox,
|
||||
)
|
||||
|
||||
# Run the resistant script
|
||||
action = CmdRunAction(command='sudo bash ./resistant_script.sh')
|
||||
action.timeout = 5
|
||||
action.blocking = True
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 130 # script was killed by SIGINT
|
||||
assert 'Still running...' in obs.content
|
||||
assert 'Caught SIGINT (1/1), ignoring...' in obs.content
|
||||
assert 'Stopped' not in obs.content
|
||||
assert (
|
||||
'[Command timed out after 5 seconds. SIGINT was sent to interrupt the command.]'
|
||||
in obs.content
|
||||
)
|
||||
|
||||
# Normal command should still work
|
||||
action = CmdRunAction(command='ls')
|
||||
action.timeout = 10
|
||||
obs = runtime.run_action(action)
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
assert '/workspace' in obs.content
|
||||
assert 'resistant_script.sh' in obs.content
|
||||
|
||||
finally:
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_process_resistant_to_multiple_sigint(temp_dir, box_class, run_as_openhands):
|
||||
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
|
||||
try:
|
||||
# Create a bash script that ignores SIGINT up to 2 times
|
||||
script_content = """
|
||||
#!/bin/bash
|
||||
trap_count=0
|
||||
trap 'echo "Caught SIGINT ($((++trap_count))/3), ignoring..."; [ $trap_count -ge 3 ] && trap - INT && exit' INT
|
||||
while true; do
|
||||
echo "Still running..."
|
||||
sleep 1
|
||||
done
|
||||
""".strip()
|
||||
|
||||
with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
|
||||
f.write(script_content)
|
||||
os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
|
||||
|
||||
runtime.copy_to(
|
||||
os.path.join(temp_dir, 'resistant_script.sh'),
|
||||
runtime.config.workspace_mount_path_in_sandbox,
|
||||
)
|
||||
|
||||
# Run the resistant script
|
||||
action = CmdRunAction(command='sudo bash ./resistant_script.sh')
|
||||
action.timeout = 2
|
||||
action.blocking = True
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
assert 'Still running...' in obs.content
|
||||
assert 'Caught SIGINT (1/3), ignoring...' in obs.content
|
||||
assert '[1]+' and 'Stopped' in obs.content
|
||||
assert (
|
||||
'[Command timed out after 2 seconds. SIGINT was sent to interrupt the command, but failed. The command was killed.]'
|
||||
in obs.content
|
||||
)
|
||||
|
||||
# Normal command should still work
|
||||
action = CmdRunAction(command='ls')
|
||||
action.timeout = 10
|
||||
obs = runtime.run_action(action)
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
assert '/workspace' in obs.content
|
||||
assert 'resistant_script.sh' in obs.content
|
||||
|
||||
finally:
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_multiline_commands(temp_dir, box_class):
|
||||
runtime = _load_runtime(temp_dir, box_class)
|
||||
try:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user