fix(runtime): fix bash interrupt on program that cannot be stopped via ctrl+c (#4161)

This commit is contained in:
Xingyao Wang 2024-10-03 17:48:44 -05:00 committed by GitHub
parent 4678ae4ebd
commit 42649745bd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 293 additions and 28 deletions

View File

@ -300,7 +300,7 @@ class RuntimeClient:
def _execute_bash(
self,
command: str,
timeout: int | None,
timeout: int,
keep_prompt: bool = True,
kill_on_timeout: bool = True,
) -> tuple[str, int]:
@ -310,19 +310,85 @@ class RuntimeClient:
timeout=timeout, keep_prompt=keep_prompt, kill_on_timeout=kill_on_timeout
)
def _interrupt_bash(self, timeout: int | None = None) -> tuple[str, int]:
def _interrupt_bash(
self,
action_timeout: int | None,
interrupt_timeout: int | None = None,
max_retries: int = 2,
) -> tuple[str, int]:
self.shell.sendintr() # send SIGINT to the shell
self.shell.expect(self.__bash_expect_regex, timeout=timeout)
logger.debug('Sent SIGINT to bash. Waiting for output...')
interrupt_timeout = interrupt_timeout or 1 # default timeout for SIGINT
# try to interrupt the bash shell use SIGINT
while max_retries > 0:
try:
self.shell.expect(self.__bash_expect_regex, timeout=interrupt_timeout)
output = self.shell.before
logger.debug(f'Received output after SIGINT: {output}')
exit_code = 130 # SIGINT
_additional_msg = ''
if action_timeout is not None:
_additional_msg = (
f'Command timed out after {action_timeout} seconds. '
)
output += (
'\r\n\r\n'
+ f'[{_additional_msg}SIGINT was sent to interrupt the command.]'
)
return output, exit_code
except pexpect.TIMEOUT as e:
logger.warning(f'Bash pexpect.TIMEOUT while waiting for SIGINT: {e}')
max_retries -= 1
# fall back to send control-z
logger.error(
'Failed to get output after SIGINT. Max retries reached. Sending control-z...'
)
self.shell.sendcontrol('z')
self.shell.expect(self.__bash_expect_regex)
output = self.shell.before
exit_code = 130 # SIGINT
logger.debug(f'Received output after control-z: {output}')
# Try to kill the job
self.shell.sendline('kill -9 %1')
self.shell.expect(self.__bash_expect_regex)
logger.debug(f'Received output after killing job %1: {self.shell.before}')
output += self.shell.before
_additional_msg = ''
if action_timeout is not None:
_additional_msg = f'Command timed out after {action_timeout} seconds. '
output += (
'\r\n\r\n'
+ f'[{_additional_msg}SIGINT was sent to interrupt the command, but failed. The command was killed.]'
)
# Try to get the exit code again
self.shell.sendline('echo $?')
self.shell.expect(self.__bash_expect_regex)
_exit_code_output = self.shell.before
exit_code = self._parse_exit_code(_exit_code_output)
return output, exit_code
def _parse_exit_code(self, output: str) -> int:
try:
exit_code = int(output.strip().split()[0])
except Exception:
logger.error('Error getting exit code from bash script')
# If we try to run an invalid shell script the output sometimes includes error text
# rather than the error code - we assume this is an error
exit_code = 2
return exit_code
def _continue_bash(
self,
timeout: int | None,
timeout: int,
keep_prompt: bool = True,
kill_on_timeout: bool = True,
) -> tuple[str, int]:
logger.debug(f'Continuing bash with timeout={timeout}')
try:
self.shell.expect(self.__bash_expect_regex, timeout=timeout)
@ -333,31 +399,18 @@ class RuntimeClient:
logger.debug('Requesting exit code...')
self.shell.expect(self.__bash_expect_regex, timeout=timeout)
_exit_code_output = self.shell.before
try:
exit_code = int(_exit_code_output.strip().split()[0])
except Exception:
logger.error('Error getting exit code from bash script')
# If we try to run an invalid shell script the output sometimes includes error text
# rather than the error code - we assume this is an error
exit_code = 2
exit_code = self._parse_exit_code(_exit_code_output)
except pexpect.TIMEOUT as e:
logger.warning(f'Bash pexpect.TIMEOUT while executing bash command: {e}')
if kill_on_timeout:
output, exit_code = self._interrupt_bash()
output += (
'\r\n\r\n'
+ f'[Command timed out after {timeout} seconds. SIGINT was sent to interrupt it.]'
)
logger.error(f'Failed to execute command. Error: {e}')
output, exit_code = self._interrupt_bash(action_timeout=timeout)
else:
output = self.shell.before or ''
exit_code = -1
finally:
bash_prompt = self._get_bash_prompt_and_update_pwd()
if keep_prompt:
output += '\r\n' + bash_prompt
# logger.debug(f'Command output:\n{output}')
return output, exit_code
async def run_action(self, action) -> Observation:
@ -383,7 +436,7 @@ class RuntimeClient:
)
elif command.lower() == 'ctrl+c':
output, exit_code = self._interrupt_bash(
timeout=SOFT_TIMEOUT_SECONDS
action_timeout=None, # intentionally None
)
else:
output, exit_code = self._execute_bash(

View File

@ -35,6 +35,9 @@ from openhands.runtime.builder import DockerRuntimeBuilder
from openhands.runtime.plugins import PluginRequirement
from openhands.runtime.runtime import Runtime
from openhands.runtime.utils import find_available_tcp_port
from openhands.runtime.utils.request import (
send_request,
)
from openhands.runtime.utils.runtime_build import build_runtime_image
from openhands.utils.tenacity_stop import stop_if_should_exit
@ -333,7 +336,12 @@ class EventStreamRuntime(Runtime):
if not (self.log_buffer and self.log_buffer.client_ready):
raise RuntimeError('Runtime client is not ready.')
response = self.session.get(f'{self.api_url}/alive')
response = send_request(
self.session,
'GET',
f'{self.api_url}/alive',
retry_exceptions=[ConnectionRefusedError],
)
if response.status_code == 200:
return
else:
@ -416,7 +424,9 @@ class EventStreamRuntime(Runtime):
assert action.timeout is not None
try:
response = self.session.post(
response = send_request(
self.session,
'POST',
f'{self.api_url}/execute_action',
json={'action': event_to_dict(action)},
timeout=action.timeout,
@ -495,8 +505,12 @@ class EventStreamRuntime(Runtime):
params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
response = self.session.post(
f'{self.api_url}/upload_file', files=upload_data, params=params
response = send_request(
self.session,
'POST',
f'{self.api_url}/upload_file',
files=upload_data,
params=params,
)
if response.status_code == 200:
return
@ -525,7 +539,12 @@ class EventStreamRuntime(Runtime):
if path is not None:
data['path'] = path
response = self.session.post(f'{self.api_url}/list_files', json=data)
response = send_request(
self.session,
'POST',
f'{self.api_url}/list_files',
json=data,
)
if response.status_code == 200:
response_json = response.json()
assert isinstance(response_json, list)

View File

@ -49,7 +49,8 @@ def send_request(
if retry_fns is not None:
for fn in retry_fns:
retry_condition |= retry_if_exception(fn)
kwargs['timeout'] = timeout
# wait a few more seconds to get the timeout error from client side
kwargs['timeout'] = timeout + 10
@retry(
stop=stop_after_delay(timeout) | stop_if_should_exit(),

View File

@ -57,6 +57,198 @@ def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands):
_close_test_runtime(runtime)
def test_bash_timeout_and_keyboard_interrupt(temp_dir, box_class, run_as_openhands):
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
try:
action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
action.timeout = 1
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert (
'[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
in obs.content
)
assert 'KeyboardInterrupt' in obs.content
# follow up command should not be affected
action = CmdRunAction(command='ls')
action.timeout = 1
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# run it again!
action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
action.timeout = 1
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation)
assert (
'[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
in obs.content
)
assert 'KeyboardInterrupt' in obs.content
# things should still work
action = CmdRunAction(command='ls')
action.timeout = 1
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert '/workspace' in obs.content
finally:
_close_test_runtime(runtime)
def test_bash_pexcept_eof(temp_dir, box_class, run_as_openhands):
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
try:
action = CmdRunAction(command='python3 -m http.server 8080')
action.timeout = 1
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 130 # script was killed by SIGINT
assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
assert 'Keyboard interrupt received, exiting.' in obs.content
action = CmdRunAction(command='ls')
action.timeout = 1
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert '/workspace' in obs.content
# run it again!
action = CmdRunAction(command='python3 -m http.server 8080')
action.timeout = 1
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 130 # script was killed by SIGINT
assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
assert 'Keyboard interrupt received, exiting.' in obs.content
# things should still work
action = CmdRunAction(command='ls')
action.timeout = 1
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert '/workspace' in obs.content
finally:
_close_test_runtime(runtime)
def test_process_resistant_to_one_sigint(temp_dir, box_class, run_as_openhands):
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
try:
# Create a bash script that ignores SIGINT up to 1 times
script_content = """
#!/bin/bash
trap_count=0
trap 'echo "Caught SIGINT ($((++trap_count))/1), ignoring..."; [ $trap_count -ge 1 ] && trap - INT && exit' INT
while true; do
echo "Still running..."
sleep 1
done
""".strip()
with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
f.write(script_content)
os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
runtime.copy_to(
os.path.join(temp_dir, 'resistant_script.sh'),
runtime.config.workspace_mount_path_in_sandbox,
)
# Run the resistant script
action = CmdRunAction(command='sudo bash ./resistant_script.sh')
action.timeout = 5
action.blocking = True
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 130 # script was killed by SIGINT
assert 'Still running...' in obs.content
assert 'Caught SIGINT (1/1), ignoring...' in obs.content
assert 'Stopped' not in obs.content
assert (
'[Command timed out after 5 seconds. SIGINT was sent to interrupt the command.]'
in obs.content
)
# Normal command should still work
action = CmdRunAction(command='ls')
action.timeout = 10
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert '/workspace' in obs.content
assert 'resistant_script.sh' in obs.content
finally:
_close_test_runtime(runtime)
def test_process_resistant_to_multiple_sigint(temp_dir, box_class, run_as_openhands):
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
try:
# Create a bash script that ignores SIGINT up to 2 times
script_content = """
#!/bin/bash
trap_count=0
trap 'echo "Caught SIGINT ($((++trap_count))/3), ignoring..."; [ $trap_count -ge 3 ] && trap - INT && exit' INT
while true; do
echo "Still running..."
sleep 1
done
""".strip()
with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
f.write(script_content)
os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
runtime.copy_to(
os.path.join(temp_dir, 'resistant_script.sh'),
runtime.config.workspace_mount_path_in_sandbox,
)
# Run the resistant script
action = CmdRunAction(command='sudo bash ./resistant_script.sh')
action.timeout = 2
action.blocking = True
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert 'Still running...' in obs.content
assert 'Caught SIGINT (1/3), ignoring...' in obs.content
assert '[1]+' and 'Stopped' in obs.content
assert (
'[Command timed out after 2 seconds. SIGINT was sent to interrupt the command, but failed. The command was killed.]'
in obs.content
)
# Normal command should still work
action = CmdRunAction(command='ls')
action.timeout = 10
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert '/workspace' in obs.content
assert 'resistant_script.sh' in obs.content
finally:
_close_test_runtime(runtime)
def test_multiline_commands(temp_dir, box_class):
runtime = _load_runtime(temp_dir, box_class)
try: