From 899c1f8360bafafe3d575664e714223f68a401b3 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 17 Jan 2025 14:31:23 -0500 Subject: [PATCH] fix(bash): also show timeout reminder when no_change_timeout is triggered (#6318) Co-authored-by: Robert Brennan --- evaluation/benchmarks/swe_bench/run_infer.py | 2 +- .../tests/t07_interactive_commands.py | 73 +++++++++++ evaluation/utils/shared.py | 33 +++-- .../codeact_agent/function_calling.py | 13 +- openhands/events/action/commands.py | 8 +- openhands/runtime/base.py | 5 +- .../action_execution_client.py | 6 + .../runtime/impl/remote/remote_runtime.py | 40 ++++-- openhands/runtime/utils/bash.py | 120 +++++++++++------- tests/runtime/test_bash.py | 87 ++++++++++--- tests/unit/test_action_serialization.py | 2 + tests/unit/test_bash_session.py | 50 ++++---- tests/unit/test_security.py | 1 + 13 files changed, 324 insertions(+), 116 deletions(-) create mode 100644 evaluation/integration_tests/tests/t07_interactive_commands.py diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index 8375295b99..511a22c4ed 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -359,7 +359,7 @@ def complete_runtime( action = CmdRunAction( command=f'git diff --no-color --cached {instance["base_commit"]}' ) - action.timeout = max(300 + 100 * n_retries, 600) + action.set_hard_timeout(max(300 + 100 * n_retries, 600)) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/evaluation/integration_tests/tests/t07_interactive_commands.py b/evaluation/integration_tests/tests/t07_interactive_commands.py new file mode 100644 index 0000000000..24a66d3f38 --- /dev/null +++ b/evaluation/integration_tests/tests/t07_interactive_commands.py @@ -0,0 +1,73 @@ +import hashlib + +from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult +from openhands.events.action import ( + AgentFinishAction, + FileWriteAction, + MessageAction, +) +from openhands.events.event import Event +from openhands.events.observation import AgentDelegateObservation +from openhands.runtime.base import Runtime + + +class Test(BaseIntegrationTest): + INSTRUCTION = 'Execute the python script /workspace/python_script.py with input "John" and "25" and tell me the secret number.' + SECRET_NUMBER = int(hashlib.sha256(str(25).encode()).hexdigest()[:8], 16) % 1000 + + @classmethod + def initialize_runtime(cls, runtime: Runtime) -> None: + from openhands.core.logger import openhands_logger as logger + + action = FileWriteAction( + path='/workspace/python_script.py', + content=( + 'name = input("Enter your name: "); age = input("Enter your age: "); ' + 'import hashlib; secret = int(hashlib.sha256(str(age).encode()).hexdigest()[:8], 16) % 1000; ' + 'print(f"Hello {name}, you are {age} years old. Tell you a secret number: {secret}")' + ), + ) + logger.info(action, extra={'msg_type': 'ACTION'}) + observation = runtime.run_action(action) + logger.info(observation, extra={'msg_type': 'OBSERVATION'}) + + @classmethod + def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: + from openhands.core.logger import openhands_logger as logger + + # check if the license information is in any message + message_actions = [ + event + for event in histories + if isinstance( + event, (MessageAction, AgentFinishAction, AgentDelegateObservation) + ) + ] + logger.info(f'Total message-like events: {len(message_actions)}') + + for event in message_actions: + try: + if isinstance(event, AgentDelegateObservation): + content = event.content + elif isinstance(event, AgentFinishAction): + content = event.outputs.get('content', '') + if event.thought: + content += f'\n\n{event.thought}' + elif isinstance(event, MessageAction): + content = event.content + else: + logger.warning(f'Unexpected event type: {type(event)}') + continue + + if str(cls.SECRET_NUMBER) in content: + return TestResult(success=True) + except Exception as e: + logger.error(f'Error processing event: {e}') + + logger.debug( + f'Total messages: {len(message_actions)}. Messages: {message_actions}' + ) + return TestResult( + success=False, + reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.', + ) diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 23c5b319d3..5b3ce8c8bd 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -371,7 +371,6 @@ def _process_instance_wrapper( error = str(e) stacktrace = traceback.format_exc() if attempt == max_retries: - logger.exception(e) msg = ( '-' * 10 + '\n' @@ -395,19 +394,13 @@ def _process_instance_wrapper( + '-' * 10 + '\n' ) - if isinstance( - e, - ( - AgentRuntimeDisconnectedError, - AgentRuntimeUnavailableError, - AgentRuntimeNotFoundError, - ), - ): + # e is likely an EvalException, so we can't directly infer it from type + # but rather check if it's a fatal error + if is_fatal_runtime_error(str(e)): runtime_failure_count += 1 msg += f'Runtime disconnected error detected for instance {instance.instance_id}, runtime failure count: {runtime_failure_count}' + msg += '\n' + '-' * 10 + '\n' logger.error(msg) - if use_mp: - print(msg) # use print to directly print to console time.sleep(5) @@ -564,6 +557,7 @@ def is_fatal_evaluation_error(error: str | None) -> bool: AgentRuntimeNotReadyError, AgentRuntimeDisconnectedError, AgentRuntimeNotFoundError, + ConnectionError, ] if any(exception.__name__ in error for exception in FATAL_EXCEPTIONS): @@ -573,6 +567,23 @@ def is_fatal_evaluation_error(error: str | None) -> bool: return False +def is_fatal_runtime_error(error: str | None) -> bool: + if not error: + return False + + FATAL_RUNTIME_ERRORS = [ + AgentRuntimeUnavailableError, + AgentRuntimeDisconnectedError, + AgentRuntimeNotFoundError, + ] + + if any(exception.__name__ in error for exception in FATAL_RUNTIME_ERRORS): + logger.error(f'Fatal runtime error detected: {error}') + return True + + return False + + def get_metrics(state: State) -> dict[str, Any]: """Extract metrics from the state.""" metrics = state.metrics.get() if state.metrics else {} diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index d06cf01cd3..af07393dd4 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -31,7 +31,7 @@ from openhands.events.tool import ToolCallMetadata _BASH_DESCRIPTION = """Execute a bash command in the terminal. * Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`. -* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command like `C-c` (Ctrl+C) to interrupt the process. +* Interact with running process: If a bash command returns exit code `-1`, this means the process is not yet finished. By setting `is_input` to `true`, the assistant can interact with the running process and send empty `command` to retrieve any additional logs, or send additional text (set `command` to the text) to STDIN of the running process, or send command like `C-c` (Ctrl+C), `C-d` (Ctrl+D), `C-z` (Ctrl+Z) to interrupt the process. """ CmdRunTool = ChatCompletionToolParam( @@ -46,6 +46,11 @@ CmdRunTool = ChatCompletionToolParam( 'type': 'string', 'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process.', }, + 'is_input': { + 'type': 'string', + 'description': 'If True, the command is an input to the running process. If False, the command is a bash command to be executed in the terminal. Default is False.', + 'enum': ['true', 'false'], + }, }, 'required': ['command'], }, @@ -488,6 +493,12 @@ def response_to_actions(response: ModelResponse) -> list[Action]: f'Failed to parse tool call arguments: {tool_call.function.arguments}' ) from e if tool_call.function.name == 'execute_bash': + # this is an LLM error: add empty command to avoid breaking the tool call + if 'command' not in arguments: + arguments['command'] = '' + # convert is_input to boolean + if 'is_input' in arguments: + arguments['is_input'] = arguments['is_input'] == 'true' action = CmdRunAction(**arguments) elif tool_call.function.name == 'execute_ipython_cell': action = IPythonRunCellAction(**arguments) diff --git a/openhands/events/action/commands.py b/openhands/events/action/commands.py index 67b586f86a..ab5c77de29 100644 --- a/openhands/events/action/commands.py +++ b/openhands/events/action/commands.py @@ -11,8 +11,10 @@ from openhands.events.action.action import ( @dataclass class CmdRunAction(Action): - command: str - # When `command` is empty, it will be used to print the current tmux window + command: ( + str # When `command` is empty, it will be used to print the current tmux window + ) + is_input: bool = False # if True, the command is an input to the running process thought: str = '' blocking: bool = False # If blocking is True, the command will be run in a blocking manner. @@ -28,7 +30,7 @@ class CmdRunAction(Action): return f'Running command: {self.command}' def __str__(self) -> str: - ret = f'**CmdRunAction (source={self.source})**\n' + ret = f'**CmdRunAction (source={self.source}, is_input={self.is_input})**\n' if self.thought: ret += f'THOUGHT: {self.thought}\n' ret += f'COMMAND:\n{self.command}' diff --git a/openhands/runtime/base.py b/openhands/runtime/base.py index 0505203cf6..4e5729ac77 100644 --- a/openhands/runtime/base.py +++ b/openhands/runtime/base.py @@ -197,9 +197,10 @@ class Runtime(FileEditRuntimeMixin): e, AgentRuntimeDisconnectedError ): err_id = 'STATUS$ERROR_RUNTIME_DISCONNECTED' - self.log('error', f'Unexpected error while running action: {str(e)}') + error_message = f'{type(e).__name__}: {str(e)}' + self.log('error', f'Unexpected error while running action: {error_message}') self.log('error', f'Problematic action: {str(event)}') - self.send_error_message(err_id, str(e)) + self.send_error_message(err_id, error_message) self.close() return diff --git a/openhands/runtime/impl/action_execution/action_execution_client.py b/openhands/runtime/impl/action_execution/action_execution_client.py index a2455dd2b6..38afc54487 100644 --- a/openhands/runtime/impl/action_execution/action_execution_client.py +++ b/openhands/runtime/impl/action_execution/action_execution_client.py @@ -59,6 +59,7 @@ class ActionExecutionClient(Runtime): self.session = HttpSession() self.action_semaphore = threading.Semaphore(1) # Ensure one action at a time self._runtime_initialized: bool = False + self._runtime_closed: bool = False self._vscode_token: str | None = None # initial dummy value super().__init__( config, @@ -283,4 +284,9 @@ class ActionExecutionClient(Runtime): return self.send_action_for_execution(action) def close(self) -> None: + # Make sure we don't close the session multiple times + # Can happen in evaluation + if self._runtime_closed: + return + self._runtime_closed = True self.session.close() diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index 2842a7b50e..41e1620e29 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -13,6 +13,7 @@ from openhands.core.exceptions import ( AgentRuntimeNotReadyError, AgentRuntimeUnavailableError, ) +from openhands.core.logger import openhands_logger as logger from openhands.events import EventStream from openhands.runtime.builder.remote import RemoteRuntimeBuilder from openhands.runtime.impl.action_execution.action_execution_client import ( @@ -75,6 +76,10 @@ class RemoteRuntime(ActionExecutionClient): self.available_hosts: dict[str, int] = {} self._runtime_initialized: bool = False + def log(self, level: str, message: str) -> None: + message = f'[runtime session_id={self.sid} runtime_id={self.runtime_id or "unknown"}] {message}' + getattr(logger, level)(message, stacklevel=2) + def _get_action_execution_server_host(self): return self.runtime_url @@ -350,20 +355,33 @@ class RemoteRuntime(ActionExecutionClient): super().close() return try: - with self._send_runtime_api_request( - 'POST', - f'{self.config.sandbox.remote_runtime_api_url}/stop', - json={'runtime_id': self.runtime_id}, - ): - self.log('debug', 'Runtime stopped.') + if not self._runtime_closed: + with self._send_runtime_api_request( + 'POST', + f'{self.config.sandbox.remote_runtime_api_url}/stop', + json={'runtime_id': self.runtime_id}, + ): + self.log('debug', 'Runtime stopped.') except Exception as e: raise e finally: super().close() def _send_runtime_api_request(self, method, url, **kwargs): - return send_request(self.session, method, url, **kwargs) + try: + return send_request(self.session, method, url, **kwargs) + except requests.Timeout: + self.log( + 'error', + f'No response received within the timeout period for url: {url}', + ) + raise + @tenacity.retry( + retry=tenacity.retry_if_exception_type(ConnectionError), + stop=tenacity.stop_after_attempt(3) | stop_if_should_exit(), + wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), + ) def _send_action_server_request(self, method, url, **kwargs): try: return super()._send_action_server_request(method, url, **kwargs) @@ -375,14 +393,14 @@ class RemoteRuntime(ActionExecutionClient): raise except requests.HTTPError as e: - if e.response.status_code in (404, 502): + if e.response.status_code in (404, 502, 504): if e.response.status_code == 404: raise AgentRuntimeDisconnectedError( - 'Runtime is not responding. This may be temporary, please try again.' + f'Runtime is not responding. This may be temporary, please try again. Original error: {e}' ) from e - else: # 502 + else: # 502, 504 raise AgentRuntimeDisconnectedError( - 'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again.' + f'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again. Original error: {e}' ) from e elif e.response.status_code == 503: self.log('warning', 'Runtime appears to be paused. Resuming...') diff --git a/openhands/runtime/utils/bash.py b/openhands/runtime/utils/bash.py index 87b2ae405f..9da848c44d 100644 --- a/openhands/runtime/utils/bash.py +++ b/openhands/runtime/utils/bash.py @@ -461,22 +461,28 @@ class BashSession: # Strip the command of any leading/trailing whitespace logger.debug(f'RECEIVED ACTION: {action}') command = action.command.strip() - is_special_key = self._is_special_key(command) + is_input: bool = action.is_input - # Handle when prev command is hard timeout - - if command == '' and self.prev_status not in { + # If the previous command is not completed, we need to check if the command is empty + if self.prev_status not in { BashCommandStatus.CONTINUE, BashCommandStatus.NO_CHANGE_TIMEOUT, BashCommandStatus.HARD_TIMEOUT, }: - return CmdOutputObservation( - content='ERROR: No previous command to continue from. ' - + 'Previous command has to be timeout to be continued.', - command='', - metadata=CmdOutputMetadata(), - ) + if command == '': + return CmdOutputObservation( + content='ERROR: No previous running command to retrieve logs from.', + command='', + metadata=CmdOutputMetadata(), + ) + if is_input: + return CmdOutputObservation( + content='ERROR: No previous running command to interact with.', + command='', + metadata=CmdOutputMetadata(), + ) + # Check if the command is a single command or multiple commands splited_commands = split_bash_commands(command) if len(splited_commands) > 1: return ErrorObservation( @@ -491,46 +497,62 @@ class BashSession: last_change_time = start_time last_pane_output = self._get_pane_content() - # Do not check hard timeout if the command is a special key - if command != '' and is_special_key: - logger.debug(f'SENDING SPECIAL KEY: {command!r}') - self.pane.send_keys(command, enter=False) - # When prev command is hard timeout, and we are trying to execute new command - elif self.prev_status == BashCommandStatus.HARD_TIMEOUT and command != '': - if not last_pane_output.endswith(CMD_OUTPUT_PS1_END): - _ps1_matches = CmdOutputMetadata.matches_ps1_metadata(last_pane_output) - raw_command_output = self._combine_outputs_between_matches( - last_pane_output, _ps1_matches - ) - metadata = CmdOutputMetadata() # No metadata available - metadata.suffix = ( - f'\n[Your command "{command}" is NOT executed. ' - f'The previous command was timed out but still running. Above is the output of the previous command. ' - "You may wait longer to see additional output of the previous command by sending empty command '', " - 'send other commands to interact with the current process, ' - 'or send keys ("C-c", "C-z", "C-d") to interrupt/kill the previous command before sending your new command.]' - ) - command_output = self._get_command_output( - command, - raw_command_output, - metadata, - continue_prefix='[Below is the output of the previous command.]\n', - ) - return CmdOutputObservation( - command=command, - content=command_output, - metadata=metadata, - ) - # Only send the command to the pane if it's not a special key and it's not empty - # AND previous hard timeout command is resolved - elif command != '' and not is_special_key: - # convert command to raw string - command = escape_bash_special_chars(command) - logger.debug(f'SENDING COMMAND: {command!r}') - self.pane.send_keys( - command, - enter=True, + # When prev command is still running, and we are trying to send a new command + if ( + self.prev_status + in { + BashCommandStatus.HARD_TIMEOUT, + BashCommandStatus.NO_CHANGE_TIMEOUT, + } + and not last_pane_output.endswith( + CMD_OUTPUT_PS1_END + ) # prev command is not completed + and not is_input + and command != '' # not input and not empty command + ): + _ps1_matches = CmdOutputMetadata.matches_ps1_metadata(last_pane_output) + raw_command_output = self._combine_outputs_between_matches( + last_pane_output, _ps1_matches ) + metadata = CmdOutputMetadata() # No metadata available + metadata.suffix = ( + f'\n[Your command "{command}" is NOT executed. ' + f'The previous command is still running - You CANNOT send new commands until the previous command is completed. ' + 'By setting `is_input` to `true`, you can interact with the current process: ' + "You may wait longer to see additional output of the previous command by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys ("C-c", "C-z", "C-d") to interrupt/kill the previous command before sending your new command.]' + ) + logger.debug(f'PREVIOUS COMMAND OUTPUT: {raw_command_output}') + command_output = self._get_command_output( + command, + raw_command_output, + metadata, + continue_prefix='[Below is the output of the previous command.]\n', + ) + return CmdOutputObservation( + command=command, + content=command_output, + metadata=metadata, + ) + + # Send actual command/inputs to the pane + if command != '': + is_special_key = self._is_special_key(command) + if is_input: + logger.debug(f'SENDING INPUT TO RUNNING PROCESS: {command!r}') + self.pane.send_keys( + command, + enter=not is_special_key, + ) + else: + # convert command to raw string + command = escape_bash_special_chars(command) + logger.debug(f'SENDING COMMAND: {command!r}') + self.pane.send_keys( + command, + enter=not is_special_key, + ) # Loop until the command completes or times out while should_continue(): diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py index 828c859f11..4af28d9065 100644 --- a/tests/runtime/test_bash.py +++ b/tests/runtime/test_bash.py @@ -57,7 +57,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands): in obs.metadata.suffix ) - action = CmdRunAction(command='C-c') + action = CmdRunAction(command='C-c', is_input=True) action.set_hard_timeout(30) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -571,7 +571,7 @@ def test_interactive_command(temp_dir, runtime_cls, run_as_openhands): assert 'Enter name:' in obs.content assert '[The command has no new output after 1 seconds.' in obs.metadata.suffix - action = CmdRunAction('John') + action = CmdRunAction('John', is_input=True) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert 'Hello John' in obs.content @@ -741,10 +741,7 @@ def test_long_running_command_follow_by_execute( logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '3' not in obs.content assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' - assert ( - 'The previous command was timed out but still running.' - in obs.metadata.suffix - ) + assert 'The previous command is still running' in obs.metadata.suffix assert obs.metadata.exit_code == -1 # -1 indicates command is still running # Finally continue again @@ -763,7 +760,9 @@ def test_empty_command_errors(temp_dir, runtime_cls, run_as_openhands): # Test empty command without previous command obs = runtime.run_action(CmdRunAction('')) assert isinstance(obs, CmdOutputObservation) - assert 'ERROR: No previous command to continue from' in obs.content + assert ( + 'ERROR: No previous running command to retrieve logs from.' in obs.content + ) finally: _close_test_runtime(runtime) @@ -781,13 +780,52 @@ def test_python_interactive_input(temp_dir, runtime_cls, run_as_openhands): assert obs.metadata.exit_code == -1 # -1 indicates command is still running # Send first input (name) - obs = runtime.run_action(CmdRunAction('Alice')) + obs = runtime.run_action(CmdRunAction('Alice', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert 'Enter your age:' in obs.content assert obs.metadata.exit_code == -1 # Send second input (age) - obs = runtime.run_action(CmdRunAction('25')) + obs = runtime.run_action(CmdRunAction('25', is_input=True)) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Hello Alice, you are 25 years old' in obs.content + assert obs.metadata.exit_code == 0 + assert '[The command completed with exit code 0.]' in obs.metadata.suffix + finally: + _close_test_runtime(runtime) + + +def test_python_interactive_input_without_set_input( + temp_dir, runtime_cls, run_as_openhands +): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Test Python program that asks for input - properly escaped for bash + python_script = """name = input('Enter your name: '); age = input('Enter your age: '); print(f'Hello {name}, you are {age} years old')""" + + # Start Python with the interactive script + obs = runtime.run_action(CmdRunAction(f'python3 -c "{python_script}"')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Enter your name:' in obs.content + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + + # Send first input (name) + obs = runtime.run_action(CmdRunAction('Alice', is_input=False)) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Enter your age:' not in obs.content + assert ( + 'Your command "Alice" is NOT executed. The previous command is still running' + in obs.metadata.suffix + ) + assert obs.metadata.exit_code == -1 + + # Try again now with input + obs = runtime.run_action(CmdRunAction('Alice', is_input=True)) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Enter your age:' in obs.content + assert obs.metadata.exit_code == -1 + + obs = runtime.run_action(CmdRunAction('25', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert 'Hello Alice, you are 25 years old' in obs.content assert obs.metadata.exit_code == 0 @@ -844,7 +882,7 @@ def test_stress_long_output_with_soft_and_hard_timeout( assert obs.exit_code == -1 # Command is still running, waiting for input # Send the confirmation - action = CmdRunAction('Y') + action = CmdRunAction('Y', is_input=True) obs = runtime.run_action(action) assert 'Proceeding with operation...' in obs.content assert 'Operation completed successfully!' in obs.content @@ -869,13 +907,10 @@ def test_stress_long_output_with_soft_and_hard_timeout( # where it will not accept any new commands. obs = runtime.run_action(CmdRunAction('ls')) assert obs.exit_code == -1 - assert ( - 'The previous command was timed out but still running.' - in obs.metadata.suffix - ) + assert 'The previous command is still running' in obs.metadata.suffix # We need to send a Ctrl+C to reset the terminal. - obs = runtime.run_action(CmdRunAction('C-c')) + obs = runtime.run_action(CmdRunAction('C-c', is_input=True)) assert obs.exit_code == 130 # Now make sure the terminal is in a good state @@ -887,3 +922,25 @@ def test_stress_long_output_with_soft_and_hard_timeout( finally: _close_test_runtime(runtime) + + +def test_bash_remove_prefix(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # create a git repo + action = CmdRunAction( + 'git init && git remote add origin https://github.com/All-Hands-AI/OpenHands' + ) + obs = runtime.run_action(action) + # logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.metadata.exit_code == 0 + + # Start Python with the interactive script + obs = runtime.run_action(CmdRunAction('git remote -v')) + # logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.metadata.exit_code == 0 + assert 'https://github.com/All-Hands-AI/OpenHands' in obs.content + assert 'git remote -v' not in obs.content + + finally: + _close_test_runtime(runtime) diff --git a/tests/unit/test_action_serialization.py b/tests/unit/test_action_serialization.py index 374d5eee01..32b29e44b2 100644 --- a/tests/unit/test_action_serialization.py +++ b/tests/unit/test_action_serialization.py @@ -97,6 +97,7 @@ def test_cmd_run_action_serialization_deserialization(): 'args': { 'blocking': False, 'command': 'echo "Hello world"', + 'is_input': False, 'thought': '', 'hidden': False, 'confirmation_state': ActionConfirmationStatus.CONFIRMED, @@ -181,3 +182,4 @@ def test_legacy_serialization(): assert event_dict['args']['blocking'] is False assert event_dict['args']['command'] == 'echo "Hello world"' assert event_dict['args']['thought'] == '' + assert event_dict['args']['is_input'] is False diff --git a/tests/unit/test_bash_session.py b/tests/unit/test_bash_session.py index fc29eaffb2..e5e0bf22b7 100644 --- a/tests/unit/test_bash_session.py +++ b/tests/unit/test_bash_session.py @@ -1,5 +1,6 @@ import os import tempfile +import time from openhands.core.logger import openhands_logger as logger from openhands.events.action import CmdRunAction @@ -91,7 +92,7 @@ def test_long_running_command_follow_by_execute(): assert obs.metadata.prefix == '' # Continue watching output - obs = session.execute(CmdRunAction('')) + obs = session.execute(CmdRunAction('', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '2' in obs.content assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' @@ -107,14 +108,20 @@ def test_long_running_command_follow_by_execute(): # Test command that produces no output obs = session.execute(CmdRunAction('sleep 15')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '3' not in obs.content + assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' + assert 'The previous command is still running' in obs.metadata.suffix + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + time.sleep(3) + + # Run it again, this time it should produce output + obs = session.execute(CmdRunAction('sleep 15')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '3' in obs.content assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' - assert obs.metadata.suffix == ( - '\n[The command has no new output after 2 seconds. ' - "You may wait longer to see additional output by sending empty command '', " - 'send other commands to interact with the current process, ' - 'or send keys to interrupt/kill the command.]' - ) + assert 'The previous command is still running' in obs.metadata.suffix assert obs.metadata.exit_code == -1 # -1 indicates command is still running assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT @@ -144,7 +151,7 @@ def test_interactive_command(): assert obs.metadata.prefix == '' # Send input - obs = session.execute(CmdRunAction('John')) + obs = session.execute(CmdRunAction('John', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert 'Hello John' in obs.content assert obs.metadata.exit_code == 0 @@ -165,7 +172,7 @@ def test_interactive_command(): ) assert obs.metadata.prefix == '' - obs = session.execute(CmdRunAction('line 1')) + obs = session.execute(CmdRunAction('line 1', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.metadata.exit_code == -1 assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT @@ -177,7 +184,7 @@ def test_interactive_command(): ) assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' - obs = session.execute(CmdRunAction('line 2')) + obs = session.execute(CmdRunAction('line 2', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.metadata.exit_code == -1 assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT @@ -189,7 +196,7 @@ def test_interactive_command(): ) assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' - obs = session.execute(CmdRunAction('EOF')) + obs = session.execute(CmdRunAction('EOF', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert 'line 1' in obs.content and 'line 2' in obs.content assert obs.metadata.exit_code == 0 @@ -220,7 +227,7 @@ def test_ctrl_c(): assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT # Send Ctrl+C - obs = session.execute(CmdRunAction('C-c')) + obs = session.execute(CmdRunAction('C-c', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.metadata.exit_code == 130 # Standard exit code for Ctrl+C assert ( @@ -240,10 +247,7 @@ def test_empty_command_errors(): # Test empty command without previous command obs = session.execute(CmdRunAction('')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert ( - obs.content - == 'ERROR: No previous command to continue from. Previous command has to be timeout to be continued.' - ) + assert obs.content == 'ERROR: No previous running command to retrieve logs from.' assert obs.metadata.exit_code == -1 assert obs.metadata.prefix == '' assert obs.metadata.suffix == '' @@ -264,14 +268,14 @@ def test_command_output_continuation(): assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT - obs = session.execute(CmdRunAction('')) + obs = session.execute(CmdRunAction('', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '2' assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT - obs = session.execute(CmdRunAction('')) + obs = session.execute(CmdRunAction('', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '3' @@ -279,21 +283,21 @@ def test_command_output_continuation(): assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT - obs = session.execute(CmdRunAction('')) + obs = session.execute(CmdRunAction('', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '4' assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT - obs = session.execute(CmdRunAction('')) + obs = session.execute(CmdRunAction('', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '5' assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT - obs = session.execute(CmdRunAction('')) + obs = session.execute(CmdRunAction('', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '[The command completed with exit code 0.]' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.COMPLETED @@ -367,14 +371,14 @@ def test_python_interactive_input(): assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT # Send first input (name) - obs = session.execute(CmdRunAction('Alice')) + obs = session.execute(CmdRunAction('Alice', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert 'Enter your age:' in obs.content assert obs.metadata.exit_code == -1 assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT # Send second input (age) - obs = session.execute(CmdRunAction('25')) + obs = session.execute(CmdRunAction('25', is_input=True)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert 'Hello Alice, you are 25 years old' in obs.content assert obs.metadata.exit_code == 0 diff --git a/tests/unit/test_security.py b/tests/unit/test_security.py index 0e2b3c8a0d..b3d7ce748d 100644 --- a/tests/unit/test_security.py +++ b/tests/unit/test_security.py @@ -367,6 +367,7 @@ async def test_unsafe_bash_command(temp_dir: str): arguments={ 'blocking': False, 'command': 'ls', + 'is_input': False, 'hidden': False, 'confirmation_state': ActionConfirmationStatus.CONFIRMED, },