Migrate multi-line-bash-related sandbox tests into runtime tests and fix multi-line issue (#3128)

* Remove global config from memory * Remove runtime global config * Remove from storage * Remove global config * Fix event stream tests * Fix sandbox issue * Change config * Removed transferred tests * Add swe env box * Fixes on testing * Fixed some tests * Merge with stashed changes * Fix typing * Fix ipython test * Revive function * Make temp_dir fixture * Remove test to avoid circular import * fix eventstream filestore for test_runtime * fix parse arg issue that cause integration test to fail * support swebench pull from custom namespace * add back simple tests for runtime * move multi-line bash tests to test_runtime; support multi-line bash for esruntime; * add testcase to handle PS2 prompt * use bashlex for bash parsing to handle multi-line commands; add testcases for multi-line commands * revert ghcr runtime change --------- Co-authored-by: Graham Neubig <neubig@gmail.com>
2025-12-26 05:48:36 +08:00 · 2024-07-28 04:12:57 +08:00 · 2024-07-28 04:12:57 +08:00 · b1ea204c5b
commit b1ea204c5b
parent 8b77e8a0ff
11 changed files with 608 additions and 150 deletions
--- a/opendevin/core/logger.py
+++ b/opendevin/core/logger.py
@ -9,7 +9,7 @@ from typing import Literal, Mapping
 from termcolor import colored

 DISABLE_COLOR_PRINTING = False
-DEBUG = False
+DEBUG = os.getenv('DEBUG', 'False').lower() in ['true', '1', 'yes']

 ColorType = Literal[
    'red',
--- a/opendevin/runtime/client/client.py
+++ b/opendevin/runtime/client/client.py
@ -46,6 +46,7 @@ from opendevin.runtime.plugins import (
    Plugin,
 )
 from opendevin.runtime.server.files import insert_lines, read_lines
+from opendevin.runtime.utils import split_bash_commands

 app = FastAPI()

@ -79,7 +80,7 @@ class RuntimeClient:
            r'\[PEXPECT_BEGIN\] ([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+) \[PEXPECT_END\]'
        )

-        self.shell.sendline(f'export PS1="{self.__bash_PS1}"')
+        self.shell.sendline(f'export PS1="{self.__bash_PS1}"; export PS2=""')
        self.shell.expect(self.__bash_expect_regex)

        self.shell.sendline(f'cd {work_dir}')
@ -87,6 +88,15 @@ class RuntimeClient:

    def _get_bash_prompt(self):
        ps1 = self.shell.after
+
+        # begin at the last occurrence of '[PEXPECT_BEGIN]'.
+        # In multi-line bash commands, the prompt will be repeated
+        # and the matched regex captures all of them
+        # - we only want the last one (newest prompt)
+        _begin_pos = ps1.rfind('[PEXPECT_BEGIN]')
+        if _begin_pos != -1:
+            ps1 = ps1[_begin_pos:]
+
        # parse the ps1 to get username, hostname, and working directory
        matched = re.match(self.__bash_expect_regex, ps1)
        assert (
@ -102,7 +112,7 @@ class RuntimeClient:
            prompt += '$'
        return prompt + ' '

-    def _execute_bash(self, command, keep_prompt: bool = True) -> tuple[str, int]:
+    def _execute_bash(self, command: str, keep_prompt: bool = True) -> tuple[str, int]:
        logger.debug(f'Executing command: {command}')
        self.shell.sendline(command)
        self.shell.expect(self.__bash_expect_regex)
@ -129,10 +139,22 @@ class RuntimeClient:

    async def run(self, action: CmdRunAction) -> CmdOutputObservation:
        try:
-            output, exit_code = self._execute_bash(action.command)
+            commands = split_bash_commands(action.command)
+            all_output = ''
+            for command in commands:
+                output, exit_code = self._execute_bash(command)
+                if all_output:
+                    # previous output already exists with prompt "user@hostname:working_dir #";
+                    # we need to add the command to the previous output,
+                    # so the model knows the following is the output of another action
+                    all_output = all_output.rstrip() + ' ' + command + '\r\n'
+
+                all_output += str(output) + '\r\n'
+                if exit_code != 0:
+                    break
            return CmdOutputObservation(
                command_id=-1,
-                content=str(output),
+                content=all_output.rstrip('\r\n'),
                command=action.command,
                exit_code=exit_code,
            )
--- a/opendevin/runtime/client/runtime.py
+++ b/opendevin/runtime/client/runtime.py
@ -58,7 +58,7 @@ class EventStreamRuntime(Runtime):
        # TODO: We can switch to aiodocker when `get_od_sandbox_image` is updated to use aiodocker
        self.docker_client: docker.DockerClient = self._init_docker_client()
        self.container_image = (
-            config.sandbox.container_image
+            self.config.sandbox.container_image
            if container_image is None
            else container_image
        )
@ -103,7 +103,7 @@ class EventStreamRuntime(Runtime):
    async def _init_container(
        self,
        sandbox_workspace_dir: str,
-        mount_dir: str,
+        mount_dir: str | None = None,
        plugins: list[PluginRequirement] | None = None,
    ):
        try:
@ -124,6 +124,14 @@ class EventStreamRuntime(Runtime):
            else:
                port_mapping = {f'{self._port}/tcp': self._port}

+            if mount_dir is not None:
+                volumes = {mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}}
+            else:
+                logger.warn(
+                    'Mount dir is not set, will not mount the workspace directory to the container.'
+                )
+                volumes = None
+
            container = self.docker_client.containers.run(
                self.container_image,
                command=(
@ -139,7 +147,7 @@ class EventStreamRuntime(Runtime):
                name=self.container_name,
                detach=True,
                environment={'DEBUG': 'true'} if self.config.debug else None,
-                volumes={mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}},
+                volumes=volumes,
            )
            logger.info(f'Container started. Server url: {self.api_url}')
            return container
--- a/opendevin/runtime/runtime.py
+++ b/opendevin/runtime/runtime.py
@ -33,13 +33,13 @@ from opendevin.runtime.tools import RuntimeTool
 from opendevin.storage import FileStore


-def _default_env_vars(config: SandboxConfig) -> dict[str, str]:
+def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
    ret = {}
    for key in os.environ:
        if key.startswith('SANDBOX_ENV_'):
            sandbox_key = key.removeprefix('SANDBOX_ENV_')
            ret[sandbox_key] = os.environ[key]
-    if config.enable_auto_lint:
+    if sandbox_config.enable_auto_lint:
        ret['ENABLE_AUTO_LINT'] = 'true'
    return ret

--- a/opendevin/runtime/server/runtime.py
+++ b/opendevin/runtime/server/runtime.py
@ -115,7 +115,7 @@ class ServerRuntime(Runtime):

    async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
        self._run_command(
-            ("cat > /tmp/opendevin_jupyter_temp.py <<'EOL'\n" f'{action.code}\n' 'EOL'),
+            f"cat > /tmp/opendevin_jupyter_temp.py <<'EOL'\n{action.code}\nEOL"
        )

        # run the code
--- a/opendevin/runtime/utils/bash.py
+++ b/opendevin/runtime/utils/bash.py
@ -1,87 +1,49 @@
+import bashlex
+
+from opendevin.core.logger import opendevin_logger as logger
+
+
 def split_bash_commands(commands):
-    # States
-    NORMAL = 0
-    IN_SINGLE_QUOTE = 1
-    IN_DOUBLE_QUOTE = 2
-    IN_HEREDOC = 3
+    try:
+        parsed = bashlex.parse(commands)
+    except bashlex.errors.ParsingError as e:
+        logger.error(
+            f'Failed to parse bash commands\n[input]: {commands}\n[error]: {e}'
+        )
+        # If parsing fails, return the original commands
+        return [commands]

-    state = NORMAL
-    heredoc_trigger = None
-    result = []
-    current_command: list[str] = []
+    result: list[str] = []
+    last_end = 0

-    i = 0
-    while i < len(commands):
-        char = commands[i]
+    for node in parsed:
+        start, end = node.pos

-        if state == NORMAL:
-            if char == "'":
-                state = IN_SINGLE_QUOTE
-            elif char == '"':
-                state = IN_DOUBLE_QUOTE
-            elif char == '\\':
-                # Check if this is escaping a newline
-                if i + 1 < len(commands) and commands[i + 1] == '\n':
-                    i += 1  # Skip the newline
-                    # Continue with the next line as part of the same command
-                    i += 1  # Move to the first character of the next line
-                    continue
-            elif char == '\n':
-                if not heredoc_trigger and current_command:
-                    result.append(''.join(current_command).strip())
-                    current_command = []
-            elif char == '<' and commands[i : i + 2] == '<<':
-                # Detect heredoc
-                state = IN_HEREDOC
-                i += 2  # Skip '<<'
-                while commands[i] == ' ':
-                    i += 1
-                start = i
-                while commands[i] not in [' ', '\n']:
-                    i += 1
-                heredoc_trigger = commands[start:i]
-                current_command.append(commands[start - 2 : i])  # Include '<<'
-                continue  # Skip incrementing i at the end of the loop
-            current_command.append(char)
+        # Include any text between the last command and this one
+        if start > last_end:
+            between = commands[last_end:start]
+            logger.debug(f'BASH PARSING between: {between}')
+            if result:
+                result[-1] += between.rstrip()
+            elif between.strip():
+                # THIS SHOULD NOT HAPPEN
+                result.append(between.rstrip())

-        elif state == IN_SINGLE_QUOTE:
-            current_command.append(char)
-            if char == "'" and commands[i - 1] != '\\':
-                state = NORMAL
+        # Extract the command, preserving original formatting
+        command = commands[start:end].rstrip()
+        logger.debug(f'BASH PARSING command: {command}')
+        result.append(command)

-        elif state == IN_DOUBLE_QUOTE:
-            current_command.append(char)
-            if char == '"' and commands[i - 1] != '\\':
-                state = NORMAL
-
-        elif state == IN_HEREDOC:
-            current_command.append(char)
-            if (
-                char == '\n'
-                and heredoc_trigger
-                and commands[i + 1 : i + 1 + len(heredoc_trigger) + 1]
-                == heredoc_trigger + '\n'
-            ):
-                # Check if the next line starts with the heredoc trigger followed by a newline
-                i += (
-                    len(heredoc_trigger) + 1
-                )  # Move past the heredoc trigger and newline
-                current_command.append(
-                    heredoc_trigger + '\n'
-                )  # Include the heredoc trigger and newline
-                result.append(''.join(current_command).strip())
-                current_command = []
-                heredoc_trigger = None
-                state = NORMAL
-                continue
-
-        i += 1
-
-    # Add the last command if any
-    if current_command:
-        result.append(''.join(current_command).strip())
-
-    # Remove any empty strings from the result
-    result = [cmd for cmd in result if cmd]
+        last_end = end

+    # Add any remaining text after the last command to the last command
+    remaining = commands[last_end:].rstrip()
+    logger.debug(f'BASH PARSING remaining: {remaining}')
+    if last_end < len(commands) and result:
+        result[-1] += remaining
+        logger.debug(f'BASH PARSING result[-1] += remaining: {result[-1]}')
+    elif last_end < len(commands):
+        if remaining:
+            result.append(remaining)
+            logger.debug(f'BASH PARSING result.append(remaining): {result[-1]}')
    return result
--- a/poetry.lock
+++ b/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.

 [[package]]
 name = "aenum"
@ -398,6 +398,17 @@ files = [
    {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
 ]

+[[package]]
+name = "bashlex"
+version = "0.18"
+description = "Python parser for bash"
+optional = false
+python-versions = ">=2.7, !=3.0, !=3.1, !=3.2, !=3.3, !=3.4"
+files = [
+    {file = "bashlex-0.18-py2.py3-none-any.whl", hash = "sha256:91d73a23a3e51711919c1c899083890cdecffc91d8c088942725ac13e9dcfffa"},
+    {file = "bashlex-0.18.tar.gz", hash = "sha256:5bb03a01c6d5676338c36fd1028009c8ad07e7d61d8a1ce3f513b7fff52796ee"},
+]
+
 [[package]]
 name = "bcrypt"
 version = "4.1.3"
@ -9109,4 +9120,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "e1520f1342ab527bc3bb2619f8909cbdddeb227c14614eb3d82e133961f1f4d2"
+content-hash = "6d6cfaf3a614a4bf766d9a0e886e82dc9f8cfb8bf08a642f0207f260e72dd6da"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -39,6 +39,7 @@ pathspec = "^0.12.1"
 google-cloud-aiplatform = "*"
 grep-ast = "0.3.2"
 tree-sitter = "0.21.3"
+bashlex = "^0.18"

 [tool.poetry.group.llama-index.dependencies]
 llama-index = "*"
@ -72,6 +73,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]

+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@ -105,6 +107,7 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"

+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
--- a/tests/unit/test_bash_parsing.py
+++ b/tests/unit/test_bash_parsing.py
@ -0,0 +1,283 @@
+import pytest
+
+from opendevin.runtime.utils.bash import split_bash_commands
+
+
+def test_split_commands_util():
+    cmds = [
+        'ls -l',
+        'echo -e "hello\nworld"',
+        """
+echo -e "hello it\\'s me"
+""".strip(),
+        """
+echo \\
+    -e 'hello' \\
+    -v
+""".strip(),
+        """
+echo -e 'hello\\nworld\\nare\\nyou\\nthere?'
+""".strip(),
+        """
+echo -e 'hello
+world
+are
+you\\n
+there?'
+""".strip(),
+        """
+echo -e 'hello
+world "
+'
+""".strip(),
+        """
+kubectl apply -f - <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: busybox-sleep
+spec:
+  containers:
+  - name: busybox
+    image: busybox:1.28
+    args:
+    - sleep
+    - "1000000"
+EOF
+""".strip(),
+        """
+mkdir -p _modules && \
+for month in {01..04}; do
+    for day in {01..05}; do
+        touch "_modules/2024-${month}-${day}-sample.md"
+    done
+done
+""".strip(),
+    ]
+    joined_cmds = '\n'.join(cmds)
+    split_cmds = split_bash_commands(joined_cmds)
+    for s in split_cmds:
+        print('\nCMD')
+        print(s)
+    for i in range(len(cmds)):
+        assert (
+            split_cmds[i].strip() == cmds[i].strip()
+        ), f'At index {i}: {split_cmds[i]} != {cmds[i]}.'
+
+
+@pytest.mark.parametrize(
+    'input_command, expected_output',
+    [
+        ('ls -l', ['ls -l']),
+        ("echo 'Hello, world!'", ["echo 'Hello, world!'"]),
+        ('cd /tmp && touch test.txt', ['cd /tmp && touch test.txt']),
+        ("echo -e 'line1\\nline2\\nline3'", ["echo -e 'line1\\nline2\\nline3'"]),
+        (
+            "grep 'pattern' file.txt | sort | uniq",
+            ["grep 'pattern' file.txt | sort | uniq"],
+        ),
+        ('for i in {1..5}; do echo $i; done', ['for i in {1..5}; do echo $i; done']),
+        (
+            "echo 'Single quotes don\\'t escape'",
+            ["echo 'Single quotes don\\'t escape'"],
+        ),
+        (
+            'echo "Double quotes \\"do\\" escape"',
+            ['echo "Double quotes \\"do\\" escape"'],
+        ),
+    ],
+)
+def test_single_commands(input_command, expected_output):
+    assert split_bash_commands(input_command) == expected_output
+
+
+def test_heredoc():
+    input_commands = """
+cat <<EOF
+multiline
+text
+EOF
+echo "Done"
+"""
+    expected_output = ['cat <<EOF\nmultiline\ntext\nEOF', 'echo "Done"']
+    assert split_bash_commands(input_commands) == expected_output
+
+
+def test_jupyter_heredoc():
+    """This test specifically tests the behavior of the bash parser
+    when the input is a heredoc for a Jupyter cell (used in ServerRuntime).
+
+    It will fail to parse the bash commands AND fall back to the original input,
+    which won't cause issues in actual execution.
+
+    [input]: cat > /tmp/opendevin_jupyter_temp.py <<'EOL'
+    print('Hello, `World`!
+    ')
+    EOL
+    [error]: here-document at line 0 delimited by end-of-file (wanted "'EOL'") (position 75)
+
+    TODO: remove this test after the deprecation of ServerRuntime
+    """
+
+    code = "print('Hello, `World`!\n')"
+    input_commands = f"""cat > /tmp/opendevin_jupyter_temp.py <<'EOL'
+{code}
+EOL"""
+    expected_output = [f"cat > /tmp/opendevin_jupyter_temp.py <<'EOL'\n{code}\nEOL"]
+    assert split_bash_commands(input_commands) == expected_output
+
+
+def test_backslash_continuation():
+    input_commands = """
+echo "This is a long \
+command that spans \
+multiple lines"
+echo "Next command"
+"""
+    expected_output = [
+        'echo "This is a long command that spans multiple lines"',
+        'echo "Next command"',
+    ]
+    assert split_bash_commands(input_commands) == expected_output
+
+
+def test_comments():
+    input_commands = """
+echo "Hello" # This is a comment
+# This is another comment
+ls -l
+"""
+    expected_output = [
+        'echo "Hello" # This is a comment\n# This is another comment',
+        'ls -l',
+    ]
+    assert split_bash_commands(input_commands) == expected_output
+
+
+def test_complex_quoting():
+    input_commands = """
+echo "This is a \\"quoted\\" string"
+echo 'This is a '\''single-quoted'\'' string'
+echo "Mixed 'quotes' in \\"double quotes\\""
+"""
+    expected_output = [
+        'echo "This is a \\"quoted\\" string"',
+        "echo 'This is a '''single-quoted''' string'",
+        'echo "Mixed \'quotes\' in \\"double quotes\\""',
+    ]
+    assert split_bash_commands(input_commands) == expected_output
+
+
+def test_invalid_syntax():
+    invalid_inputs = [
+        'echo "Unclosed quote',
+        "echo 'Unclosed quote",
+        'cat <<EOF\nUnclosed heredoc',
+    ]
+    for input_command in invalid_inputs:
+        # it will fall back to return the original input
+        assert split_bash_commands(input_command) == [input_command]
+
+
+@pytest.fixture
+def sample_commands():
+    return [
+        'ls -l',
+        'echo "Hello, world!"',
+        'cd /tmp && touch test.txt',
+        'echo -e "line1\\nline2\\nline3"',
+        'grep "pattern" file.txt | sort | uniq',
+        'for i in {1..5}; do echo $i; done',
+        'cat <<EOF\nmultiline\ntext\nEOF',
+        'echo "Escaped \\"quotes\\""',
+        "echo 'Single quotes don\\'t escape'",
+        'echo "Command with a trailing backslash \\\n  and continuation"',
+    ]
+
+
+def test_split_single_commands(sample_commands):
+    for cmd in sample_commands:
+        result = split_bash_commands(cmd)
+        assert len(result) == 1, f'Expected single command, got: {result}'
+
+
+def test_split_commands_with_heredoc():
+    input_commands = """
+cat <<EOF
+multiline
+text
+EOF
+echo "Done"
+"""
+    expected_output = ['cat <<EOF\nmultiline\ntext\nEOF', 'echo "Done"']
+    result = split_bash_commands(input_commands)
+    assert result == expected_output, f'Expected {expected_output}, got {result}'
+
+
+def test_split_commands_with_backslash_continuation():
+    input_commands = """
+echo "This is a long \
+command that spans \
+multiple lines"
+echo "Next command"
+"""
+    expected_output = [
+        'echo "This is a long command that spans multiple lines"',
+        'echo "Next command"',
+    ]
+    result = split_bash_commands(input_commands)
+    assert result == expected_output, f'Expected {expected_output}, got {result}'
+
+
+def test_split_commands_with_empty_lines():
+    input_commands = """
+ls -l
+
+echo "Hello"
+
+cd /tmp
+"""
+    expected_output = ['ls -l', 'echo "Hello"', 'cd /tmp']
+    result = split_bash_commands(input_commands)
+    assert result == expected_output, f'Expected {expected_output}, got {result}'
+
+
+def test_split_commands_with_comments():
+    input_commands = """
+echo "Hello" # This is a comment
+# This is another comment
+ls -l
+"""
+    expected_output = [
+        'echo "Hello" # This is a comment\n# This is another comment',
+        'ls -l',
+    ]
+    result = split_bash_commands(input_commands)
+    assert result == expected_output, f'Expected {expected_output}, got {result}'
+
+
+def test_split_commands_with_complex_quoting():
+    input_commands = """
+echo "This is a \\"quoted\\" string"
+echo "Mixed 'quotes' in \\"double quotes\\""
+"""
+    # echo 'This is a '\''single-quoted'\'' string'
+
+    expected_output = [
+        'echo "This is a \\"quoted\\" string"',
+        'echo "Mixed \'quotes\' in \\"double quotes\\""',
+    ]
+    # "echo 'This is a '\\''single-quoted'\\'' string'",
+    result = split_bash_commands(input_commands)
+    assert result == expected_output, f'Expected {expected_output}, got {result}'
+
+
+def test_split_commands_with_invalid_input():
+    invalid_inputs = [
+        'echo "Unclosed quote',
+        "echo 'Unclosed quote",
+        'cat <<EOF\nUnclosed heredoc',
+    ]
+    for input_command in invalid_inputs:
+        # it will fall back to return the original input
+        assert split_bash_commands(input_command) == [input_command]
--- a/tests/unit/test_runtime.py
+++ b/tests/unit/test_runtime.py
@ -9,14 +9,23 @@ from unittest.mock import patch

 import pytest

-from opendevin.core.config import AppConfig, SandboxConfig
+from opendevin.core.config import AppConfig, SandboxConfig, load_from_env
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.events import EventStream
 from opendevin.events.action import (
+    BrowseURLAction,
    CmdRunAction,
+    FileReadAction,
+    FileWriteAction,
+    IPythonRunCellAction,
 )
 from opendevin.events.observation import (
+    BrowserOutputObservation,
    CmdOutputObservation,
+    ErrorObservation,
+    FileReadObservation,
+    FileWriteObservation,
+    IPythonRunCellObservation,
 )
 from opendevin.runtime.client.runtime import EventStreamRuntime
 from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
@ -58,6 +67,8 @@ async def _load_runtime(temp_dir, box_class):
            use_host_network=True,
        ),
    )
+    load_from_env(config, os.environ)
+
    file_store = get_file_store(config.file_store, config.file_store_path)
    event_stream = EventStream(cli_session, file_store)

@ -223,3 +234,218 @@ async def test_bash_command_pexcept(temp_dir, box_class):

    await runtime.close()
    await asyncio.sleep(1)
+
+
+@pytest.mark.asyncio
+async def test_simple_cmd_ipython_and_fileop(temp_dir, box_class):
+    runtime = await _load_runtime(temp_dir, box_class)
+
+    # Test run command
+    action_cmd = CmdRunAction(command='ls -l')
+    logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action_cmd)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert isinstance(obs, CmdOutputObservation)
+    assert obs.exit_code == 0
+    assert 'total 0' in obs.content
+
+    # Test run ipython
+    test_code = "print('Hello, `World`!\\n')"
+    action_ipython = IPythonRunCellAction(code=test_code)
+    logger.info(action_ipython, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action_ipython)
+    assert isinstance(obs, IPythonRunCellObservation)
+
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.content.strip() == 'Hello, `World`!'
+
+    # Test read file (file should not exist)
+    action_read = FileReadAction(path='hello.sh')
+    logger.info(action_read, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action_read)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert isinstance(obs, ErrorObservation)
+    assert 'File not found' in obs.content
+
+    # Test write file
+    action_write = FileWriteAction(content='echo "Hello, World!"', path='hello.sh')
+    logger.info(action_write, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action_write)
+    assert isinstance(obs, FileWriteObservation)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert obs.content == ''
+    if box_class == ServerRuntime:
+        assert obs.path == 'hello.sh'
+    else:
+        # event stream runtime will always use absolute path
+        assert obs.path == '/workspace/hello.sh'
+
+    # Test read file (file should exist)
+    action_read = FileReadAction(path='hello.sh')
+    logger.info(action_read, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action_read)
+    assert isinstance(
+        obs, FileReadObservation
+    ), 'The observation should be a FileReadObservation.'
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert obs.content == 'echo "Hello, World!"\n'
+    if box_class == ServerRuntime:
+        assert obs.path == 'hello.sh'
+    else:
+        assert obs.path == '/workspace/hello.sh'
+
+    await runtime.close()
+    await asyncio.sleep(1)
+
+
+@pytest.mark.asyncio
+async def test_simple_browse(temp_dir, box_class):
+    runtime = await _load_runtime(temp_dir, box_class)
+
+    # Test browse
+    action_cmd = CmdRunAction(command='python -m http.server 8000 > server.log 2>&1 &')
+    logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action_cmd)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert isinstance(obs, CmdOutputObservation)
+    assert obs.exit_code == 0
+    assert '[1]' in obs.content
+
+    action_browse = BrowseURLAction(url='http://localhost:8000')
+    logger.info(action_browse, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action_browse)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert isinstance(obs, BrowserOutputObservation)
+    assert 'http://localhost:8000' in obs.url
+    assert obs.status_code == 200
+    assert not obs.error
+    assert obs.open_pages_urls == ['http://localhost:8000/']
+    assert obs.active_page_index == 0
+    assert obs.last_browser_action == 'goto("http://localhost:8000")'
+    assert obs.last_browser_action_error == ''
+    assert 'Directory listing for /' in obs.content
+    assert 'server.log' in obs.content
+
+    await runtime.close()
+
+
+@pytest.mark.asyncio
+async def test_multiline_commands(temp_dir, box_class):
+    cmds = [
+        'ls -l',
+        'echo -e "hello\nworld"',
+        """
+echo -e "hello it\\'s me"
+""".strip(),
+        """
+echo \\
+    -e 'hello' \\
+    -v
+""".strip(),
+        """
+echo -e 'hello\\nworld\\nare\\nyou\\nthere?'
+""".strip(),
+        """
+echo -e 'hello
+world
+are
+you\\n
+there?'
+""".strip(),
+        """
+echo -e 'hello
+world "
+'
+""".strip(),
+    ]
+    joined_cmds = '\n'.join(cmds)
+
+    runtime = await _load_runtime(temp_dir, box_class)
+
+    action = CmdRunAction(command=joined_cmds)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert isinstance(obs, CmdOutputObservation)
+    assert obs.exit_code == 0, 'The exit code should be 0.'
+
+    assert 'total 0' in obs.content
+    assert 'hello\r\nworld' in obs.content
+    assert "hello it\\'s me" in obs.content
+    assert 'hello -v' in obs.content
+    assert 'hello\r\nworld\r\nare\r\nyou\r\nthere?' in obs.content
+    assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content
+    assert 'hello\r\nworld "\r\n' in obs.content
+
+    await runtime.close()
+    await asyncio.sleep(1)
+
+
+@pytest.mark.asyncio
+async def test_no_ps2_in_output(temp_dir, box_class):
+    """Test that the PS2 sign is not added to the output of a multiline command."""
+    runtime = await _load_runtime(temp_dir, box_class)
+
+    action = CmdRunAction(command='echo -e "hello\nworld"')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    if box_class == ServerRuntime:
+        # the extra PS2 '>' is NOT handled by the ServerRuntime
+        assert 'hello\r\nworld' in obs.content
+        assert '>' in obs.content
+        assert obs.content.count('>') == 1
+    else:
+        assert 'hello\r\nworld' in obs.content
+        assert '>' not in obs.content
+
+
+@pytest.mark.asyncio
+async def test_multiline_command_loop(temp_dir, box_class):
+    # https://github.com/OpenDevin/OpenDevin/issues/3143
+
+    runtime = await _load_runtime(temp_dir, box_class)
+
+    init_cmd = """
+mkdir -p _modules && \
+for month in {01..04}; do
+    for day in {01..05}; do
+        touch "_modules/2024-${month}-${day}-sample.md"
+    done
+done
+echo "created files"
+"""
+    action = CmdRunAction(command=init_cmd)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert isinstance(obs, CmdOutputObservation)
+    assert obs.exit_code == 0, 'The exit code should be 0.'
+    assert 'created files' in obs.content
+
+    follow_up_cmd = """
+for file in _modules/*.md; do
+    new_date=$(echo $file | sed -E 's/2024-(01|02|03|04)-/2024-/;s/2024-01/2024-08/;s/2024-02/2024-09/;s/2024-03/2024-10/;s/2024-04/2024-11/')
+    mv "$file" "$new_date"
+done
+echo "success"
+"""
+    action = CmdRunAction(command=follow_up_cmd)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    assert isinstance(obs, CmdOutputObservation)
+    assert obs.exit_code == 0, 'The exit code should be 0.'
+    assert 'success' in obs.content
+
+    await runtime.close()
+    await asyncio.sleep(1)
--- a/tests/unit/test_sandbox.py
+++ b/tests/unit/test_sandbox.py
@ -7,7 +7,6 @@ import pytest
 from opendevin.core.config import AppConfig, SandboxConfig
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
-from opendevin.runtime.utils import split_bash_commands


 def create_docker_box_from_app_config(
@ -41,62 +40,6 @@ def temp_dir(monkeypatch):
        yield temp_dir


-def test_split_commands():
-    cmds = [
-        'ls -l',
-        'echo -e "hello\nworld"',
-        """
-echo -e 'hello it\\'s me'
-""".strip(),
-        """
-echo \\
-    -e 'hello' \\
-    -v
-""".strip(),
-        """
-echo -e 'hello\\nworld\\nare\\nyou\\nthere?'
-""".strip(),
-        """
-echo -e 'hello
-world
-are
-you\\n
-there?'
-""".strip(),
-        """
-echo -e 'hello
-world "
-'
-""".strip(),
-        """
-kubectl apply -f - <<EOF
-apiVersion: v1
-kind: Pod
-metadata:
-  name: busybox-sleep
-spec:
-  containers:
-  - name: busybox
-    image: busybox:1.28
-    args:
-    - sleep
-    - "1000000"
-EOF
-""".strip(),
-    ]
-    joined_cmds = '\n'.join(cmds)
-    split_cmds = split_bash_commands(joined_cmds)
-    for s in split_cmds:
-        print('\nCMD')
-        print(s)
-    cmds = [
-        c.replace('\\\n', '') for c in cmds
-    ]  # The function strips escaped newlines, but this shouldn't matter
-    assert (
-        split_cmds == cmds
-    ), 'The split commands should be the same as the input commands.'
-
-
 def test_ssh_box_run_as_devin(temp_dir):
    # get a temporary directory
    for box in [