[Arch] Add tests for EventStreamRuntime and fix bash parsing (#2933)

* deprecating recall action * fix integration tests * fix integration tests * refractor runtime to use async * remove search memory * rename .initialize to .ainit * draft of runtime image building (separate from img agnostic) * refractor runtime build into separate file and add unit tests for it * fix image agnostic tests * move `split_bash_commands` into a separate util file * fix bash pexcept parsing for env * refractor add_env_var from sandbox to runtime; add test runtime for env var, remove it from sandbox; * remove unclear comment * capture broader error * make `add_env_var` handle multiple export at the same time * add multi env var test * fix tests with new config * make runtime tests a separate ci to avoid full disk * Update Runtime README with architecture diagram and detailed explanations * update test * remove dependency of global config in sandbox test * fix sandbox typo * runtime tests does not need ghcr build now * remove download runtime img * remove dependency of global config in sandbox test * fix sandbox typo * try to free disk before running the tests * Update opendevin/runtime/client/README.md Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * Update opendevin/runtime/client/README.md Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * Update opendevin/runtime/client/README.md Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * try to reduce code duplication * Update opendevin/runtime/client/README.md Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * Update opendevin/runtime/client/README.md Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * Update opendevin/runtime/client/README.md Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * Update opendevin/runtime/client/README.md Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * Update opendevin/runtime/client/README.md Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * cleanup before setup * temporarily remove this enable lint test since env var are now handled by runtime * linter --------- Co-authored-by: OpenDevin <opendevin@all-hands.dev> Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
2026-03-22 13:47:19 +08:00 · 2024-07-18 06:10:45 +08:00
parent cf3d2298da
commit f80ecec772
18 changed files with 573 additions and 182 deletions
--- a/.github/workflows/run-runtime-tests.yml
+++ b/.github/workflows/run-runtime-tests.yml
@@ -0,0 +1,64 @@
+name: Run Runtime Tests
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+on:
+  push:
+    branches:
+      - main
+    paths-ignore:
+      - '**/*.md'
+      - 'frontend/**'
+      - 'docs/**'
+      - 'evaluation/**'
+  pull_request:
+
+env:
+  PERSIST_SANDBOX : "false"
+
+jobs:
+  test-for-runtime:
+    name: Test for Runtime
+    runs-on: ubuntu-latest
+    env:
+      PERSIST_SANDBOX: "false"
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # when set to "true" but frees about 6 GB
+          tool-cache: true
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
+
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "poetry"
+
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+
+      - name: Run runtime tests
+        run: |
+          TEST_IN_CI=true poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_runtime.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/run-unit-tests.yml
+++ b/.github/workflows/run-unit-tests.yml
@@ -149,7 +149,7 @@ jobs:
        run: make build

      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
+        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox and not test_runtime"

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
--- a/opendevin/core/main.py
+++ b/opendevin/core/main.py
@@ -79,7 +79,9 @@ async def run_agent_controller(

    # runtime and tools
    runtime_cls = get_runtime_cls(config.runtime)
-    runtime = runtime_cls(event_stream=event_stream, sandbox=sandbox)
+    runtime = runtime_cls(
+        sandbox_config=config.sandbox, event_stream=event_stream, sandbox=sandbox
+    )
    await runtime.ainit()
    runtime.init_sandbox_plugins(controller.agent.sandbox_plugins)
    runtime.init_runtime_tools(
--- a/opendevin/runtime/client/README.md
+++ b/opendevin/runtime/client/README.md
@@ -0,0 +1,110 @@
+# OpenDevin Runtime
+
+This README provides an overview of the OpenDevin Runtime, a crucial component of the OpenDevin system. It covers two main aspects:
+
+1. How the Runtime Image is Built: Explains the layered approach to creating Docker images for both production and development environments.
+2. How the Runtime Client Works: Details the functionality and architecture of the Runtime Client, which executes actions within the Docker sandbox.
+
+The following sections dive deeper into these topics, providing a comprehensive understanding of the OpenDevin Runtime system.
+
+## How the Runtime Image is Built
+
+The OpenDevin runtime uses a layered approach for building Docker images:
+
+1. **Original Image**: `ubuntu:22.04`
+   - This is the base image used for all subsequent layers.
+
+2. **Runtime Image**:  `od_runtime:ubuntu__22.04`
+   - Built from the stable release of OpenDevin.
+   - This is the primary runtime image that users will interact with.
+   - Created by copying all OpenDevin code into the original image and installing dependencies using Poetry.
+
+3. **Dev Runtime Image**:  `od_runtime_dev:ubuntu__22.04`
+   - Built from local source code for development purposes.
+
+### Build Process
+
+#### Production Build (DEBUG=false)
+By default, when DEBUG is set to false, the build process only needs to run once:
+- The Runtime Image (`od_runtime:ubuntu__22.04`) is created by copying OpenDevin code into the original Ubuntu image and installing all dependencies.
+- This pre-built image is then used for running the OpenDevin environment.
+
+#### Development Build (DEBUG=true)
+When developing or modifying code that runs inside the container, you can set DEBUG=true to enable a more dynamic build process:
+- Every time you run the code, the existing image will be updated with the latest changes.
+- The Dev Runtime Image (`od_runtime_dev:ubuntu__22.04`) is rebuilt from the Runtime Image (`od_runtime:ubuntu__22.04`).
+- Most dependencies are already installed in the Runtime Image, so this process mainly updates the code and any new dependencies.
+- The rebuild process typically takes around 10 seconds, allowing for quick iterations during development.
+
+This approach allows developers to easily test changes to the OpenDevin codebase, including modifications to files like client.py, without needing to rebuild the entire image from scratch each time.
+
+## How the Runtime Client Works
+
+The Runtime Client is a crucial component of the OpenDevin system, responsible for executing actions within the Docker sandbox environment and producing observations. Here's an overview of its functionality:
+
+1. **Initialization**:
+   - The `EventStreamRuntime` class in `runtime.py` initializes the Docker container and sets up the runtime environment.
+
+2. **Communication**:
+   - The Runtime Client uses FastAPI to create a web server inside the Docker container.
+   - It listens for incoming action requests from the OpenDevin backend.
+
+3. **Action Execution**:
+   - When an action is received, the Runtime Client processes it based on its type:
+     - `CmdRunAction`: Executes shell commands using a pexpect-spawned bash shell.
+     - `FileReadAction` and `FileWriteAction`: Perform file operations within the sandbox.
+     - `IPythonRunCellAction`: Executes Python code in an IPython environment.
+     - `BrowseURLAction` and `BrowseInteractiveAction`: Handle web browsing tasks using a browser environment.
+
+4. **Plugin System**:
+   - The Runtime Client supports a plugin system for extending functionality.
+   - Plugins like JupyterPlugin can be loaded to provide additional features.
+
+5. **Observation Generation**:
+   - After executing an action, the Runtime Client generates an appropriate observation.
+   - Observations include command outputs, file contents, error messages, etc.
+
+6. **Asynchronous Operation**:
+   - The Runtime Client uses asyncio for avoid concurrent requests.
+   - It ensures that only one action is executed at a time using a semaphore.
+
+7. **Security**:
+   - All actions are executed within the confined Docker environment, providing a sandbox for safe execution.
+
+8. **Flexibility**:
+   - The system supports both production (DEBUG=false) and development (DEBUG=true) modes.
+   - In development mode, the runtime image can be updated with the latest code changes for testing and debugging.
+
+
+
+## Architecture Diagram
+
+```
+-------------------+     +-------------------+
+|   OpenDevin       |     |   Docker Host     |
+|   Backend         |     |                   |
+|                   |     |  +-------------+  |
+|  +-------------+  |     |  |  Runtime    |  |
+|  | EventStream |  |     |  |  Container  |  |
+|  | Runtime     |<-|-----|->|             |  |
+|  +-------------+  |     |  |  +-------+  |  |
+|                   |     |  |  |Runtime|  |  |
+|                   |     |  |  |Client |  |  |
+|                   |     |  |  +-------+  |  |
+|                   |     |  |     |       |  |
+|                   |     |  |  +-------+  |  |
+|                   |     |  |  |Plugins|  |  |
+|                   |     |  |  +-------+  |  |
+|                   |     |  +-------------+  |
+-------------------+     +-------------------+
+```
+
+This diagram illustrates the high-level architecture of the OpenDevin Runtime system:
+
+1. The OpenDevin Backend communicates with the Docker Host through the EventStreamRuntime.
+2. The Docker Host runs a Runtime Container, which includes:
+   - The Runtime Client: Handles incoming actions and generates observations.
+   - Plugins: Extend the functionality of the Runtime Client.
+3. The Runtime Client executes actions within the sandboxed environment of the Docker container.
+
+This architecture ensures a secure and flexible environment for executing AI-driven development tasks, allowing OpenDevin to execute a wide range of actions safely and efficiently.
--- a/opendevin/runtime/client/client.py
+++ b/opendevin/runtime/client/client.py
@@ -1,6 +1,7 @@
 import argparse
 import asyncio
 import os
+import re
 from pathlib import Path

 import pexpect
@@ -60,26 +61,51 @@ class RuntimeClient:

    def _init_bash_shell(self, work_dir: str) -> None:
        self.shell = pexpect.spawn('/bin/bash', encoding='utf-8', echo=False)
-        self.__bash_expect = r'\[PEXPECT\][\$\#] '
-        self.__bash_PS1 = r'\u@\h:\w [PEXPECT]\$ '
+        self.__bash_PS1 = r'[PEXPECT_BEGIN] \u@\h:\w [PEXPECT_END]'
+
+        # This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed
+        self.__bash_expect_regex = r'\[PEXPECT_BEGIN\] ([a-z_][a-z0-9_-]*)@([a-zA-Z][a-zA-Z0-9.-]*):(.+) \[PEXPECT_END\]'
+
        self.shell.sendline(f'export PS1="{self.__bash_PS1}"')
-        self.shell.expect(self.__bash_expect)
+        self.shell.expect(self.__bash_expect_regex)
+
        self.shell.sendline(f'cd {work_dir}')
-        self.shell.expect(self.__bash_expect)
+        self.shell.expect(self.__bash_expect_regex)

-    def _execute_bash(self, command, keep_prompt: bool = True):
-        logger.info(f'Received command: {command}')
+    def _get_bash_prompt(self):
+        ps1 = self.shell.after
+        # parse the ps1 to get username, hostname, and working directory
+        matched = re.match(self.__bash_expect_regex, ps1)
+        assert (
+            matched is not None
+        ), f'Failed to parse bash prompt: {ps1}. This should not happen.'
+        username, hostname, working_dir = matched.groups()
+
+        # re-assemble the prompt
+        prompt = f'{username}@{hostname}:{working_dir} '
+        if username == 'root':
+            prompt += '#'
+        else:
+            prompt += '$'
+        return prompt + ' '
+
+    def _execute_bash(self, command, keep_prompt: bool = True) -> tuple[str, int]:
+        logger.debug(f'Executing command: {command}')
        self.shell.sendline(command)
-        self.shell.expect(self.__bash_expect)
-        output = self.shell.before + '$ '
-        if not keep_prompt:
-            # remove the last line of the output (the prompt)
-            # e.g., user@host:~$
-            output = '\r\n'.join(output.split('\r\n')[:-1])
+        self.shell.expect(self.__bash_expect_regex)

+        output = self.shell.before
+        if keep_prompt:
+            output += '\r\n' + self._get_bash_prompt()
+        logger.debug(f'Command output: {output}')
+
+        # Get exit code
        self.shell.sendline('echo $?')
-        self.shell.expect(r'[$#] ')
-        exit_code = int(self.shell.before.split('\r\n')[0].strip())
+        logger.debug(f'Executing command for exit code: {command}')
+        self.shell.expect(self.__bash_expect_regex)
+        _exit_code_output = self.shell.before
+        logger.debug(f'Exit code Output: {_exit_code_output}')
+        exit_code = int(_exit_code_output.strip())
        return output, exit_code

    async def run_action(self, action) -> Observation:
--- a/opendevin/runtime/client/runtime.py
+++ b/opendevin/runtime/client/runtime.py
@@ -6,7 +6,7 @@ import aiohttp
 import docker
 import tenacity

-from opendevin.core.config import config
+from opendevin.core.config import SandboxConfig, config
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.events import EventSource, EventStream
 from opendevin.events.action import (
@@ -45,12 +45,15 @@ class EventStreamRuntime(Runtime):

    def __init__(
        self,
+        sandbox_config: SandboxConfig,
        event_stream: EventStream,
        sid: str = 'default',
        container_image: str | None = None,
        plugins: list[PluginRequirement] | None = None,
    ):
-        super().__init__(event_stream, sid)  # will initialize the event stream
+        super().__init__(
+            sandbox_config, event_stream, sid
+        )  # will initialize the event stream
        self._port = find_available_tcp_port()
        self.api_url = f'http://localhost:{self._port}'
        self.session: Optional[aiohttp.ClientSession] = None
@@ -71,7 +74,7 @@ class EventStreamRuntime(Runtime):
        self.container = None
        self.action_semaphore = asyncio.Semaphore(1)  # Ensure one action at a time

-    async def ainit(self):
+    async def ainit(self, env_vars: dict[str, str] | None = None):
        self.container_image = build_runtime_image(
            self.container_image,
            self.docker_client,
@@ -85,6 +88,8 @@ class EventStreamRuntime(Runtime):
            mount_dir=config.workspace_mount_path,
            plugins=self.plugins,
        )
+        # Initialize the env vars
+        await super().ainit(env_vars)

    @staticmethod
    def _init_docker_client() -> docker.DockerClient:
@@ -126,6 +131,7 @@ class EventStreamRuntime(Runtime):
                working_dir='/opendevin/code/',
                name=self.container_name,
                detach=True,
+                environment={'DEBUG': 'true'} if config.debug else None,
                volumes={mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}},
            )
            logger.info(f'Container started. Server url: {self.api_url}')
@@ -273,7 +279,9 @@ async def test_run_command():
    sid = 'test'
    cli_session = 'main' + ('_' + sid if sid else '')
    event_stream = EventStream(cli_session)
-    runtime = EventStreamRuntime(event_stream)
+    runtime = EventStreamRuntime(
+        sandbox_config=config.sandbox, event_stream=event_stream, sid=sid
+    )
    await runtime.ainit()
    await runtime.run_action(CmdRunAction('ls -l'))

@@ -283,9 +291,10 @@ async def test_event_stream():
    cli_session = 'main' + ('_' + sid if sid else '')
    event_stream = EventStream(cli_session)
    runtime = EventStreamRuntime(
-        event_stream,
-        sid,
-        'ubuntu:22.04',
+        sandbox_config=config.sandbox,
+        event_stream=event_stream,
+        sid=sid,
+        container_image='ubuntu:22.04',
        plugins=[JupyterRequirement(), AgentSkillsRequirement()],
    )
    await runtime.ainit()
--- a/opendevin/runtime/docker/ssh_box.py
+++ b/opendevin/runtime/docker/ssh_box.py
@@ -19,7 +19,7 @@ from opendevin.core.schema import CancellableStream
 from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
 from opendevin.runtime.plugins.requirement import PluginRequirement
 from opendevin.runtime.sandbox import Sandbox
-from opendevin.runtime.utils import find_available_tcp_port
+from opendevin.runtime.utils import find_available_tcp_port, split_bash_commands
 from opendevin.runtime.utils.image_agnostic import get_od_sandbox_image


@@ -101,95 +101,6 @@ class SSHExecCancellableStream(CancellableStream):
                break


-def split_bash_commands(commands):
-    # States
-    NORMAL = 0
-    IN_SINGLE_QUOTE = 1
-    IN_DOUBLE_QUOTE = 2
-    IN_HEREDOC = 3
-
-    state = NORMAL
-    heredoc_trigger = None
-    result = []
-    current_command: list[str] = []
-
-    i = 0
-    while i < len(commands):
-        char = commands[i]
-
-        if state == NORMAL:
-            if char == "'":
-                state = IN_SINGLE_QUOTE
-            elif char == '"':
-                state = IN_DOUBLE_QUOTE
-            elif char == '\\':
-                # Check if this is escaping a newline
-                if i + 1 < len(commands) and commands[i + 1] == '\n':
-                    i += 1  # Skip the newline
-                    # Continue with the next line as part of the same command
-                    i += 1  # Move to the first character of the next line
-                    continue
-            elif char == '\n':
-                if not heredoc_trigger and current_command:
-                    result.append(''.join(current_command).strip())
-                    current_command = []
-            elif char == '<' and commands[i : i + 2] == '<<':
-                # Detect heredoc
-                state = IN_HEREDOC
-                i += 2  # Skip '<<'
-                while commands[i] == ' ':
-                    i += 1
-                start = i
-                while commands[i] not in [' ', '\n']:
-                    i += 1
-                heredoc_trigger = commands[start:i]
-                current_command.append(commands[start - 2 : i])  # Include '<<'
-                continue  # Skip incrementing i at the end of the loop
-            current_command.append(char)
-
-        elif state == IN_SINGLE_QUOTE:
-            current_command.append(char)
-            if char == "'" and commands[i - 1] != '\\':
-                state = NORMAL
-
-        elif state == IN_DOUBLE_QUOTE:
-            current_command.append(char)
-            if char == '"' and commands[i - 1] != '\\':
-                state = NORMAL
-
-        elif state == IN_HEREDOC:
-            current_command.append(char)
-            if (
-                char == '\n'
-                and heredoc_trigger
-                and commands[i + 1 : i + 1 + len(heredoc_trigger) + 1]
-                == heredoc_trigger + '\n'
-            ):
-                # Check if the next line starts with the heredoc trigger followed by a newline
-                i += (
-                    len(heredoc_trigger) + 1
-                )  # Move past the heredoc trigger and newline
-                current_command.append(
-                    heredoc_trigger + '\n'
-                )  # Include the heredoc trigger and newline
-                result.append(''.join(current_command).strip())
-                current_command = []
-                heredoc_trigger = None
-                state = NORMAL
-                continue
-
-        i += 1
-
-    # Add the last command if any
-    if current_command:
-        result.append(''.join(current_command).strip())
-
-    # Remove any empty strings from the result
-    result = [cmd for cmd in result if cmd]
-
-    return result
-
-
 class DockerSSHBox(Sandbox):
    instance_id: str
    container_image: str
--- a/opendevin/runtime/e2b/runtime.py
+++ b/opendevin/runtime/e2b/runtime.py
@@ -1,3 +1,4 @@
+from opendevin.core.config import SandboxConfig
 from opendevin.events.action import (
    FileReadAction,
    FileWriteAction,
@@ -20,11 +21,12 @@ from .sandbox import E2BSandbox
 class E2BRuntime(ServerRuntime):
    def __init__(
        self,
+        sandbox_config: SandboxConfig,
        event_stream: EventStream,
        sid: str = 'default',
        sandbox: Sandbox | None = None,
    ):
-        super().__init__(event_stream, sid, sandbox)
+        super().__init__(sandbox_config, event_stream, sid, sandbox)
        if not isinstance(self.sandbox, E2BSandbox):
            raise ValueError('E2BRuntime requires an E2BSandbox')
        self.file_store = E2BFileStore(self.sandbox.filesystem)
--- a/opendevin/runtime/runtime.py
+++ b/opendevin/runtime/runtime.py
@@ -1,8 +1,13 @@
 import asyncio
 import atexit
+import copy
+import json
+import os
 from abc import abstractmethod
 from typing import Any, Optional

+from opendevin.core.config import SandboxConfig
+from opendevin.core.logger import opendevin_logger as logger
 from opendevin.events import EventStream, EventStreamSubscriber
 from opendevin.events.action import (
    Action,
@@ -16,6 +21,7 @@ from opendevin.events.action import (
 )
 from opendevin.events.event import Event
 from opendevin.events.observation import (
+    CmdOutputObservation,
    ErrorObservation,
    NullObservation,
    Observation,
@@ -27,6 +33,17 @@ from opendevin.runtime.tools import RuntimeTool
 from opendevin.storage import FileStore


+def _default_env_vars(config: SandboxConfig) -> dict[str, str]:
+    ret = {}
+    for key in os.environ:
+        if key.startswith('SANDBOX_ENV_'):
+            sandbox_key = key.removeprefix('SANDBOX_ENV_')
+            ret[sandbox_key] = os.environ[key]
+    if config.enable_auto_lint:
+        ret['ENABLE_AUTO_LINT'] = 'true'
+    return ret
+
+
 class Runtime:
    """The runtime is how the agent interacts with the external environment.
    This includes a bash sandbox, a browser, and filesystem interactions.
@@ -36,18 +53,32 @@ class Runtime:

    sid: str
    file_store: FileStore
+    DEFAULT_ENV_VARS: dict[str, str]

-    def __init__(self, event_stream: EventStream, sid: str = 'default'):
+    def __init__(
+        self,
+        sandbox_config: SandboxConfig,
+        event_stream: EventStream,
+        sid: str = 'default',
+    ):
        self.sid = sid
        self.event_stream = event_stream
        self.event_stream.subscribe(EventStreamSubscriber.RUNTIME, self.on_event)
+        self.sandbox_config = copy.deepcopy(sandbox_config)
+        self.DEFAULT_ENV_VARS = _default_env_vars(self.sandbox_config)
        atexit.register(self.close_sync)

-    async def ainit(self) -> None:
-        """Initialize the runtime (asynchronously).
+    async def ainit(self, env_vars: dict[str, str] | None = None) -> None:
+        """
+        Initialize the runtime (asynchronously).
+
        This method should be called after the runtime's constructor.
        """
-        pass
+        logger.debug(f'Adding default env vars: {self.DEFAULT_ENV_VARS}')
+        await self.add_env_var(self.DEFAULT_ENV_VARS)
+        if env_vars is not None:
+            logger.debug(f'Adding provided env vars: {env_vars}')
+            await self.add_env_var(env_vars)

    async def close(self) -> None:
        pass
@@ -84,6 +115,17 @@ class Runtime:

    # ====================================================================

+    async def add_env_var(self, vars):
+        cmd = ''
+        for key, value in vars.items():
+            # Note: json.dumps gives us nice escaping for free
+            cmd += f'export {key}={json.dumps(value)}; '
+        cmd = cmd.strip()
+        logger.debug(f'Adding env var: {cmd}')
+        obs: Observation = await self.run(CmdRunAction(cmd))
+        if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
+            raise RuntimeError(f'Failed to add {key} to environment: {obs}')
+
    async def on_event(self, event: Event) -> None:
        if isinstance(event, Action):
            observation = await self.run_action(event)
--- a/opendevin/runtime/sandbox.py
+++ b/opendevin/runtime/sandbox.py
@@ -1,6 +1,4 @@
 import copy
-import json
-import os
 from abc import ABC, abstractmethod

 from opendevin.core.config import SandboxConfig
@@ -14,19 +12,8 @@ class Sandbox(ABC, PluginMixin):

    def __init__(self, config: SandboxConfig):
        self.config = copy.deepcopy(config)
-        for key in os.environ:
-            if key.startswith('SANDBOX_ENV_'):
-                sandbox_key = key.removeprefix('SANDBOX_ENV_')
-                self.add_to_env(sandbox_key, os.environ[key])
-        if config.enable_auto_lint:
-            self.add_to_env('ENABLE_AUTO_LINT', 'true')
        self.initialize_plugins: bool = config.initialize_plugins

-    def add_to_env(self, key: str, value: str):
-        self._env[key] = value
-        # Note: json.dumps gives us nice escaping for free
-        self.execute(f'export {key}={json.dumps(value)}')
-
    @abstractmethod
    def execute(
        self, cmd: str, stream: bool = False, timeout: int | None = None
--- a/opendevin/runtime/server/runtime.py
+++ b/opendevin/runtime/server/runtime.py
@@ -1,6 +1,6 @@
 from typing import Any, Optional

-from opendevin.core.config import config
+from opendevin.core.config import SandboxConfig, config
 from opendevin.core.exceptions import BrowserInitException
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.events.action import (
@@ -63,11 +63,12 @@ def create_sandbox(sid: str = 'default', box_type: str = 'ssh') -> Sandbox:
 class ServerRuntime(Runtime):
    def __init__(
        self,
+        sandbox_config: SandboxConfig,
        event_stream: EventStream,
        sid: str = 'default',
        sandbox: Sandbox | None = None,
    ):
-        super().__init__(event_stream, sid)
+        super().__init__(sandbox_config, event_stream, sid)
        self.file_store = LocalFileStore(config.workspace_base)
        if sandbox is None:
            self.sandbox = create_sandbox(sid, config.sandbox.box_type)
@@ -77,9 +78,6 @@ class ServerRuntime(Runtime):
            self._is_external_sandbox = True
        self.browser: BrowserEnv | None = None

-    async def ainit(self) -> None:
-        pass
-
    async def close(self):
        if not self._is_external_sandbox:
            self.sandbox.close()
--- a/opendevin/runtime/utils/init.py
+++ b/opendevin/runtime/utils/init.py
@@ -1,3 +1,4 @@
+from .bash import split_bash_commands
 from .system import find_available_tcp_port

-__all__ = ['find_available_tcp_port']
+__all__ = ['find_available_tcp_port', 'split_bash_commands']
--- a/opendevin/runtime/utils/bash.py
+++ b/opendevin/runtime/utils/bash.py
@@ -0,0 +1,87 @@
+def split_bash_commands(commands):
+    # States
+    NORMAL = 0
+    IN_SINGLE_QUOTE = 1
+    IN_DOUBLE_QUOTE = 2
+    IN_HEREDOC = 3
+
+    state = NORMAL
+    heredoc_trigger = None
+    result = []
+    current_command: list[str] = []
+
+    i = 0
+    while i < len(commands):
+        char = commands[i]
+
+        if state == NORMAL:
+            if char == "'":
+                state = IN_SINGLE_QUOTE
+            elif char == '"':
+                state = IN_DOUBLE_QUOTE
+            elif char == '\\':
+                # Check if this is escaping a newline
+                if i + 1 < len(commands) and commands[i + 1] == '\n':
+                    i += 1  # Skip the newline
+                    # Continue with the next line as part of the same command
+                    i += 1  # Move to the first character of the next line
+                    continue
+            elif char == '\n':
+                if not heredoc_trigger and current_command:
+                    result.append(''.join(current_command).strip())
+                    current_command = []
+            elif char == '<' and commands[i : i + 2] == '<<':
+                # Detect heredoc
+                state = IN_HEREDOC
+                i += 2  # Skip '<<'
+                while commands[i] == ' ':
+                    i += 1
+                start = i
+                while commands[i] not in [' ', '\n']:
+                    i += 1
+                heredoc_trigger = commands[start:i]
+                current_command.append(commands[start - 2 : i])  # Include '<<'
+                continue  # Skip incrementing i at the end of the loop
+            current_command.append(char)
+
+        elif state == IN_SINGLE_QUOTE:
+            current_command.append(char)
+            if char == "'" and commands[i - 1] != '\\':
+                state = NORMAL
+
+        elif state == IN_DOUBLE_QUOTE:
+            current_command.append(char)
+            if char == '"' and commands[i - 1] != '\\':
+                state = NORMAL
+
+        elif state == IN_HEREDOC:
+            current_command.append(char)
+            if (
+                char == '\n'
+                and heredoc_trigger
+                and commands[i + 1 : i + 1 + len(heredoc_trigger) + 1]
+                == heredoc_trigger + '\n'
+            ):
+                # Check if the next line starts with the heredoc trigger followed by a newline
+                i += (
+                    len(heredoc_trigger) + 1
+                )  # Move past the heredoc trigger and newline
+                current_command.append(
+                    heredoc_trigger + '\n'
+                )  # Include the heredoc trigger and newline
+                result.append(''.join(current_command).strip())
+                current_command = []
+                heredoc_trigger = None
+                state = NORMAL
+                continue
+
+        i += 1
+
+    # Add the last command if any
+    if current_command:
+        result.append(''.join(current_command).strip())
+
+    # Remove any empty strings from the result
+    result = [cmd for cmd in result if cmd]
+
+    return result
--- a/opendevin/runtime/utils/runtime_build.py
+++ b/opendevin/runtime/utils/runtime_build.py
@@ -198,8 +198,9 @@ def build_runtime_image(
    # Try to pull the new image from the registry
    try:
        docker_client.images.pull(new_image_name)
-    except docker.errors.ImageNotFound:
-        logger.info(f'Image {new_image_name} not found, building it from scratch')
+    except Exception as e:
+        logger.info(f'Error pulling image {new_image_name}, building it from scratch')
+        logger.error(f'Error: {e}')

    # Detect if the sandbox image is built
    image_exists = _check_image_exists(new_image_name, docker_client)
--- a/opendevin/server/session/agent.py
+++ b/opendevin/server/session/agent.py
@@ -62,7 +62,9 @@ class AgentSession:

        logger.info(f'Using runtime: {config.runtime}')
        runtime_cls = get_runtime_cls(config.runtime)
-        self.runtime = runtime_cls(self.event_stream, self.sid)
+        self.runtime = runtime_cls(
+            sandbox_config=config.sandbox, event_stream=self.event_stream, sid=self.sid
+        )
        await self.runtime.ainit()

    async def _create_controller(self, start_event: dict):
--- a/tests/unit/test_ipython.py
+++ b/tests/unit/test_ipython.py
@@ -4,7 +4,7 @@ from unittest.mock import MagicMock, call, patch

 import pytest

-from opendevin.core.config import config
+from opendevin.core.config import SandboxConfig, config
 from opendevin.events.action import IPythonRunCellAction
 from opendevin.events.observation import IPythonRunCellObservation
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
@@ -43,7 +43,10 @@ async def test_run_python_backticks():
        new=mock_sandbox_execute,
    ):
        # Initialize the runtime with the mock event_stream
-        runtime = ServerRuntime(event_stream=mock_event_stream)
+        runtime = ServerRuntime(
+            sandbox_config=SandboxConfig(box_type='ssh', persist_sandbox=False),
+            event_stream=mock_event_stream,
+        )

        # Define the test action with a simple IPython command
        action = IPythonRunCellAction(code=test_code)
--- a/tests/unit/test_runtime.py
+++ b/tests/unit/test_runtime.py
@@ -0,0 +1,184 @@
+"""Test the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""
+
+import os
+import pathlib
+import tempfile
+from unittest.mock import patch
+
+import pytest
+
+from opendevin.core.config import SandboxConfig
+from opendevin.events import EventStream
+from opendevin.events.action import (
+    CmdRunAction,
+)
+from opendevin.events.observation import (
+    CmdOutputObservation,
+)
+from opendevin.runtime.client.runtime import EventStreamRuntime
+from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
+from opendevin.runtime.server.runtime import ServerRuntime
+
+
+@pytest.fixture
+def temp_dir(monkeypatch):
+    # get a temporary directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        pathlib.Path().mkdir(parents=True, exist_ok=True)
+        yield temp_dir
+
+
+async def _load_runtime(box_class, event_stream, plugins, sid):
+    sandbox_config = SandboxConfig()
+    if box_class == EventStreamRuntime:
+        runtime = EventStreamRuntime(
+            sandbox_config=sandbox_config,
+            event_stream=event_stream,
+            sid=sid,
+            # NOTE: we probably don't have a default container image `/sandbox` for the event stream runtime
+            # Instead, we will pre-build a suite of container images with OD-runtime-cli installed.
+            container_image='ubuntu:22.04',
+            plugins=plugins,
+        )
+        await runtime.ainit()
+    elif box_class == ServerRuntime:
+        runtime = ServerRuntime(
+            sandbox_config=sandbox_config, event_stream=event_stream, sid=sid
+        )
+        await runtime.ainit()
+        runtime.init_sandbox_plugins(plugins)
+        runtime.init_runtime_tools(
+            [],
+            is_async=False,
+            runtime_tools_config={},
+        )
+    else:
+        raise ValueError(f'Invalid box class: {box_class}')
+    return runtime
+
+
+RUNTIME_TO_TEST = [EventStreamRuntime, ServerRuntime]
+
+
+@pytest.mark.asyncio
+async def test_env_vars_os_environ():
+    with patch.dict(os.environ, {'SANDBOX_ENV_FOOBAR': 'BAZ'}):
+        plugins = [JupyterRequirement(), AgentSkillsRequirement()]
+        sid = 'test'
+        cli_session = 'main_test'
+
+        for box_class in RUNTIME_TO_TEST:
+            event_stream = EventStream(cli_session)
+            runtime = await _load_runtime(box_class, event_stream, plugins, sid)
+
+            obs: CmdOutputObservation = await runtime.run_action(
+                CmdRunAction(command='env')
+            )
+            print(obs)
+
+            obs: CmdOutputObservation = await runtime.run_action(
+                CmdRunAction(command='echo $FOOBAR')
+            )
+            print(obs)
+            assert obs.exit_code == 0, 'The exit code should be 0.'
+            assert (
+                obs.content.strip().split('\n\r')[0].strip() == 'BAZ'
+            ), f'Output: [{obs.content}] for {box_class}'
+
+
+@pytest.mark.asyncio
+async def test_env_vars_runtime_add_env_var():
+    plugins = [JupyterRequirement(), AgentSkillsRequirement()]
+    sid = 'test'
+    cli_session = 'main_test'
+
+    for box_class in RUNTIME_TO_TEST:
+        event_stream = EventStream(cli_session)
+        runtime = await _load_runtime(box_class, event_stream, plugins, sid)
+        await runtime.add_env_var({'QUUX': 'abc"def'})
+
+        obs: CmdOutputObservation = await runtime.run_action(
+            CmdRunAction(command='echo $QUUX')
+        )
+        print(obs)
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert (
+            obs.content.strip().split('\r\n')[0].strip() == 'abc"def'
+        ), f'Output: [{obs.content}] for {box_class}'
+
+
+@pytest.mark.asyncio
+async def test_env_vars_runtime_add_multiple_env_vars():
+    plugins = [JupyterRequirement(), AgentSkillsRequirement()]
+    sid = 'test'
+    cli_session = 'main_test'
+
+    for box_class in RUNTIME_TO_TEST:
+        event_stream = EventStream(cli_session)
+        runtime = await _load_runtime(box_class, event_stream, plugins, sid)
+        await runtime.add_env_var({'QUUX': 'abc"def', 'FOOBAR': 'xyz'})
+
+        obs: CmdOutputObservation = await runtime.run_action(
+            CmdRunAction(command='echo $QUUX $FOOBAR')
+        )
+        print(obs)
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert (
+            obs.content.strip().split('\r\n')[0].strip() == 'abc"def xyz'
+        ), f'Output: [{obs.content}] for {box_class}'
+
+
+@pytest.mark.asyncio
+async def test_env_vars_runtime_add_env_var_overwrite():
+    plugins = [JupyterRequirement(), AgentSkillsRequirement()]
+    sid = 'test'
+    cli_session = 'main_test'
+
+    for box_class in RUNTIME_TO_TEST:
+        with patch.dict(os.environ, {'SANDBOX_ENV_FOOBAR': 'BAZ'}):
+            event_stream = EventStream(cli_session)
+            runtime = await _load_runtime(box_class, event_stream, plugins, sid)
+            await runtime.add_env_var({'FOOBAR': 'xyz'})
+
+            obs: CmdOutputObservation = await runtime.run_action(
+                CmdRunAction(command='echo $FOOBAR')
+            )
+            print(obs)
+            assert obs.exit_code == 0, 'The exit code should be 0.'
+            assert (
+                obs.content.strip().split('\r\n')[0].strip() == 'xyz'
+            ), f'Output: [{obs.content}] for {box_class}'
+
+
+@pytest.mark.asyncio
+async def test_bash_command_pexcept(temp_dir):
+    plugins = [JupyterRequirement(), AgentSkillsRequirement()]
+    sid = 'test'
+    cli_session = 'main_test'
+
+    box_class = EventStreamRuntime
+    event_stream = EventStream(cli_session)
+    runtime = await _load_runtime(box_class, event_stream, plugins, sid)
+
+    # We set env var PS1="\u@\h:\w $"
+    # and construct the PEXCEPT prompt base on it.
+    # When run `env`, bad implementation of CmdRunAction will be pexcepted by this
+    # and failed to pexcept the right content, causing it fail to get error code.
+    obs = await runtime.run_action(CmdRunAction(command='env'))
+
+    # For example:
+    # 02:16:13 - opendevin:DEBUG: client.py:78 - Executing command: env
+    # 02:16:13 - opendevin:DEBUG: client.py:82 - Command output: PYTHONUNBUFFERED=1
+    # CONDA_EXE=/opendevin/miniforge3/bin/conda
+    # [...]
+    # LC_CTYPE=C.UTF-8
+    # PS1=\u@\h:\w $
+    # 02:16:13 - opendevin:DEBUG: client.py:89 - Executing command for exit code: env
+    # 02:16:13 - opendevin:DEBUG: client.py:92 - Exit code Output:
+    # CONDA_DEFAULT_ENV=base
+
+    # As long as the exit code is 0, the test will pass.
+    assert isinstance(
+        obs, CmdOutputObservation
+    ), 'The observation should be a CmdOutputObservation.'
+    assert obs.exit_code == 0, 'The exit code should be 0.'
--- a/tests/unit/test_sandbox.py
+++ b/tests/unit/test_sandbox.py
@@ -5,9 +5,9 @@ import tempfile
 import pytest

 from opendevin.core.config import AppConfig, SandboxConfig
-from opendevin.runtime.docker.local_box import LocalBox
-from opendevin.runtime.docker.ssh_box import DockerSSHBox, split_bash_commands
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
 from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
+from opendevin.runtime.utils import split_bash_commands


 def create_docker_box_from_app_config(
@@ -42,30 +42,6 @@ def temp_dir(monkeypatch):
        yield temp_dir


-def test_env_vars(temp_dir):
-    os.environ['SANDBOX_ENV_FOOBAR'] = 'BAZ'
-    ssh_box = create_docker_box_from_app_config(temp_dir)
-
-    local_box_config = AppConfig(
-        sandbox=SandboxConfig(
-            box_type='local',
-        )
-    )
-    local_box = LocalBox(local_box_config.sandbox, temp_dir)
-    for box in [
-        ssh_box,
-        local_box,
-    ]:
-        box.add_to_env(key='QUUX', value='abc"def')
-        assert box._env['FOOBAR'] == 'BAZ'
-        assert box._env['QUUX'] == 'abc"def'
-        exit_code, output = box.execute('echo $FOOBAR $QUUX')
-        assert exit_code == 0, 'The exit code should be 0.'
-        assert (
-            output.strip() == 'BAZ abc"def'
-        ), f'Output: {output} for {box.__class__.__name__}'
-
-
 def test_split_commands():
    cmds = [
        'ls -l',
@@ -339,20 +315,6 @@ def test_sandbox_jupyter_agentskills_fileop_pwd(temp_dir):
    _test_sandbox_jupyter_agentskills_fileop_pwd_impl(box, config)


-def test_sandbox_jupyter_agentskills_fileop_pwd_with_lint(temp_dir):
-    # get a temporary directory
-    config = AppConfig(
-        sandbox=SandboxConfig(
-            box_type='ssh',
-            persist_sandbox=False,
-            enable_auto_lint=True,
-        )
-    )
-    assert config.sandbox.enable_auto_lint
-    box = create_docker_box_from_app_config(temp_dir, config)
-    _test_sandbox_jupyter_agentskills_fileop_pwd_impl(box, config)
-
-
@pytest.mark.skipif(
    os.getenv('TEST_IN_CI') != 'true',
    reason='The unittest need to download image, so only run on CI',