(fix) Fix runtime (RT) tests and split tests in 2 actions (openhands/root) (#3791)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2025-12-26 05:48:36 +08:00 · 2024-09-14 21:51:30 +02:00 · 2024-09-14 21:51:30 +02:00 · 554636cf2a
commit 554636cf2a
parent 57390eb26b
21 changed files with 867 additions and 702 deletions
--- a/.github/workflows/ghcr_runtime.yml
+++ b/.github/workflows/ghcr_runtime.yml
@ -1,5 +1,5 @@
 # Workflow that builds, tests and then pushes the runtime docker images to the ghcr.io repository
-name: Build, Test and Publish Runtime Image
+name: Build, Test and Publish RT Image

 # Only run one workflow of the same group at a time.
 # There can be at most one running and one pending job in a concurrency group at any time.
@ -104,9 +104,9 @@ jobs:
          name: runtime-${{ matrix.base_image.tag }}
          path: /tmp/runtime-${{ matrix.base_image.tag }}.tar

-  # Run unit tests with the EventStream runtime Docker images
-  test_runtime:
-    name: Test Runtime
+  # Run unit tests with the EventStream runtime Docker images as root
+  test_runtime_root:
+    name: RT Unit Tests (Root)
    needs: [ghcr_build_runtime]
    runs-on: ubuntu-latest
    strategy:
@ -164,11 +164,84 @@ jobs:
          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')

+          SKIP_CONTAINER_LOGS=true \
          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
-          poetry run pytest -n 2 --reruns 2 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          RUN_AS_OPENHANDS=false \
+          poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+  # Run unit tests with the EventStream runtime Docker images as openhands user
+  test_runtime_oh:
+    name: RT Unit Tests (openhands)
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime]
+    strategy:
+      matrix:
+        base_image: ['nikolaik']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
+      # Forked repos can't push to GHCR, so we need to download the image as an artifact
+      - name: Download runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        uses: actions/download-artifact@v4
+        with:
+          name: runtime-${{ matrix.base_image }}
+          path: /tmp
+      - name: Load runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        run: |
+          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      - name: Run runtime tests
+        run: |
+          # We install pytest-xdist in order to run tests across CPUs. However, tests start to fail when we run
+          # them across more than 2 CPUs for some reason
+          poetry run pip install pytest-xdist
+
+          # Install to be able to retry on failures for flaky tests
+          poetry run pip install pytest-rerunfailures
+
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
+          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
+
+          SKIP_CONTAINER_LOGS=true \
+          TEST_RUNTIME=eventstream \
+          SANDBOX_USER_ID=$(id -u) \
+          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
+          TEST_IN_CI=true \
+          RUN_AS_OPENHANDS=true \
+          poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@ -176,7 +249,7 @@ jobs:

  # Run integration tests with the eventstream runtime Docker image
  runtime_integration_tests_on_linux:
-    name: Runtime Integration Tests on Linux
+    name: RT Integration Tests (Linux)
    runs-on: ubuntu-latest
    needs: [ghcr_build_runtime]
    strategy:
@ -237,7 +310,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
    steps:
      - name: All tests passed
        run: echo "All runtime tests have passed successfully!"
@ -246,7 +319,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
    steps:
      - name: Some tests failed
        run: |
--- a/openhands/core/config.py
+++ b/openhands/core/config.py
@ -507,7 +507,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
        if isinstance(value, dict):
            try:
                if key is not None and key.lower() == 'agent':
-                    logger.openhands_logger.info(
+                    logger.openhands_logger.debug(
                        'Attempt to load default agent config from config toml'
                    )
                    non_dict_fields = {
@ -517,13 +517,13 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
                    cfg.set_agent_config(agent_config, 'agent')
                    for nested_key, nested_value in value.items():
                        if isinstance(nested_value, dict):
-                            logger.openhands_logger.info(
+                            logger.openhands_logger.debug(
                                f'Attempt to load group {nested_key} from config toml as agent config'
                            )
                            agent_config = AgentConfig(**nested_value)
                            cfg.set_agent_config(agent_config, nested_key)
                elif key is not None and key.lower() == 'llm':
-                    logger.openhands_logger.info(
+                    logger.openhands_logger.debug(
                        'Attempt to load default LLM config from config toml'
                    )
                    non_dict_fields = {
@ -533,7 +533,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
                    cfg.set_llm_config(llm_config, 'llm')
                    for nested_key, nested_value in value.items():
                        if isinstance(nested_value, dict):
-                            logger.openhands_logger.info(
+                            logger.openhands_logger.debug(
                                f'Attempt to load group {nested_key} from config toml as llm config'
                            )
                            llm_config = LLMConfig(**nested_value)
@ -584,10 +584,10 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):

 def finalize_config(cfg: AppConfig):
    """More tweaks to the config after it's been loaded."""
+    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
    # Set workspace_mount_path if not set by the user
    if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
-        cfg.workspace_mount_path = os.path.abspath(cfg.workspace_base)
-    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
+        cfg.workspace_mount_path = cfg.workspace_base

    if cfg.workspace_mount_rewrite:  # and not config.workspace_mount_path:
        # TODO why do we need to check if workspace_mount_path is None?
--- a/openhands/runtime/builder/docker.py
+++ b/openhands/runtime/builder/docker.py
@ -68,6 +68,10 @@ class DockerRuntimeBuilder(RuntimeBuilder):
        Returns:
            bool: Whether the Docker image exists in the registry or in the local store
        """
+        if not image_name:
+            logger.error(f'Invalid image name: `{image_name}`')
+            return False
+
        try:
            logger.info(f'Checking, if image exists locally:\n{image_name}')
            self.docker_client.images.get(image_name)
--- a/openhands/runtime/client/client.py
+++ b/openhands/runtime/client/client.py
@ -84,7 +84,6 @@ class RuntimeClient:
        self.lock = asyncio.Lock()
        self.plugins: dict[str, Plugin] = {}
        self.browser = BrowserEnv(browsergym_eval_env)
-        self._initial_pwd = work_dir

    @property
    def initial_pwd(self):
@ -116,27 +115,85 @@ class RuntimeClient:
        logger.info('Runtime client initialized.')

    def _init_user(self, username: str, user_id: int) -> None:
-        """Create user if not exists."""
+        """Create working directory and user if not exists.
+        It performs the following steps effectively:
+        * Creates the Working Directory:
+            - Uses mkdir -p to create the directory.
+            - Sets ownership to username:root.
+            - Adds read and write permissions for the group (chmod g+rw).
+        * User Verification and Creation:
+            - Checks if the user exists using id -u.
+            - If the user exists with the correct UID, it skips creation.
+            - If the UID differs, it logs a warning and updates self.user_id.
+            - If the user doesn't exist, it proceeds to create the user.
+        * Sudo Configuration:
+            - Writes %sudo ALL=(ALL) NOPASSWD:ALL to /etc/sudoers.d/99_sudo (only if
+              that file does not exist yet) to grant passwordless sudo to the sudo group.
+            - Adds the user to the sudo group with the useradd command, handling
+              UID conflicts by incrementing the UID if necessary.
+        """
+
+        # First create the working directory, independent of the user
+        logger.info(f'Client working directory: {self.initial_pwd}')
+        command = f'umask 002; mkdir -p {self.initial_pwd}'
+        output = subprocess.run(command, shell=True, capture_output=True)
+        out_str = output.stdout.decode()
+
+        command = f'chown -R {username}:root {self.initial_pwd}'
+        output = subprocess.run(command, shell=True, capture_output=True)
+        out_str += output.stdout.decode()
+
+        command = f'chmod g+rw {self.initial_pwd}'
+        output = subprocess.run(command, shell=True, capture_output=True)
+        out_str += output.stdout.decode()
+        logger.debug(f'Created working directory. Output: [{out_str}]')
+
        # Skip root since it is already created
        if username == 'root':
            return

        # Check if the username already exists
+        existing_user_id = -1
        try:
-            subprocess.run(
+            result = subprocess.run(
                f'id -u {username}', shell=True, check=True, capture_output=True
            )
-            logger.debug(f'User {username} already exists. Skipping creation.')
+            existing_user_id = int(result.stdout.decode().strip())
+
+            # The user ID already exists, skip setup
+            if existing_user_id == user_id:
+                logger.debug(
+                    f'User `{username}` already has the provided UID {user_id}. Skipping user setup.'
+                )
+            else:
+                logger.warning(
+                    f'User `{username}` already exists with UID {existing_user_id}. Skipping user setup.'
+                )
+                self.user_id = existing_user_id
            return
-        except subprocess.CalledProcessError:
-            pass  # User does not exist, continue with creation
+        except subprocess.CalledProcessError as e:
+            # Return code 1 indicates that the user does not exist yet
+            if e.returncode == 1:
+                logger.debug(
+                    f'User `{username}` does not exist. Proceeding with user creation.'
+                )
+            else:
+                logger.error(
+                    f'Error checking user `{username}`, skipping setup:\n{e}\n'
+                )
+                raise

        # Add sudoer
-        sudoer_line = r"echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers"
-        output = subprocess.run(sudoer_line, shell=True, capture_output=True)
-        if output.returncode != 0:
-            raise RuntimeError(f'Failed to add sudoer: {output.stderr.decode()}')
-        logger.debug(f'Added sudoer successfully. Output: [{output.stdout.decode()}]')
+        sudoer_line = r'%sudo ALL=(ALL) NOPASSWD:ALL\n'
+        sudoers_path = '/etc/sudoers.d/99_sudo'
+        if not Path(sudoers_path).exists():
+            with open(sudoers_path, 'w') as f:
+                f.write(sudoer_line)
+            output = subprocess.run(['chmod', '0440', sudoers_path])
+            if output.returncode != 0:
+                logger.error('Failed to chmod 99_sudo file!')
+            else:
+                logger.debug('Added sudoer successfully.')

        # Attempt to add the user, retrying with incremented user_id if necessary
        while True:
@ -144,16 +201,10 @@ class RuntimeClient:
                f'useradd -rm -d /home/{username} -s /bin/bash '
                f'-g root -G sudo -u {user_id} {username}'
            )
-
-            if not os.path.exists(self.initial_pwd):
-                command += f' && mkdir -p {self.initial_pwd}'
-                command += f' && chown -R {username}:root {self.initial_pwd}'
-                command += f' && chmod g+s {self.initial_pwd}'
-
            output = subprocess.run(command, shell=True, capture_output=True)
            if output.returncode == 0:
                logger.debug(
-                    f'Added user {username} successfully with UID {user_id}. Output: [{output.stdout.decode()}]'
+                    f'Added user `{username}` successfully with UID {user_id}. Output: [{output.stdout.decode()}]'
                )
                break
            elif f'UID {user_id} is not unique' in output.stderr.decode():
@ -163,7 +214,7 @@ class RuntimeClient:
                user_id += 1
            else:
                raise RuntimeError(
-                    f'Failed to create user {username}: {output.stderr.decode()}'
+                    f'Failed to create user `{username}`! Output: [{output.stderr.decode()}]'
                )

    def _init_bash_shell(self, work_dir: str, username: str) -> None:
@ -181,8 +232,8 @@ class RuntimeClient:

        # This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed
        self.__bash_expect_regex = r'\[PEXPECT_BEGIN\]\s*(.*?)\s*([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+)\s*\[PEXPECT_END\]'
-
-        self.shell.sendline(f'export PS1="{self.__bash_PS1}"; export PS2=""')
+        # Set umask to allow group write permissions
+        self.shell.sendline(f'umask 002; export PS1="{self.__bash_PS1}"; export PS2=""')
        self.shell.expect(self.__bash_expect_regex)

        self.shell.sendline(
@ -190,8 +241,11 @@ class RuntimeClient:
        )
        self.shell.expect(self.__bash_expect_regex)
        logger.debug(
-            f'Bash initialized. Working directory: {work_dir}. Output: {self.shell.before}'
+            f'Bash initialized. Working directory: {work_dir}. Output: [{self.shell.before}]'
        )
+        # Ensure the group has write permissions on the working directory
+        self.shell.sendline(f'chmod g+rw "{work_dir}"')
+        self.shell.expect(self.__bash_expect_regex)

    async def _init_bash_commands(self):
        logger.info(f'Initializing by running {len(INIT_COMMANDS)} bash commands...')
@ -295,14 +349,14 @@ class RuntimeClient:
            bash_prompt = self._get_bash_prompt_and_update_pwd()
            if keep_prompt:
                output += '\r\n' + bash_prompt
-            logger.debug(f'Command output: {output}')
+            # logger.debug(f'Command output:\n{output}')
        return output, exit_code

    async def run_action(self, action) -> Observation:
        action_type = action.action
-        logger.debug(f'Running action: {action}')
+        logger.debug(f'Running action:\n{action}')
        observation = await getattr(self, action_type)(action)
-        logger.debug(f'Action output: {observation}')
+        logger.debug(f'Action output:\n{observation}')
        return observation

    async def run(self, action: CmdRunAction) -> CmdOutputObservation:
@ -355,10 +409,9 @@ class RuntimeClient:
            _jupyter_plugin: JupyterPlugin = self.plugins['jupyter']  # type: ignore
            # This is used to make AgentSkills in Jupyter aware of the
            # current working directory in Bash
-            if self.pwd != getattr(self, '_jupyter_pwd', None):
-                logger.debug(
-                    f"{self.pwd} != {getattr(self, '_jupyter_pwd', None)} -> reset Jupyter PWD"
-                )
+            jupyter_pwd = getattr(self, '_jupyter_pwd', None)
+            if self.pwd != jupyter_pwd:
+                logger.debug(f'{self.pwd} != {jupyter_pwd} -> reset Jupyter PWD')
                reset_jupyter_pwd_code = f'import os; os.chdir("{self.pwd}")'
                _aux_action = IPythonRunCellAction(code=reset_jupyter_pwd_code)
                _reset_obs = await _jupyter_plugin.run(_aux_action)
@ -450,7 +503,7 @@ class RuntimeClient:
                    os.chown(filepath, file_stat.st_uid, file_stat.st_gid)
                else:
                    # set the new file permissions if the file is new
-                    os.chmod(filepath, 0o644)
+                    os.chmod(filepath, 0o664)
                    os.chown(filepath, self.user_id, self.user_id)

            except FileNotFoundError:
--- a/openhands/runtime/client/runtime.py
+++ b/openhands/runtime/client/runtime.py
@ -38,8 +38,7 @@ from openhands.runtime.utils.runtime_build import build_runtime_image


 class LogBuffer:
-    """
-    Synchronous buffer for Docker container logs.
+    """Synchronous buffer for Docker container logs.

    This class provides a thread-safe way to collect, store, and retrieve logs
    from a Docker container. It uses a list to store log lines and provides methods
@ -94,7 +93,7 @@ class LogBuffer:
            )
            self.close(timeout=5)

-    def close(self, timeout: float = 10.0):
+    def close(self, timeout: float = 5.0):
        self._stop_event.set()
        self.log_stream_thread.join(timeout)

@ -102,6 +101,14 @@ class LogBuffer:
 class EventStreamRuntime(Runtime):
    """This runtime will subscribe the event stream.
    When receive an event, it will send the event to runtime-client which run inside the docker environment.
+    An instance_id is also generated: the sid (if given) joined with a random UUID.
+
+    Args:
+        config (AppConfig): The application configuration.
+        event_stream (EventStream): The event stream to subscribe to.
+        sid (str, optional): The session ID. Defaults to 'default'.
+        plugins (list[PluginRequirement] | None, optional): List of plugin requirements. Defaults to None.
+        env_vars (dict[str, str] | None, optional): Environment variables to set. Defaults to None.
    """

    container_name_prefix = 'openhands-sandbox-'
@ -115,13 +122,16 @@ class EventStreamRuntime(Runtime):
        env_vars: dict[str, str] | None = None,
    ):
        self.config = config
-        self._port = find_available_tcp_port()
-        self.api_url = f'http://{self.config.sandbox.api_hostname}:{self._port}'
+        self._host_port = 30000  # initial dummy value
+        self._container_port = 30001  # initial dummy value
+        self.api_url = (
+            f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
+        )
        self.session = requests.Session()
-
        self.instance_id = (
            sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
        )
+
        self.docker_client: docker.DockerClient = self._init_docker_client()
        self.base_container_image = self.config.sandbox.base_container_image
        self.runtime_container_image = self.config.sandbox.runtime_container_image
@ -131,7 +141,7 @@ class EventStreamRuntime(Runtime):
        self.action_semaphore = threading.Semaphore(1)  # Ensure one action at a time

        self.runtime_builder = DockerRuntimeBuilder(self.docker_client)
-        logger.debug(f'EventStreamRuntime `{sid}`')
+        logger.debug(f'EventStreamRuntime `{self.instance_id}`')

        # Buffer for container logs
        self.log_buffer: LogBuffer | None = None
@ -140,7 +150,9 @@ class EventStreamRuntime(Runtime):
            logger.info(
                f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.runtime_extra_deps}'
            )
-
+        self.skip_container_logs = (
+            os.environ.get('SKIP_CONTAINER_LOGS', 'false').lower() == 'true'
+        )
        if self.runtime_container_image is None:
            if self.base_container_image is None:
                raise ValueError(
@ -152,19 +164,18 @@ class EventStreamRuntime(Runtime):
                extra_deps=self.config.sandbox.runtime_extra_deps,
            )
        self.container = self._init_container(
-            self.sandbox_workspace_dir,
-            mount_dir=self.config.workspace_mount_path,
+            sandbox_workspace_dir=self.config.workspace_mount_path_in_sandbox,  # e.g. /workspace
+            mount_dir=self.config.workspace_mount_path,  # e.g. /opt/openhands/_test_workspace
            plugins=plugins,
        )
        # will initialize both the event stream and the env vars
        super().__init__(config, event_stream, sid, plugins, env_vars)

-        self._wait_until_alive()
-
        logger.info(
            f'Container initialized with plugins: {[plugin.name for plugin in self.plugins]}'
        )
        logger.info(f'Container initialized with env vars: {env_vars}')
+        time.sleep(1)

    @staticmethod
    def _init_docker_client() -> docker.DockerClient:
@ -196,24 +207,48 @@ class EventStreamRuntime(Runtime):
                    f'--plugins {" ".join([plugin.name for plugin in plugins])} '
                )

-            network_mode: str | None = None
-            port_mapping: dict[str, int] | None = None
-            if self.config.sandbox.use_host_network:
-                network_mode = 'host'
+            self._host_port = self._find_available_port()
+            self._container_port = (
+                self._host_port
+            )  # in future this might differ from host port
+            self.api_url = (
+                f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
+            )
+
+            use_host_network = self.config.sandbox.use_host_network
+            network_mode: str | None = 'host' if use_host_network else None
+            port_mapping: dict[str, list[dict[str, str]]] | None = (
+                None
+                if use_host_network
+                else {
+                    f'{self._container_port}/tcp': [{'HostPort': str(self._host_port)}]
+                }
+            )
+
+            if use_host_network:
                logger.warn(
                    'Using host network mode. If you are using MacOS, please make sure you have the latest version of Docker Desktop and enabled host network feature: https://docs.docker.com/network/drivers/host/#docker-desktop'
                )
-            else:
-                port_mapping = {f'{self._port}/tcp': self._port}

-            if mount_dir is not None:
+            # Combine environment variables
+            environment = {
+                'port': str(self._container_port),
+                'PYTHONUNBUFFERED': 1,
+            }
+            if self.config.debug:
+                environment['DEBUG'] = 'true'
+
+            logger.info(f'Workspace Base: {self.config.workspace_base}')
+            if mount_dir is not None and sandbox_workspace_dir is not None:
+                # e.g. result would be: {"/home/user/openhands/workspace": {'bind': "/workspace", 'mode': 'rw'}}
                volumes = {mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}}
-                logger.info(f'Mount dir: {sandbox_workspace_dir}')
+                logger.info(f'Mount dir: {mount_dir}')
            else:
                logger.warn(
-                    'Mount dir is not set, will not mount the workspace directory to the container.'
+                    'Warning: Mount dir is not set, will not mount the workspace directory to the container!\n'
                )
                volumes = None
+            logger.info(f'Sandbox workspace: {sandbox_workspace_dir}')

            if self.config.sandbox.browsergym_eval_env is not None:
                browsergym_arg = (
@ -225,9 +260,9 @@ class EventStreamRuntime(Runtime):
                self.runtime_container_image,
                command=(
                    f'/openhands/miniforge3/bin/mamba run --no-capture-output -n base '
-                    'PYTHONUNBUFFERED=1 poetry run '
-                    f'python -u -m openhands.runtime.client.client {self._port} '
-                    f'--working-dir {sandbox_workspace_dir} '
+                    f'poetry run '
+                    f'python -u -m openhands.runtime.client.client {self._container_port} '
+                    f'--working-dir "{sandbox_workspace_dir}" '
                    f'{plugin_arg}'
                    f'--username {"openhands" if self.config.run_as_openhands else "root"} '
                    f'--user-id {self.config.sandbox.user_id} '
@ -235,24 +270,26 @@ class EventStreamRuntime(Runtime):
                ),
                network_mode=network_mode,
                ports=port_mapping,
-                working_dir='/openhands/code/',
+                working_dir='/openhands/code/',  # do not change this!
                name=self.container_name,
                detach=True,
-                environment={'DEBUG': 'true'} if self.config.debug else None,
+                environment=environment,
                volumes=volumes,
            )
            self.log_buffer = LogBuffer(container)
            logger.info(f'Container started. Server url: {self.api_url}')
            return container
        except Exception as e:
-            logger.error('Failed to start container')
+            logger.error(
+                f'Error: Instance {self.instance_id} FAILED to start container!\n'
+            )
            logger.exception(e)
            self.close(close_client=False)
            raise e

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(10),
-        wait=tenacity.wait_exponential(multiplier=2, min=10, max=60),
+        wait=tenacity.wait_exponential(multiplier=2, min=1, max=20),
        reraise=(ConnectionRefusedError,),
    )
    def _wait_until_alive(self):
@ -278,10 +315,11 @@ class EventStreamRuntime(Runtime):
            )

        if not self.log_buffer.client_ready:
+            time.sleep(1)
            attempts = 0
-            while not self.log_buffer.client_ready and attempts < 5:
+            while not self.log_buffer.client_ready and attempts < 4:
                attempts += 1
-                time.sleep(1)
+                time.sleep(2)
                logs = self.log_buffer.get_and_clear()
                if logs:
                    formatted_logs = '\n'.join([f'    |{log}' for log in logs])
@ -303,13 +341,8 @@ class EventStreamRuntime(Runtime):
            logger.error(msg)
            raise RuntimeError(msg)

-    @property
-    def sandbox_workspace_dir(self):
-        return self.config.workspace_mount_path_in_sandbox
-
    def close(self, close_client: bool = True, rm_all_containers: bool = True):
-        """
-        Closes the EventStreamRuntime and associated objects
+        """Closes the EventStreamRuntime and associated objects

        Parameters:
        - close_client (bool): Whether to close the DockerClient
@ -322,23 +355,29 @@ class EventStreamRuntime(Runtime):
        if self.session:
            self.session.close()

-        containers = self.docker_client.containers.list(all=True)
-        for container in containers:
-            try:
-                # If the app doesn't shut down properly, it can leave runtime containers on the system. This ensures
-                # that all 'openhands-sandbox-' containers are removed as well.
-                if rm_all_containers and container.name.startswith(
-                    self.container_name_prefix
-                ):
-                    container.remove(force=True)
-                elif container.name == self.container_name:
-                    logs = container.logs(tail=1000).decode('utf-8')
-                    logger.debug(
-                        f'==== Container logs ====\n{logs}\n==== End of container logs ===='
-                    )
-                    container.remove(force=True)
-            except docker.errors.NotFound:
-                pass
+        try:
+            containers = self.docker_client.containers.list(all=True)
+            for container in containers:
+                try:
+                    # If the app doesn't shut down properly, it can leave runtime containers on the system. This ensures
+                    # that all 'openhands-sandbox-' containers are removed as well.
+                    if rm_all_containers and container.name.startswith(
+                        self.container_name_prefix
+                    ):
+                        container.remove(force=True)
+                    elif container.name == self.container_name:
+                        if not self.skip_container_logs:
+                            logs = container.logs(tail=1000).decode('utf-8')
+                            logger.debug(
+                                f'==== Container logs on close ====\n{logs}\n==== End of container logs ===='
+                            )
+                        container.remove(force=True)
+                except docker.errors.APIError:
+                    pass
+                except docker.errors.NotFound:
+                    pass
+        except docker.errors.NotFound:  # yes, this can happen!
+            pass

        if close_client:
            self.docker_client.close()
@ -494,3 +533,20 @@ class EventStreamRuntime(Runtime):
            raise TimeoutError('List files operation timed out')
        except Exception as e:
            raise RuntimeError(f'List files operation failed: {str(e)}')
+
+    def _is_port_in_use_docker(self, port):
+        containers = self.docker_client.containers.list()
+        for container in containers:
+            container_ports = container.ports
+            if str(port) in str(container_ports):
+                return True
+        return False
+
+    def _find_available_port(self, max_attempts=5):
+        port = 39999
+        for _ in range(max_attempts):
+            port = find_available_tcp_port(30000, 39999)
+            if not self._is_port_in_use_docker(port):
+                return port
+        # If no port is found after max_attempts, return the last tried port
+        return port
--- a/openhands/runtime/plugins/jupyter/init.py
+++ b/openhands/runtime/plugins/jupyter/init.py
@ -19,7 +19,7 @@ class JupyterPlugin(Plugin):
    name: str = 'jupyter'

    async def initialize(self, username: str, kernel_id: str = 'openhands-default'):
-        self.kernel_gateway_port = find_available_tcp_port()
+        self.kernel_gateway_port = find_available_tcp_port(40000, 49999)
        self.kernel_id = kernel_id
        self.gateway_process = subprocess.Popen(
            (
--- a/openhands/runtime/remote/runtime.py
+++ b/openhands/runtime/remote/runtime.py
@ -142,7 +142,7 @@ class RemoteRuntime(Runtime):
                f'/openhands/miniforge3/bin/mamba run --no-capture-output -n base '
                'PYTHONUNBUFFERED=1 poetry run '
                f'python -u -m openhands.runtime.client.client {self.port} '
-                f'--working-dir {self.sandbox_workspace_dir} '
+                f'--working-dir {self.config.workspace_mount_path_in_sandbox} '
                f'{plugin_arg}'
                f'--username {"openhands" if self.config.run_as_openhands else "root"} '
                f'--user-id {self.config.sandbox.user_id} '
@ -203,10 +203,6 @@ class RemoteRuntime(Runtime):
            logger.warning(msg)
            raise RuntimeError(msg)

-    @property
-    def sandbox_workspace_dir(self):
-        return self.config.workspace_mount_path_in_sandbox
-
    def close(self):
        if self.runtime_id:
            try:
--- a/openhands/runtime/runtime.py
+++ b/openhands/runtime/runtime.py
@ -67,7 +67,6 @@ class Runtime:
        self.config = copy.deepcopy(config)
        self.DEFAULT_ENV_VARS = _default_env_vars(config.sandbox)
        atexit.register(self.close)
-        logger.debug(f'Runtime `{sid}`')

        if self.DEFAULT_ENV_VARS:
            logger.debug(f'Adding default env vars: {self.DEFAULT_ENV_VARS}')
--- a/openhands/runtime/utils/init.py
+++ b/openhands/runtime/utils/init.py
@ -1,4 +1,7 @@
 from openhands.runtime.utils.bash import split_bash_commands
-from openhands.runtime.utils.system import find_available_tcp_port
+from openhands.runtime.utils.system import (
+    display_number_matrix,
+    find_available_tcp_port,
+)

-__all__ = ['find_available_tcp_port', 'split_bash_commands']
+__all__ = ['display_number_matrix', 'find_available_tcp_port', 'split_bash_commands']
--- a/openhands/runtime/utils/runtime_build.py
+++ b/openhands/runtime/utils/runtime_build.py
@ -55,7 +55,7 @@ def _put_source_code_to_dir(temp_dir: str):
        ' ', r'\ '
    )  # escape spaces in the project root
    result = subprocess.run(
-        f'python -m build -s -o {temp_dir} {_cleaned_project_root}',
+        f'python -m build -s -o "{temp_dir}" {_cleaned_project_root}',
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
@ -142,13 +142,14 @@ def prep_docker_build_folder(
        skip_init=skip_init,
        extra_deps=extra_deps,
    )
-    logger.debug(
-        (
-            f'===== Dockerfile content start =====\n'
-            f'{dockerfile_content}\n'
-            f'===== Dockerfile content end ====='
+    if os.getenv('SKIP_CONTAINER_LOGS', 'false') != 'true':
+        logger.debug(
+            (
+                f'===== Dockerfile content start =====\n'
+                f'{dockerfile_content}\n'
+                f'===== Dockerfile content end ====='
+            )
        )
-    )
    with open(os.path.join(dir_path, 'Dockerfile'), 'w') as file:
        file.write(dockerfile_content)

--- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
@ -60,6 +60,7 @@ RUN cd /openhands/code && \
    {{ extra_deps }} {% if extra_deps %} && {% endif %} \
    /openhands/miniforge3/bin/mamba run -n base poetry cache clear --all . && \
    {% if not skip_init %}chmod -R g+rws /openhands/poetry && {% endif %} \
+    mkdir -p /openhands/workspace && chmod -R g+rws,o+rw /openhands/workspace && \
    apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    /openhands/miniforge3/bin/mamba clean --all

--- a/openhands/runtime/utils/system.py
+++ b/openhands/runtime/utils/system.py
@ -1,17 +1,34 @@
+import random
 import socket
+import time


-def find_available_tcp_port() -> int:
-    """Find an available TCP port, return -1 if none available."""
-    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    try:
-        sock.bind(('localhost', 0))
-        port = sock.getsockname()[1]
-        return port
-    except Exception:
-        return -1
-    finally:
-        sock.close()
+def find_available_tcp_port(min_port=30000, max_port=39999, max_attempts=10) -> int:
+    """Find an available TCP port in a specified range.
+
+    Up to `max_attempts` distinct ports are drawn at random from
+    [min_port, max_port] and probed by binding a socket on localhost. The
+    probe socket is closed before returning, so the port is only known to
+    have been free at the time of the check.
+
+    Args:
+        min_port (int): The lower bound of the port range (default: 30000)
+        max_port (int): The upper bound of the port range (default: 39999)
+        max_attempts (int): Maximum number of attempts to find an available port (default: 10)
+
+    Returns:
+        int: An available port number, or -1 if none found after max_attempts
+            (also -1 for an empty range or a non-positive max_attempts)
+    """
+    port_range = range(min_port, max_port + 1)
+    # Draw only the candidates that will actually be probed instead of
+    # materializing and shuffling the whole range. Clamping keeps an empty
+    # range or a non-positive max_attempts from probing anything.
+    attempts = max(0, min(max_attempts, len(port_range)))
+    candidates = random.SystemRandom().sample(port_range, attempts)
+
+    for port in candidates:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        try:
+            sock.bind(('localhost', port))
+            return port
+        except OSError:
+            time.sleep(0.1)  # Short delay to further reduce chance of collisions
+        finally:
+            sock.close()
+    return -1


 def display_number_matrix(number: int) -> str | None:
--- a/pytest.ini
+++ b/pytest.ini
@ -1,2 +1,3 @@
 [pytest]
 addopts = -p no:warnings
+asyncio_default_fixture_loop_scope = function
--- a/tests/integration/regenerate.sh
+++ b/tests/integration/regenerate.sh
@ -5,6 +5,8 @@ set -eo pipefail
 ##           CONSTANTS AND ENVIRONMENTAL VARIABLES          ##
 ##############################################################

+echo -e "\n\n============================================================"
+
 # unset environmental variables that might disturb testing
 unset OPENAI_API_KEY
 unset SANDBOX_ENV_OPENAI_API_KEY
@ -16,7 +18,7 @@ get_script_dir() {
    local source="${BASH_SOURCE[0]}"
    while [ -h "$source" ]; do
        local dir="$( cd -P "$( dirname "$source" )" && pwd )"
-        source="$(readlink "$source")"
+        source="$(readlink -f "$source" 2>/dev/null || echo "$source")"
        [[ $source != /* ]] && source="$dir/$source"
    done
    echo "$( cd -P "$( dirname "$source" )" && pwd )"
@ -27,9 +29,6 @@ TMP_FILE="${TMP_FILE:-tmp.log}"
 if [ -z "$WORKSPACE_BASE" ]; then
  WORKSPACE_BASE=$(pwd)
 fi
-if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
-  WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE
-fi

 DEBUG=true  # needed for llm logging to create mock files!

@ -39,7 +38,7 @@ fi

 export SCRIPT_DIR=$(get_script_dir)
 export PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..")
-export LOG_DIR=$PROJECT_ROOT/logs
+export LOG_DIR="$PROJECT_ROOT/logs"
 echo "Current working directory: $(pwd)"
 echo "SCRIPT_DIR: $SCRIPT_DIR"
 echo "PROJECT_ROOT: $PROJECT_ROOT"
@ -47,22 +46,29 @@ echo "LOG_DIR: $LOG_DIR"
 echo "LOG_TO_FILE: $LOG_TO_FILE"

 WORKSPACE_BASE=${WORKSPACE_BASE}/_test_workspace
-mkdir -p $WORKSPACE_BASE
-chmod -R 777 $WORKSPACE_BASE
-WORKSPACE_BASE=$(realpath $WORKSPACE_BASE)
+mkdir -p "$WORKSPACE_BASE"
+chmod -R 777 "$WORKSPACE_BASE"
+WORKSPACE_BASE=$(realpath "$WORKSPACE_BASE")

-WORKSPACE_MOUNT_PATH=${WORKSPACE_MOUNT_PATH}/_test_workspace
-mkdir -p $WORKSPACE_MOUNT_PATH
-chmod -R 777 $WORKSPACE_MOUNT_PATH
-WORKSPACE_MOUNT_PATH=$(realpath $WORKSPACE_MOUNT_PATH)
+if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
+  WORKSPACE_MOUNT_PATH="$WORKSPACE_BASE"
+else
+  WORKSPACE_MOUNT_PATH="${WORKSPACE_MOUNT_PATH}/_test_workspace"
+  mkdir -p "$WORKSPACE_MOUNT_PATH"
+  chmod -R 755 "$WORKSPACE_MOUNT_PATH"
+  WORKSPACE_MOUNT_PATH=$(realpath "$WORKSPACE_MOUNT_PATH")
+fi
+
+WORKSPACE_MOUNT_PATH_IN_SANDBOX="${WORKSPACE_MOUNT_PATH_IN_SANDBOX:-/workspace}"

 echo "WORKSPACE_BASE: $WORKSPACE_BASE"
 echo "WORKSPACE_MOUNT_PATH: $WORKSPACE_MOUNT_PATH"
+echo "WORKSPACE_MOUNT_PATH_IN_SANDBOX: $WORKSPACE_MOUNT_PATH_IN_SANDBOX"

 # Ensure we're in the correct directory
 cd "$PROJECT_ROOT" || exit 1

-mkdir -p $WORKSPACE_BASE
+mkdir -p "$WORKSPACE_BASE"

 # use environmental variable if exists
 TEST_RUNTIME="${TEST_RUNTIME:-eventstream}"
@ -178,7 +184,7 @@ cleanup() {
    kill $HTTP_SERVER_PID || true
    unset HTTP_SERVER_PID
  fi
-  [ -f $TMP_FILE ] && rm $TMP_FILE
+  [ -f "$TMP_FILE" ] && rm "$TMP_FILE"
  echo "Cleanup done!"
 }

@ -200,14 +206,14 @@ regenerate_without_llm() {
      PROJECT_ROOT="$PROJECT_ROOT" \
      WORKSPACE_BASE="$WORKSPACE_BASE" \
      WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
-      MAX_ITERATIONS=$MAX_ITERATIONS \
+      MAX_ITERATIONS="$MAX_ITERATIONS" \
      FORCE_APPLY_PROMPTS=true \
-      DEFAULT_AGENT=$agent \
+      DEFAULT_AGENT="$agent" \
      TEST_RUNTIME="$TEST_RUNTIME" \
-      LLM=$LLM \
-      DEBUG=$DEBUG \
-      LOG_TO_FILE=$LOG_TO_FILE \
-      FORCE_REGENERATE=$FORCE_REGENERATE \
+      LLM="$LLM" \
+      DEBUG="$DEBUG" \
+      LOG_TO_FILE="$LOG_TO_FILE" \
+      FORCE_REGENERATE="$FORCE_REGENERATE" \
      SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
      poetry run pytest -s "$SCRIPT_DIR/test_agent.py::$test_name"
  set +x
@ -216,12 +222,12 @@ regenerate_without_llm() {
 regenerate_with_llm() {
  cd "$PROJECT_ROOT"

-  rm -rf $WORKSPACE_BASE/*
+  # Keep the glob outside the quotes so it expands; `:?` aborts if the variable is empty.
+  rm -rf "${WORKSPACE_BASE:?}"/*
  if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
-    cp -r "$SCRIPT_DIR/workspace/$test_name"/* $WORKSPACE_BASE
+    cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
  fi

-  rm -rf logs
+  rm -rf "$LOG_DIR"
  rm -rf "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/*"
  # set -x to print the command being executed
  set -x
@ -233,12 +239,12 @@ regenerate_with_llm() {
      DEFAULT_AGENT=$agent \
      RUNTIME="$TEST_RUNTIME" \
      SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
-      LLM=$LLM \
-      DEBUG=$DEBUG \
-      LOG_TO_FILE=$LOG_TO_FILE \
-      FORCE_REGENERATE=$FORCE_REGENERATE \
+      LLM="$LLM" \
+      DEBUG="$DEBUG" \
+      LOG_TO_FILE="$LOG_TO_FILE" \
+      FORCE_REGENERATE="$FORCE_REGENERATE" \
      poetry run python "$PROJECT_ROOT/openhands/core/main.py" \
-      -i $MAX_ITERATIONS \
+      -i "$MAX_ITERATIONS" \
      -t "$task Do not ask me for confirmation at any point." \
      -c $agent
  set +x
@ -256,8 +262,8 @@ if [ "$num_of_tests" -ne "${#test_names[@]}" ]; then
  exit 1
 fi

-rm -rf logs
-rm -rf $WORKSPACE_BASE/*
+rm -rf "$LOG_DIR"
+# Keep the glob outside the quotes so it expands; `:?` aborts if the variable is empty.
+rm -rf "${WORKSPACE_BASE:?}"/*
 for ((i = 0; i < num_of_tests; i++)); do
  task=${tasks[i]}
  test_name=${test_names[i]}
@ -286,9 +292,9 @@ for ((i = 0; i < num_of_tests; i++)); do
    cd "$PROJECT_ROOT/tests"
    cd "$PROJECT_ROOT"

-    rm -rf $WORKSPACE_BASE/*
+    # Keep the glob outside the quotes so it expands; `:?` aborts if the variable is empty.
+    rm -rf "${WORKSPACE_BASE:?}"/*
    if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
-      cp -r "$SCRIPT_DIR/workspace/$test_name"/* $WORKSPACE_BASE
+      cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
    fi

    if [ "$TEST_ONLY" ]; then
@ -395,7 +401,7 @@ for ((i = 0; i < num_of_tests; i++)); do
  fi
 done

-rm -rf logs
-rm -rf $WORKSPACE_BASE
+rm -rf "$LOG_DIR"
+rm -rf "$WORKSPACE_BASE"
 echo "Done!"
 cd "$PROJECT_ROOT"
--- a/tests/integration/test_agent.py
+++ b/tests/integration/test_agent.py
@ -6,7 +6,7 @@ import subprocess
 import pytest

 from openhands.controller.state.state import State
-from openhands.core.config import AppConfig, SandboxConfig, load_from_env
+from openhands.core.config import load_app_config
 from openhands.core.main import run_controller
 from openhands.core.schema import AgentState
 from openhands.events.action import (
@ -21,36 +21,23 @@ TEST_RUNTIME = os.getenv('TEST_RUNTIME')
 assert TEST_RUNTIME in ['eventstream', 'remote']
 _ = get_runtime_cls(TEST_RUNTIME)  # make sure it does not raise an error

-CONFIG = AppConfig(
-    max_iterations=int(os.getenv('MAX_ITERATIONS', 20)),
-    max_budget_per_task=int(os.getenv('MAX_BUDGET_PER_TASK', 15)),
-    runtime=TEST_RUNTIME,
-    default_agent=os.getenv('DEFAULT_AGENT'),
-    workspace_base=os.getenv('WORKSPACE_BASE'),
-    workspace_mount_path=os.getenv('WORKSPACE_MOUNT_PATH'),
-    sandbox=SandboxConfig(
-        use_host_network=True,
-    ),
+CONFIG = load_app_config()
+CONFIG.max_iterations = int(os.getenv('MAX_ITERATIONS', 20))
+CONFIG.max_budget_per_task = int(os.getenv('MAX_BUDGET_PER_TASK', 15))
+CONFIG.runtime = TEST_RUNTIME
+CONFIG.default_agent = os.getenv('DEFAULT_AGENT')
+CONFIG.workspace_base = os.getenv('WORKSPACE_BASE')
+CONFIG.workspace_mount_path = os.getenv('WORKSPACE_MOUNT_PATH')
+CONFIG.workspace_mount_path_in_sandbox = os.getenv(
+    'WORKSPACE_MOUNT_PATH_IN_SANDBOX', '/workspace'
 )
-load_from_env(CONFIG, os.environ)
+CONFIG.sandbox.use_host_network = True

 print('\nPaths used:')
 print(f'workspace_base: {CONFIG.workspace_base}')
 print(f'workspace_mount_path: {CONFIG.workspace_mount_path}')
 print(f'workspace_mount_path_in_sandbox: {CONFIG.workspace_mount_path_in_sandbox}')

-# Check if running in WSL environment
-if 'WSL_DISTRO_NAME' in os.environ:
-    if (
-        CONFIG.workspace_base
-        and CONFIG.workspace_mount_path
-        and CONFIG.workspace_base != CONFIG.workspace_mount_path
-    ):
-        print(
-            '\n**********\nWARNING: if WORKSPACE_MOUNT_PATH is set differently to'
-            '\nWORKSPACE_BASE some file operation tests may fail!\n**********\n'
-        )
-

 def get_number_of_prompts(test_name: str):
    mock_dir = os.path.join(
--- a/tests/runtime/conftest.py
+++ b/tests/runtime/conftest.py
@ -1,11 +1,15 @@
 import os
 import random
+import shutil
+import stat
 import time
+from pathlib import Path

 import pytest
 from pytest import TempPathFactory

-from openhands.core.config import AppConfig, SandboxConfig, load_from_env
+from openhands.core.config import load_app_config
+from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventStream
 from openhands.runtime.client.runtime import EventStreamRuntime
 from openhands.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
@ -13,19 +17,86 @@ from openhands.runtime.remote.runtime import RemoteRuntime
 from openhands.runtime.runtime import Runtime
 from openhands.storage import get_file_store

+TEST_IN_CI = os.getenv('TEST_IN_CI', 'False').lower() in ['true', '1', 'yes']
+TEST_RUNTIME = os.getenv('TEST_RUNTIME', 'eventstream').lower()
+RUN_AS_OPENHANDS = os.getenv('RUN_AS_OPENHANDS', 'True').lower() in ['true', '1', 'yes']
+test_mount_path = ''
+project_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
+sandbox_test_folder = '/openhands/workspace'
+
+
+def _get_runtime_sid(runtime: Runtime):
+    """Log the runtime's `sid` at debug level and return it."""
+    logger.debug(f'\nruntime.sid: {runtime.sid}')
+    return runtime.sid
+
+
+def _get_host_folder(runtime: Runtime):
+    """Return `workspace_mount_path` from the runtime's config."""
+    return runtime.config.workspace_mount_path
+
+
+def _get_sandbox_folder(runtime: Runtime):
+    """Return the per-runtime folder below `sandbox_test_folder` as a `Path`.
+
+    The folder is `sandbox_test_folder` joined with the runtime's `sid`.
+    Returns None when the runtime has no (truthy) `sid`.
+    """
+    sid = _get_runtime_sid(runtime)
+    if sid:
+        return Path(os.path.join(sandbox_test_folder, sid))
+    return None
+
+
+def _remove_folder(folder: str) -> bool:
+    """Delete the directory `folder` and report whether it was removed.
+
+    Returns False, without logging, when `folder` is empty or is not an
+    existing directory. Otherwise the outcome is logged at debug level.
+    """
+    success = False
+    if folder and os.path.isdir(folder):
+        try:
+            # Cheap path first: succeeds only when the directory is already empty
+            os.rmdir(folder)
+            success = True
+        except OSError:
+            try:
+                # rmdir failed (e.g. directory not empty): remove it recursively
+                shutil.rmtree(folder)
+                success = True
+            except OSError:
+                # Best effort: the failure is only reported via the log line below
+                pass
+        logger.debug(f'\nCleanup: `{folder}`: ' + ('[OK]' if success else '[FAILED]'))
+    return success
+
+
+def _close_test_runtime(runtime: Runtime):
+    """Close `runtime` at the end of a test, then pause for one second.
+
+    An `EventStreamRuntime` is closed with `rm_all_containers=False`; any other
+    runtime is closed with its default arguments.
+    """
+    if isinstance(runtime, EventStreamRuntime):
+        # NOTE(review): presumably so that containers belonging to other tests
+        # running in parallel are left alone -- confirm against close().
+        runtime.close(rm_all_containers=False)
+    else:
+        runtime.close()
+    time.sleep(1)
+
+
+def _reset_pwd():
+    """Change the working directory back to `project_dir` (best effort).
+
+    Any failure is logged as an error instead of being raised, so callers
+    (fixture setup/teardown) are never interrupted by it.
+    """
+    # `project_dir` is only read here, so no `global` declaration is needed.
+    try:
+        os.chdir(project_dir)
+        logger.info(f'Changed back to project directory `{project_dir}`')
+    except Exception as e:
+        logger.error(f'Failed to change back to project directory: {e}')
+
+
+# *****************************************************************************
+# *****************************************************************************
+

@pytest.fixture(autouse=True)
 def print_method_name(request):
-    print('\n########################################################################')
+    print(
+        '\n\n########################################################################'
+    )
    print(f'Running test: {request.node.name}')
-    print('########################################################################')
-    yield
+    print(
+        '########################################################################\n\n'
+    )


@pytest.fixture
-def temp_dir(tmp_path_factory: TempPathFactory) -> str:
-    """
-    Creates a unique temporary directory
+def temp_dir(tmp_path_factory: TempPathFactory, request) -> str:
+    """Creates a unique temporary directory.
+    Upon finalization, the temporary directory and its content is removed.
+    The cleanup function is also called upon KeyboardInterrupt.

    Parameters:
    - tmp_path_factory (TempPathFactory): A TempPathFactory class
@ -33,15 +104,23 @@ def temp_dir(tmp_path_factory: TempPathFactory) -> str:
    Returns:
    - str: The temporary directory path that was created
    """
-
-    unique_suffix = random.randint(10000, 99999)
-    temp_directory = tmp_path_factory.mktemp(
-        f'test_runtime_{unique_suffix}', numbered=False
+    temp_dir = tmp_path_factory.mktemp(
+        'rt_' + str(random.randint(100000, 999999)), numbered=False
    )
-    return str(temp_directory)

+    logger.info(f'\n*** {request.node.name}\n>> temp folder: {temp_dir}\n')

-TEST_RUNTIME = os.getenv('TEST_RUNTIME', 'eventstream')
+    # Set permissions to ensure the directory is writable and deletable
+    os.chmod(temp_dir, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)  # 0777 permissions
+
+    def cleanup():
+        global project_dir
+        os.chdir(project_dir)
+        _remove_folder(temp_dir)
+
+    request.addfinalizer(cleanup)
+
+    return str(temp_dir)


 # Depending on TEST_RUNTIME, feed the appropriate box class(es) to the test.
@ -55,28 +134,47 @@ def get_box_classes():
        raise ValueError(f'Invalid runtime: {runtime}')


+def get_run_as_openhands():
+    """Print which sandbox user the tests run as and return the fixture params.
+
+    Returns:
+        list[bool]: A one-element list holding `RUN_AS_OPENHANDS`, used as the
+            `params` of the `run_as_openhands` fixture.
+    """
+    print(
+        '\n\n########################################################################'
+    )
+    # Parenthesize the conditional: `+` binds tighter than `if/else`, so without
+    # the parentheses the root case printed just 'root' with no 'USER: ' prefix.
+    print('USER: ' + ('openhands' if RUN_AS_OPENHANDS else 'root'))
+    print(
+        '########################################################################\n\n'
+    )
+    return [RUN_AS_OPENHANDS]
+
+
+@pytest.fixture(scope='module')  # for xdist
+def runtime_setup_module():
+    """Module-scoped fixture: reset the working directory via `_reset_pwd()`
+    before the requesting module's tests run and again at teardown."""
+    _reset_pwd()
+    yield
+    _reset_pwd()
+
+
+@pytest.fixture(scope='session')  # not for xdist
+def runtime_setup_session():
+    """Session-scoped fixture: reset the working directory via `_reset_pwd()`
+    once before the test session and once at session teardown."""
+    _reset_pwd()
+    yield
+    _reset_pwd()
+
+
 # This assures that all tests run together per runtime, not alternating between them,
 # which cause errors (especially outside GitHub actions).
@pytest.fixture(scope='module', params=get_box_classes())
 def box_class(request):
-    time.sleep(2)
+    time.sleep(1)
    return request.param


 # TODO: We will change this to `run_as_user` when `ServerRuntime` is deprecated.
 # since `EventStreamRuntime` supports running as an arbitrary user.
-@pytest.fixture(scope='module', params=[True, False])
+@pytest.fixture(scope='module', params=get_run_as_openhands())
 def run_as_openhands(request):
    time.sleep(1)
    return request.param


-@pytest.fixture(scope='module', params=[True, False])
-def enable_auto_lint(request):
-    time.sleep(1)
-    return request.param
-
-
@pytest.fixture(scope='module', params=None)
 def base_container_image(request):
    time.sleep(1)
@ -96,21 +194,12 @@ def base_container_image(request):
        if request.param is None:
            request.param = pytest.param(
                'nikolaik/python-nodejs:python3.11-nodejs22',
-                'python:3.11-bookworm',
-                'node:22-bookworm',
                'golang:1.23-bookworm',
            )
    print(f'Container image: {request.param}')
    return request.param


-@pytest.fixture
-def runtime(temp_dir, box_class, run_as_openhands):
-    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
-    yield runtime
-    time.sleep(1)
-
-
 def _load_runtime(
    temp_dir,
    box_class,
@ -118,29 +207,45 @@ def _load_runtime(
    enable_auto_lint: bool = False,
    base_container_image: str | None = None,
    browsergym_eval_env: str | None = None,
+    use_workspace: bool | None = None,
 ) -> Runtime:
-    sid = 'test'
-    cli_session = 'main_test'
+    sid = 'rt_' + str(random.randint(100000, 999999))

    # AgentSkills need to be initialized **before** Jupyter
    # otherwise Jupyter will not access the proper dependencies installed by AgentSkills
    plugins = [AgentSkillsRequirement(), JupyterRequirement()]
-    config = AppConfig(
-        workspace_base=temp_dir,
-        workspace_mount_path=temp_dir,
-        sandbox=SandboxConfig(
-            use_host_network=True,
-            browsergym_eval_env=browsergym_eval_env,
-        ),
-    )
-    load_from_env(config, os.environ)
+
+    config = load_app_config()
    config.run_as_openhands = run_as_openhands
+
+    # Folder where all tests create their own folder
+    global test_mount_path
+    if use_workspace:
+        test_mount_path = os.path.join(config.workspace_base, 'rt')
+    else:
+        test_mount_path = os.path.join(
+            temp_dir, sid
+        )  # need a subfolder to avoid conflicts
+    config.workspace_mount_path = test_mount_path
+
+    # Mounting folder specific for this test inside the sandbox
+    config.workspace_mount_path_in_sandbox = f'{sandbox_test_folder}/{sid}'
+    print('\nPaths used:')
+    print(f'use_host_network: {config.sandbox.use_host_network}')
+    print(f'workspace_base: {config.workspace_base}')
+    print(f'workspace_mount_path: {config.workspace_mount_path}')
+    print(
+        f'workspace_mount_path_in_sandbox: {config.workspace_mount_path_in_sandbox}\n'
+    )
+
+    config.sandbox.browsergym_eval_env = browsergym_eval_env
    config.sandbox.enable_auto_lint = enable_auto_lint
+
    if base_container_image is not None:
        config.sandbox.base_container_image = base_container_image

    file_store = get_file_store(config.file_store, config.file_store_path)
-    event_stream = EventStream(cli_session, file_store)
+    event_stream = EventStream(sid, file_store)

    runtime = box_class(
        config=config,
@ -148,9 +253,14 @@ def _load_runtime(
        sid=sid,
        plugins=plugins,
    )
-    time.sleep(1)
+    time.sleep(2)
    return runtime


 # Export necessary function
-__all__ = ['_load_runtime']
+__all__ = [
+    '_load_runtime',
+    '_get_host_folder',
+    '_get_sandbox_folder',
+    '_remove_folder',
+]
--- a/tests/runtime/test_bash.py
+++ b/tests/runtime/test_bash.py
@ -1,11 +1,14 @@
 """Bash-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""

 import os
-import tempfile
-import time

 import pytest
-from conftest import _load_runtime
+from conftest import (
+    TEST_IN_CI,
+    _close_test_runtime,
+    _get_sandbox_folder,
+    _load_runtime,
+)

 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import CmdRunAction
@ -16,77 +19,63 @@ from openhands.events.observation import CmdOutputObservation
 # ============================================================================================================================


+def _run_cmd_action(runtime, custom_command: str, keep_prompt=True):
+    """Run `custom_command` in `runtime` as a `CmdRunAction` and return the observation.
+
+    The action and the resulting observation are both logged. The result is
+    asserted to be a `CmdOutputObservation`, so callers can read `exit_code`
+    and `content` directly.
+    """
+    action = CmdRunAction(command=custom_command, keep_prompt=keep_prompt)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # Fail fast (before logging) if the runtime returned another observation type
+    assert isinstance(obs, CmdOutputObservation)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    return obs
+
+
 def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands):
    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        # We set env var PS1="\u@\h:\w $"
+        # and construct the PEXCEPT prompt base on it.
+        # When run `env`, bad implementation of CmdRunAction will be pexcepted by this
+        # and failed to pexcept the right content, causing it fail to get error code.
+        obs = runtime.run_action(CmdRunAction(command='env'))

-    # We set env var PS1="\u@\h:\w $"
-    # and construct the PEXCEPT prompt base on it.
-    # When run `env`, bad implementation of CmdRunAction will be pexcepted by this
-    # and failed to pexcept the right content, causing it fail to get error code.
-    obs = runtime.run_action(CmdRunAction(command='env'))
+        # For example:
+        # 02:16:13 - openhands:DEBUG: client.py:78 - Executing command: env
+        # 02:16:13 - openhands:DEBUG: client.py:82 - Command output: PYTHONUNBUFFERED=1
+        # CONDA_EXE=/openhands/miniforge3/bin/conda
+        # [...]
+        # LC_CTYPE=C.UTF-8
+        # PS1=\u@\h:\w $
+        # 02:16:13 - openhands:DEBUG: client.py:89 - Executing command for exit code: env
+        # 02:16:13 - openhands:DEBUG: client.py:92 - Exit code Output:
+        # CONDA_DEFAULT_ENV=base

-    # For example:
-    # 02:16:13 - openhands:DEBUG: client.py:78 - Executing command: env
-    # 02:16:13 - openhands:DEBUG: client.py:82 - Command output: PYTHONUNBUFFERED=1
-    # CONDA_EXE=/openhands/miniforge3/bin/conda
-    # [...]
-    # LC_CTYPE=C.UTF-8
-    # PS1=\u@\h:\w $
-    # 02:16:13 - openhands:DEBUG: client.py:89 - Executing command for exit code: env
-    # 02:16:13 - openhands:DEBUG: client.py:92 - Exit code Output:
-    # CONDA_DEFAULT_ENV=base
-
-    # As long as the exit code is 0, the test will pass.
-    assert isinstance(
-        obs, CmdOutputObservation
-    ), 'The observation should be a CmdOutputObservation.'
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        # As long as the exit code is 0, the test will pass.
+        assert isinstance(
+            obs, CmdOutputObservation
+        ), 'The observation should be a CmdOutputObservation.'
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+    finally:
+        _close_test_runtime(runtime)


-def test_single_multiline_command(temp_dir, box_class):
+def test_multiline_commands(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
+    try:
+        # single multiline command
+        obs = _run_cmd_action(runtime, 'echo \\\n -e "foo"')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert 'foo' in obs.content

-    action = CmdRunAction(command='echo \\\n -e "foo"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert 'foo' in obs.content
+        # test multiline echo
+        obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert 'hello\r\nworld' in obs.content

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
-
-
-def test_multiline_echo(temp_dir, box_class):
-    runtime = _load_runtime(temp_dir, box_class)
-
-    action = CmdRunAction(command='echo -e "hello\nworld"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert 'hello\r\nworld' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
-
-
-def test_runtime_whitespace(temp_dir, box_class):
-    runtime = _load_runtime(temp_dir, box_class)
-
-    action = CmdRunAction(command='echo -e "\\n\\n\\n"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert '\r\n\r\n\r\n' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        # test whitespace
+        obs = _run_cmd_action(runtime, 'echo -e "\\n\\n\\n"')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert '\r\n\r\n\r\n' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_multiple_multiline_commands(temp_dir, box_class, run_as_openhands):
@ -120,48 +109,36 @@ world "
    joined_cmds = '\n'.join(cmds)

    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        obs = _run_cmd_action(runtime, joined_cmds)
+        assert obs.exit_code == 0, 'The exit code should be 0.'

-    action = CmdRunAction(command=joined_cmds)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-
-    assert 'total 0' in obs.content
-    assert 'hello\r\nworld' in obs.content
-    assert "hello it\\'s me" in obs.content
-    assert 'hello -v' in obs.content
-    assert 'hello\r\nworld\r\nare\r\nyou\r\nthere?' in obs.content
-    assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content
-    assert 'hello\r\nworld "\r\n' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        assert 'total 0' in obs.content
+        assert 'hello\r\nworld' in obs.content
+        assert "hello it\\'s me" in obs.content
+        assert 'hello -v' in obs.content
+        assert 'hello\r\nworld\r\nare\r\nyou\r\nthere?' in obs.content
+        assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content
+        assert 'hello\r\nworld "\r\n' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_no_ps2_in_output(temp_dir, box_class, run_as_openhands):
    """Test that the PS2 sign is not added to the output of a multiline command."""
    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"')
+        assert obs.exit_code == 0, 'The exit code should be 0.'

-    action = CmdRunAction(command='echo -e "hello\nworld"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert 'hello\r\nworld' in obs.content
-    assert '>' not in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        assert 'hello\r\nworld' in obs.content
+        assert '>' not in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_multiline_command_loop(temp_dir, box_class):
    # https://github.com/All-Hands-AI/OpenHands/issues/3143
-
-    runtime = _load_runtime(temp_dir, box_class)
-
    init_cmd = """
 mkdir -p _modules && \
 for month in {01..04}; do
@ -171,15 +148,6 @@ for month in {01..04}; do
 done
 echo "created files"
 """
-    action = CmdRunAction(command=init_cmd)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert 'created files' in obs.content
-
    follow_up_cmd = """
 for file in _modules/*.md; do
    new_date=$(echo $file | sed -E 's/2024-(01|02|03|04)-/2024-/;s/2024-01/2024-08/;s/2024-02/2024-09/;s/2024-03/2024-10/;s/2024-04/2024-11/')
@ -187,153 +155,104 @@ for file in _modules/*.md; do
 done
 echo "success"
 """
-    action = CmdRunAction(command=follow_up_cmd)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    runtime = _load_runtime(temp_dir, box_class)
+    try:
+        obs = _run_cmd_action(runtime, init_cmd)
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert 'created files' in obs.content

-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert 'success' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, follow_up_cmd)
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert 'success' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_cmd_run(temp_dir, box_class, run_as_openhands):
    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        obs = _run_cmd_action(runtime, 'ls -l /openhands/workspace')
+        assert obs.exit_code == 0

-    action = CmdRunAction(command='ls -l')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'total 0' in obs.content
+        obs = _run_cmd_action(runtime, 'ls -l')
+        assert obs.exit_code == 0
+        assert 'total 0' in obs.content

-    action = CmdRunAction(command='mkdir test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        obs = _run_cmd_action(runtime, 'mkdir test')
+        assert obs.exit_code == 0

-    action = CmdRunAction(command='ls -l')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    if run_as_openhands:
-        assert 'openhands' in obs.content
-    else:
-        assert 'root' in obs.content
-    assert 'test' in obs.content
+        obs = _run_cmd_action(runtime, 'ls -l')
+        assert obs.exit_code == 0
+        if run_as_openhands:
+            assert 'openhands' in obs.content
+        else:
+            assert 'root' in obs.content
+        assert 'test' in obs.content

-    action = CmdRunAction(command='touch test/foo.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        obs = _run_cmd_action(runtime, 'touch test/foo.txt')
+        assert obs.exit_code == 0

-    action = CmdRunAction(command='ls -l test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'foo.txt' in obs.content
+        obs = _run_cmd_action(runtime, 'ls -l test')
+        assert obs.exit_code == 0
+        assert 'foo.txt' in obs.content

-    # clean up: this is needed, since CI will not be
-    # run as root, and this test may leave a file
-    # owned by root
-    action = CmdRunAction(command='rm -rf test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        # clean up: this is needed, since CI will not be
+        # run as root, and this test may leave a file
+        # owned by root
+        obs = _run_cmd_action(runtime, 'rm -rf test')
+        assert obs.exit_code == 0
+    finally:
+        _close_test_runtime(runtime)


 def test_run_as_user_correct_home_dir(temp_dir, box_class, run_as_openhands):
    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
-
-    action = CmdRunAction(command='cd ~ && pwd')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    if run_as_openhands:
-        assert '/home/openhands' in obs.content
-    else:
-        assert '/root' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    try:
+        obs = _run_cmd_action(runtime, 'cd ~ && pwd')
+        assert obs.exit_code == 0
+        if run_as_openhands:
+            assert '/home/openhands' in obs.content
+        else:
+            assert '/root' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_multi_cmd_run_in_single_line(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
-
-    action = CmdRunAction(command='pwd && ls -l')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert '/workspace' in obs.content
-    assert 'total 0' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    try:
+        obs = _run_cmd_action(runtime, 'pwd && ls -l')
+        assert obs.exit_code == 0
+        assert '/workspace' in obs.content
+        assert 'total 0' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_stateful_cmd(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
+    sandbox_dir = _get_sandbox_folder(runtime)
+    try:
+        obs = _run_cmd_action(runtime, 'mkdir -p test')
+        assert obs.exit_code == 0, 'The exit code should be 0.'

-    action = CmdRunAction(command='mkdir test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
+        obs = _run_cmd_action(runtime, 'cd test')
+        assert obs.exit_code == 0, 'The exit code should be 0.'

-    action = CmdRunAction(command='cd test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-
-    action = CmdRunAction(command='pwd')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert '/workspace/test' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, 'pwd')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert f'{sandbox_dir}/test' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_failed_cmd(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
-
-    action = CmdRunAction(command='non_existing_command')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code != 0, 'The exit code should not be 0 for a failed command.'
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    try:
+        obs = _run_cmd_action(runtime, 'non_existing_command')
+        assert obs.exit_code != 0, 'The exit code should not be 0 for a failed command.'
+    finally:
+        _close_test_runtime(runtime)


 def _create_test_file(host_temp_dir):
@ -344,154 +263,121 @@ def _create_test_file(host_temp_dir):

 def test_copy_single_file(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)
+        sandbox_file = os.path.join(sandbox_dir, 'test_file.txt')
+        _create_test_file(temp_dir)
+        runtime.copy_to(os.path.join(temp_dir, 'test_file.txt'), sandbox_dir)

-    with tempfile.TemporaryDirectory() as host_temp_dir:
-        _create_test_file(host_temp_dir)
-        runtime.copy_to(os.path.join(host_temp_dir, 'test_file.txt'), '/workspace')
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
+        assert obs.exit_code == 0
+        assert 'test_file.txt' in obs.content

-    action = CmdRunAction(command='ls -alh /workspace')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'test_file.txt' in obs.content
-
-    action = CmdRunAction(command='cat /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'Hello, World!' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, f'cat {sandbox_file}')
+        assert obs.exit_code == 0
+        assert 'Hello, World!' in obs.content
+    finally:
+        _close_test_runtime(runtime)


-def _create_test_dir_with_files(host_temp_dir):
-    os.mkdir(os.path.join(host_temp_dir, 'test_dir'))
-    with open(os.path.join(host_temp_dir, 'test_dir', 'file1.txt'), 'w') as f:
+def _create_host_test_dir_with_files(test_dir):
+    logger.debug(f'creating `{test_dir}`')
+    if not os.path.isdir(test_dir):
+        os.makedirs(test_dir, exist_ok=True)
+    logger.debug(f'creating test files in `{test_dir}`')
+    with open(os.path.join(test_dir, 'file1.txt'), 'w') as f:
        f.write('File 1 content')
-    with open(os.path.join(host_temp_dir, 'test_dir', 'file2.txt'), 'w') as f:
+    with open(os.path.join(test_dir, 'file2.txt'), 'w') as f:
        f.write('File 2 content')


 def test_copy_directory_recursively(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)

-    with tempfile.TemporaryDirectory() as host_temp_dir:
+    sandbox_dir = _get_sandbox_folder(runtime)
+    try:
+        temp_dir_copy = os.path.join(temp_dir, 'test_dir')
        # We need a separate directory, since temp_dir is mounted to /workspace
-        _create_test_dir_with_files(host_temp_dir)
-        runtime.copy_to(
-            os.path.join(host_temp_dir, 'test_dir'), '/workspace', recursive=True
-        )
+        _create_host_test_dir_with_files(temp_dir_copy)

-    action = CmdRunAction(command='ls -alh /workspace')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'test_dir' in obs.content
-    assert 'file1.txt' not in obs.content
-    assert 'file2.txt' not in obs.content
+        runtime.copy_to(temp_dir_copy, sandbox_dir, recursive=True)

-    action = CmdRunAction(command='ls -alh /workspace/test_dir')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'file1.txt' in obs.content
-    assert 'file2.txt' in obs.content
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
+        assert obs.exit_code == 0
+        assert 'test_dir' in obs.content
+        assert 'file1.txt' not in obs.content
+        assert 'file2.txt' not in obs.content

-    action = CmdRunAction(command='cat /workspace/test_dir/file1.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'File 1 content' in obs.content
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}/test_dir')
+        assert obs.exit_code == 0
+        assert 'file1.txt' in obs.content
+        assert 'file2.txt' in obs.content

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, f'cat {sandbox_dir}/test_dir/file1.txt')
+        assert obs.exit_code == 0
+        assert 'File 1 content' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_copy_to_non_existent_directory(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
-
-    with tempfile.TemporaryDirectory() as host_temp_dir:
-        _create_test_file(host_temp_dir)
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)
+        _create_test_file(temp_dir)
        runtime.copy_to(
-            os.path.join(host_temp_dir, 'test_file.txt'), '/workspace/new_dir'
+            os.path.join(temp_dir, 'test_file.txt'), f'{sandbox_dir}/new_dir'
        )

-    action = CmdRunAction(command='cat /workspace/new_dir/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'Hello, World!' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, f'cat {sandbox_dir}/new_dir/test_file.txt')
+        assert obs.exit_code == 0
+        assert 'Hello, World!' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_overwrite_existing_file(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)

-    # touch a file in /workspace
-    action = CmdRunAction(command='touch /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
+        assert obs.exit_code == 0

-    action = CmdRunAction(command='cat /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'Hello, World!' not in obs.content
+        obs = _run_cmd_action(runtime, f'touch {sandbox_dir}/test_file.txt')
+        assert obs.exit_code == 0

-    with tempfile.TemporaryDirectory() as host_temp_dir:
-        _create_test_file(host_temp_dir)
-        runtime.copy_to(os.path.join(host_temp_dir, 'test_file.txt'), '/workspace')
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
+        assert obs.exit_code == 0

-    action = CmdRunAction(command='cat /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'Hello, World!' in obs.content
+        obs = _run_cmd_action(runtime, f'cat {sandbox_dir}/test_file.txt')
+        assert obs.exit_code == 0
+        assert 'Hello, World!' not in obs.content

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        _create_test_file(temp_dir)
+        runtime.copy_to(os.path.join(temp_dir, 'test_file.txt'), sandbox_dir)
+
+        obs = _run_cmd_action(runtime, f'cat {sandbox_dir}/test_file.txt')
+        assert obs.exit_code == 0
+        assert 'Hello, World!' in obs.content
+    finally:
+        _close_test_runtime(runtime)


 def test_copy_non_existent_file(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)
+        with pytest.raises(FileNotFoundError):
+            runtime.copy_to(
+                os.path.join(temp_dir, 'non_existent_file.txt'),
+                f'{sandbox_dir}/should_not_exist.txt',
+            )

-    with pytest.raises(FileNotFoundError):
-        runtime.copy_to(
-            os.path.join(temp_dir, 'non_existent_file.txt'),
-            '/workspace/should_not_exist.txt',
-        )
-
-    action = CmdRunAction(command='ls /workspace/should_not_exist.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code != 0  # File should not exist
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, f'ls {sandbox_dir}/should_not_exist.txt')
+        assert obs.exit_code != 0  # File should not exist
+    finally:
+        _close_test_runtime(runtime)


 def test_keep_prompt(box_class, temp_dir):
@ -500,27 +386,26 @@ def test_keep_prompt(box_class, temp_dir):
        box_class=box_class,
        run_as_openhands=False,
    )
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)

-    action = CmdRunAction(command='touch /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'root@' in obs.content
+        obs = _run_cmd_action(runtime, f'touch {sandbox_dir}/test_file.txt')
+        assert obs.exit_code == 0
+        assert 'root@' in obs.content

-    action = CmdRunAction(command='cat /workspace/test_file.txt', keep_prompt=False)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'root@' not in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(
+            runtime, f'cat {sandbox_dir}/test_file.txt', keep_prompt=False
+        )
+        assert obs.exit_code == 0
+        assert 'root@' not in obs.content
+    finally:
+        _close_test_runtime(runtime)


+@pytest.mark.skipif(
+    str(TEST_IN_CI).lower() != 'true',
+    reason='This test is not working in WSL (file ownership)',
+)
 def test_git_operation(box_class):
    # do not mount workspace, since workspace mount by tests will be owned by root
    # while the user_id we get via os.getuid() is different from root
@ -531,69 +416,43 @@ def test_git_operation(box_class):
        # Need to use non-root user to expose issues
        run_as_openhands=True,
    )
-
    # this will happen if permission of runtime is not properly configured
    # fatal: detected dubious ownership in repository at '/workspace'
+    try:
+        # check the ownership of the current directory
+        obs = _run_cmd_action(runtime, 'ls -alh .')
+        assert obs.exit_code == 0
+        # drwx--S--- 2 openhands root   64 Aug  7 23:32 .
+        # drwxr-xr-x 1 root      root 4.0K Aug  7 23:33 ..
+        for line in obs.content.split('\r\n'):
+            if ' ..' in line:
+                # parent directory should be owned by root
+                assert 'root' in line
+                assert 'openhands' not in line
+            elif ' .' in line:
+                # current directory should be owned by openhands
+                # and its group should be root
+                assert 'openhands' in line
+                assert 'root' in line

-    # check the ownership of the current directory
-    action = CmdRunAction(command='ls -alh .')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    # drwx--S--- 2 openhands root   64 Aug  7 23:32 .
-    # drwxr-xr-x 1 root      root 4.0K Aug  7 23:33 ..
-    for line in obs.content.split('\r\n'):
-        if ' ..' in line:
-            # parent directory should be owned by root
-            assert 'root' in line
-            assert 'openhands' not in line
-        elif ' .' in line:
-            # current directory should be owned by openhands
-            # and its group should be root
-            assert 'openhands' in line
-            assert 'root' in line
+        # make sure all git operations are allowed
+        obs = _run_cmd_action(runtime, 'git init')
+        assert obs.exit_code == 0

-    # make sure all git operations are allowed
-    action = CmdRunAction(command='git init')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        # create a file
+        obs = _run_cmd_action(runtime, 'echo "hello" > test_file.txt')
+        assert obs.exit_code == 0

-    # create a file
-    action = CmdRunAction(command='echo "hello" > test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        # git add
+        obs = _run_cmd_action(runtime, 'git add test_file.txt')
+        assert obs.exit_code == 0

-    # git add
-    action = CmdRunAction(command='git add test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        # git diff
+        obs = _run_cmd_action(runtime, 'git diff')
+        assert obs.exit_code == 0

-    # git diff
-    action = CmdRunAction(command='git diff')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-
-    # git commit
-    action = CmdRunAction(command='git commit -m "test commit"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        # git commit
+        obs = _run_cmd_action(runtime, 'git commit -m "test commit"')
+        assert obs.exit_code == 0
+    finally:
+        _close_test_runtime(runtime)
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@ -1,9 +1,8 @@
 """Browsing-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""

 import json
-import time

-from conftest import _load_runtime
+from conftest import _close_test_runtime, _load_runtime

 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
@ -66,8 +65,7 @@ def test_simple_browse(temp_dir, box_class, run_as_openhands):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)


 def test_browsergym_eval_env(box_class, temp_dir):
@ -111,5 +109,4 @@ def test_browsergym_eval_env(box_class, temp_dir):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert json.loads(obs.content) == [0.0]

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
--- a/tests/runtime/test_env_vars.py
+++ b/tests/runtime/test_env_vars.py
@ -1,10 +1,9 @@
 """Env vars related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""

 import os
-import time
 from unittest.mock import patch

-from conftest import _load_runtime
+from conftest import _close_test_runtime, _load_runtime

 from openhands.events.action import CmdRunAction
 from openhands.events.observation import CmdOutputObservation
@ -30,8 +29,7 @@ def test_env_vars_os_environ(temp_dir, box_class, run_as_openhands):
            obs.content.strip().split('\n\r')[0].strip() == 'BAZ'
        ), f'Output: [{obs.content}] for {box_class}'

-        runtime.close(rm_all_containers=False)
-        time.sleep(1)
+        _close_test_runtime(runtime)


 def test_env_vars_runtime_operations(temp_dir, box_class):
@ -66,5 +64,4 @@ def test_env_vars_runtime_operations(temp_dir, box_class):
        and obs.content.strip().split('\r\n')[0].strip() == 'new_value'
    )

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
--- a/tests/runtime/test_images.py
+++ b/tests/runtime/test_images.py
@ -1,9 +1,7 @@
 """Image-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""

-import time
-
 import pytest
-from conftest import _load_runtime
+from conftest import _close_test_runtime, _load_runtime

 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import CmdRunAction
@ -17,7 +15,6 @@ def test_bash_python_version(temp_dir, box_class, base_container_image):
    """Make sure Python is available in bash."""
    if base_container_image not in [
        'python:3.11-bookworm',
-        'nikolaik/python-nodejs:python3.11-nodejs22',
    ]:
        pytest.skip('This test is only for python-related images')

@ -45,15 +42,13 @@ def test_bash_python_version(temp_dir, box_class, base_container_image):
    assert obs.exit_code == 0
    assert 'pip' in obs.content  # Check that pip is available

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)


 def test_nodejs_22_version(temp_dir, box_class, base_container_image):
    """Make sure Node.js is available in bash."""
    if base_container_image not in [
        'node:22-bookworm',
-        'nikolaik/python-nodejs:python3.11-nodejs22',
    ]:
        pytest.skip('This test is only for nodejs-related images')

@ -68,8 +63,7 @@ def test_nodejs_22_version(temp_dir, box_class, base_container_image):
    assert obs.exit_code == 0
    assert 'v22' in obs.content  # Check for specific version

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)


 def test_go_version(temp_dir, box_class, base_container_image):
@ -90,5 +84,4 @@ def test_go_version(temp_dir, box_class, base_container_image):
    assert obs.exit_code == 0
    assert 'go1.23' in obs.content  # Check for specific version

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
--- a/tests/runtime/test_ipython.py
+++ b/tests/runtime/test_ipython.py
@ -1,8 +1,12 @@
 """Test the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""

-import time
-
-from conftest import _load_runtime
+import pytest
+from conftest import (
+    TEST_IN_CI,
+    _close_test_runtime,
+    _get_sandbox_folder,
+    _load_runtime,
+)

 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
@ -28,6 +32,8 @@ from openhands.runtime.client.runtime import EventStreamRuntime
 def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)

+    sandbox_dir = _get_sandbox_folder(runtime)
+
    # Test run command
    action_cmd = CmdRunAction(command='ls -l')
    logger.info(action_cmd, extra={'msg_type': 'ACTION'})
@ -48,7 +54,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.content.strip() == (
        'Hello, `World`!\n'
-        '[Jupyter current working directory: /workspace]\n'
+        f'[Jupyter current working directory: {sandbox_dir}]\n'
        '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
    )

@ -69,7 +75,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):

    assert obs.content == ''
    # event stream runtime will always use absolute path
-    assert obs.path == '/workspace/hello.sh'
+    assert obs.path == f'{sandbox_dir}/hello.sh'

    # Test read file (file should exist)
    action_read = FileReadAction(path='hello.sh')
@ -81,7 +87,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    assert obs.content == 'echo "Hello, World!"\n'
-    assert obs.path == '/workspace/hello.sh'
+    assert obs.path == f'{sandbox_dir}/hello.sh'

    # clean up
    action = CmdRunAction(command='rm -rf hello.sh')
@ -90,10 +96,13 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)


+@pytest.mark.skipif(
+    str(TEST_IN_CI).lower() != 'true',
+    reason='This test is not working in WSL (file ownership)',
+)
 def test_ipython_multi_user(temp_dir, box_class, run_as_openhands):
    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)

@ -111,7 +120,7 @@ def test_ipython_multi_user(temp_dir, box_class, run_as_openhands):
    else:
        assert 'root' in obs.content

-    # print pwd
+    # print the current working directory
    test_code = 'import os; print(os.getcwd())'
    action_ipython = IPythonRunCellAction(code=test_code)
    logger.info(action_ipython, extra={'msg_type': 'ACTION'})
@ -152,7 +161,6 @@ def test_ipython_multi_user(temp_dir, box_class, run_as_openhands):
    if run_as_openhands:
        # -rw-r--r-- 1 openhands root 13 Jul 28 03:53 test.txt
        assert 'openhands' in obs.content.split('\r\n')[0]
-        assert 'root' in obs.content.split('\r\n')[0]
    else:
        # -rw-r--r-- 1 root root 13 Jul 28 03:53 test.txt
        assert 'root' in obs.content.split('\r\n')[0]
@ -164,12 +172,12 @@ def test_ipython_multi_user(temp_dir, box_class, run_as_openhands):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)


 def test_ipython_simple(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
+    sandbox_dir = _get_sandbox_folder(runtime)

    # Test run ipython
    # get username
@ -183,20 +191,20 @@ def test_ipython_simple(temp_dir, box_class):
        obs.content.strip()
        == (
            '1\n'
-            '[Jupyter current working directory: /workspace]\n'
+            f'[Jupyter current working directory: {sandbox_dir}]\n'
            '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
        ).strip()
    )

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)


 def _test_ipython_agentskills_fileop_pwd_impl(
    runtime: EventStreamRuntime, enable_auto_lint: bool
 ):
+    sandbox_dir = _get_sandbox_folder(runtime)
    # remove everything in the sandbox directory
-    action = CmdRunAction(command='rm -rf /workspace/*')
+    action = CmdRunAction(command=f'rm -rf {sandbox_dir}/*')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -215,12 +223,12 @@ def _test_ipython_agentskills_fileop_pwd_impl(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert isinstance(obs, IPythonRunCellObservation)
    assert obs.content.replace('\r\n', '\n').strip().split('\n') == (
-        '[File: /workspace/hello.py (1 lines total)]\n'
+        f'[File: {sandbox_dir}/hello.py (1 lines total)]\n'
        '(this is the beginning of the file)\n'
        '1|\n'
        '(this is the end of the file)\n'
        '[File hello.py created.]\n'
-        '[Jupyter current working directory: /workspace]\n'
+        f'[Jupyter current working directory: {sandbox_dir}]\n'
        '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
    ).strip().split('\n')

@ -239,12 +247,12 @@ def _test_ipython_agentskills_fileop_pwd_impl(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert isinstance(obs, IPythonRunCellObservation)
    assert obs.content.replace('\r\n', '\n').strip().split('\n') == (
-        '[File: /workspace/test/hello.py (1 lines total)]\n'
+        f'[File: {sandbox_dir}/test/hello.py (1 lines total)]\n'
        '(this is the beginning of the file)\n'
        '1|\n'
        '(this is the end of the file)\n'
        '[File hello.py created.]\n'
-        '[Jupyter current working directory: /workspace/test]\n'
+        f'[Jupyter current working directory: {sandbox_dir}/test]\n'
        '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
    ).strip().split('\n')

@ -258,10 +266,10 @@ def _test_ipython_agentskills_fileop_pwd_impl(
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert isinstance(obs, IPythonRunCellObservation)
        assert obs.content.replace('\r\n', '\n').strip().split('\n') == (
-            """
+            f"""
 [Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]
 ERRORS:
-/workspace/test/hello.py:1:3: E999 IndentationError: unexpected indent
+{sandbox_dir}/test/hello.py:1:3: E999 IndentationError: unexpected indent
 [This is how your edit would have looked if applied]
 -------------------------------------------------
 (this is the beginning of the file)
@ -278,7 +286,7 @@ ERRORS:
 Your changes have NOT been applied. Please fix your edit command and try again.
 You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.
 DO NOT re-run the same failed edit command. Running it again will lead to the same error.
-[Jupyter current working directory: /workspace/test]
+[Jupyter current working directory: {sandbox_dir}/test]
 [Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]
 """
        ).strip().split('\n')
@ -292,39 +300,44 @@ DO NOT re-run the same failed edit command. Running it again will lead to the sa
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert isinstance(obs, IPythonRunCellObservation)
    assert obs.content.replace('\r\n', '\n').strip().split('\n') == (
-        """
-[File: /workspace/test/hello.py (1 lines total after edit)]
+        f"""
+[File: {sandbox_dir}/test/hello.py (1 lines total after edit)]
 (this is the beginning of the file)
 1|print("hello world")
 (this is the end of the file)
 [File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
-[Jupyter current working directory: /workspace/test]
+[Jupyter current working directory: {sandbox_dir}/test]
 [Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]
 """
    ).strip().split('\n')

-    action = CmdRunAction(command='rm -rf /workspace/*')
+    action = CmdRunAction(command=f'rm -rf {sandbox_dir}/*')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)

-
-def test_ipython_agentskills_fileop_pwd(
-    temp_dir, box_class, run_as_openhands, enable_auto_lint
+def test_ipython_agentskills_fileop_pwd_with_lint(
+    temp_dir, box_class, run_as_openhands
 ):
-    """Make sure that cd in bash also update the current working directory in ipython."""
-
    runtime = _load_runtime(
-        temp_dir, box_class, run_as_openhands, enable_auto_lint=enable_auto_lint
+        temp_dir, box_class, run_as_openhands, enable_auto_lint=True
    )
-    _test_ipython_agentskills_fileop_pwd_impl(runtime, enable_auto_lint)
+    _test_ipython_agentskills_fileop_pwd_impl(runtime, True)

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
+
+
+def test_ipython_agentskills_fileop_pwd_without_lint(
+    temp_dir, box_class, run_as_openhands
+):
+    runtime = _load_runtime(
+        temp_dir, box_class, run_as_openhands, enable_auto_lint=False
+    )
+    _test_ipython_agentskills_fileop_pwd_impl(runtime, False)
+
+    _close_test_runtime(runtime)


 def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class):
@ -392,13 +405,13 @@ def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class):
        '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
    ).strip().split('\n')

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)


 def test_ipython_package_install(temp_dir, box_class, run_as_openhands):
    """Make sure that a package missing from the sandbox can be imported in ipython once it has been installed."""
    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    sandbox_dir = _get_sandbox_folder(runtime)

    # It should error out since pymsgbox is not installed
    action = IPythonRunCellAction(code='import pymsgbox')
@ -424,9 +437,8 @@ def test_ipython_package_install(temp_dir, box_class, run_as_openhands):
    # import should not error out
    assert obs.content.strip() == (
        '[Code executed successfully with no output]\n'
-        '[Jupyter current working directory: /workspace]\n'
+        f'[Jupyter current working directory: {sandbox_dir}]\n'
        '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
    )

-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)