From 554636cf2aa03d16015db56ed583eff61cbb585e Mon Sep 17 00:00:00 2001
From: tobitege <10787084+tobitege@users.noreply.github.com>
Date: Sat, 14 Sep 2024 21:51:30 +0200
Subject: [PATCH] (fix) Fix runtime (RT) tests and split tests in 2 actions
 (openhands/root) (#3791)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
---
 .github/workflows/ghcr_runtime.yml            |  89 ++-
 openhands/core/config.py                      |  12 +-
 openhands/runtime/builder/docker.py           |   4 +
 openhands/runtime/client/client.py            | 113 ++-
 openhands/runtime/client/runtime.py           | 162 +++--
 openhands/runtime/plugins/jupyter/__init__.py |   2 +-
 openhands/runtime/remote/runtime.py           |   6 +-
 openhands/runtime/runtime.py                  |   1 -
 openhands/runtime/utils/__init__.py           |   7 +-
 openhands/runtime/utils/runtime_build.py      |  15 +-
 .../utils/runtime_templates/Dockerfile.j2     |   1 +
 openhands/runtime/utils/system.py             |  39 +-
 pytest.ini                                    |   1 +
 tests/integration/regenerate.sh               |  74 +-
 tests/integration/test_agent.py               |  35 +-
 tests/runtime/conftest.py                     | 198 +++--
 tests/runtime/test_bash.py                    | 679 +++++++-----------
 tests/runtime/test_browsing.py                |   9 +-
 tests/runtime/test_env_vars.py                |   9 +-
 tests/runtime/test_images.py                  |  15 +-
 tests/runtime/test_ipython.py                 |  98 +--
 21 files changed, 867 insertions(+), 702 deletions(-)

diff --git a/.github/workflows/ghcr_runtime.yml b/.github/workflows/ghcr_runtime.yml
index 055c5db31a..4efe5293c6 100644
--- a/.github/workflows/ghcr_runtime.yml
+++ b/.github/workflows/ghcr_runtime.yml
@@ -1,5 +1,5 @@
 # Workflow that builds, tests and then pushes the runtime docker images to the ghcr.io repository
-name: Build, Test and Publish Runtime Image
+name: Build, Test and Publish RT Image
 
 # Only run one workflow of the same group at a time.
 # There can be at most one running and one pending job in a concurrency group at any time.
@@ -104,9 +104,9 @@ jobs:
           name: runtime-${{ matrix.base_image.tag }}
           path: /tmp/runtime-${{ matrix.base_image.tag }}.tar
 
-  # Run unit tests with the EventStream runtime Docker images
-  test_runtime:
-    name: Test Runtime
+  # Run unit tests with the EventStream runtime Docker images as root
+  test_runtime_root:
+    name: RT Unit Tests (Root)
     needs: [ghcr_build_runtime]
     runs-on: ubuntu-latest
     strategy:
@@ -164,11 +164,84 @@ jobs:
           image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
           image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
 
+          SKIP_CONTAINER_LOGS=true \
           TEST_RUNTIME=eventstream \
           SANDBOX_USER_ID=$(id -u) \
           SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
           TEST_IN_CI=true \
-          poetry run pytest -n 2 --reruns 2 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          RUN_AS_OPENHANDS=false \
+          poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+  # Run unit tests with the EventStream runtime Docker images as openhands user
+  test_runtime_oh:
+    name: RT Unit Tests (openhands)
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime]
+    strategy:
+      matrix:
+        base_image: ['nikolaik']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
+      # Forked repos can't push to GHCR, so we need to download the image as an artifact
+      - name: Download runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        uses: actions/download-artifact@v4
+        with:
+          name: runtime-${{ matrix.base_image }}
+          path: /tmp
+      - name: Load runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        run: |
+          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      - name: Run runtime tests
+        run: |
+          # We install pytest-xdist in order to run tests across CPUs. However, tests start to fail when we run
+          # then across more than 2 CPUs for some reason
+          poetry run pip install pytest-xdist
+
+          # Install to be able to retry on failures for flaky tests
+          poetry run pip install pytest-rerunfailures
+
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
+          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
+
+          SKIP_CONTAINER_LOGS=true \
+          TEST_RUNTIME=eventstream \
+          SANDBOX_USER_ID=$(id -u) \
+          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
+          TEST_IN_CI=true \
+          RUN_AS_OPENHANDS=true \
+          poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:
@@ -176,7 +249,7 @@ jobs:
 
   # Run integration tests with the eventstream runtime Docker image
   runtime_integration_tests_on_linux:
-    name: Runtime Integration Tests on Linux
+    name: RT Integration Tests (Linux)
     runs-on: ubuntu-latest
     needs: [ghcr_build_runtime]
     strategy:
@@ -237,7 +310,7 @@ jobs:
     name: All Runtime Tests Passed
     if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
     runs-on: ubuntu-latest
-    needs: [test_runtime, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
     steps:
       - name: All tests passed
         run: echo "All runtime tests have passed successfully!"
@@ -246,7 +319,7 @@ jobs:
     name: All Runtime Tests Passed
     if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
     runs-on: ubuntu-latest
-    needs: [test_runtime, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
     steps:
       - name: Some tests failed
         run: |
diff --git a/openhands/core/config.py b/openhands/core/config.py
index 108523adc2..6d4dacb8af 100644
--- a/openhands/core/config.py
+++ b/openhands/core/config.py
@@ -507,7 +507,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
         if isinstance(value, dict):
             try:
                 if key is not None and key.lower() == 'agent':
-                    logger.openhands_logger.info(
+                    logger.openhands_logger.debug(
                         'Attempt to load default agent config from config toml'
                     )
                     non_dict_fields = {
@@ -517,13 +517,13 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
                     cfg.set_agent_config(agent_config, 'agent')
                     for nested_key, nested_value in value.items():
                         if isinstance(nested_value, dict):
-                            logger.openhands_logger.info(
+                            logger.openhands_logger.debug(
                                 f'Attempt to load group {nested_key} from config toml as agent config'
                             )
                             agent_config = AgentConfig(**nested_value)
                             cfg.set_agent_config(agent_config, nested_key)
                 elif key is not None and key.lower() == 'llm':
-                    logger.openhands_logger.info(
+                    logger.openhands_logger.debug(
                         'Attempt to load default LLM config from config toml'
                     )
                     non_dict_fields = {
@@ -533,7 +533,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
                     cfg.set_llm_config(llm_config, 'llm')
                     for nested_key, nested_value in value.items():
                         if isinstance(nested_value, dict):
-                            logger.openhands_logger.info(
+                            logger.openhands_logger.debug(
                                 f'Attempt to load group {nested_key} from config toml as llm config'
                             )
                             llm_config = LLMConfig(**nested_value)
@@ -584,10 +584,10 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
 
 def finalize_config(cfg: AppConfig):
     """More tweaks to the config after it's been loaded."""
+    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
     # Set workspace_mount_path if not set by the user
     if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
-        cfg.workspace_mount_path = os.path.abspath(cfg.workspace_base)
-    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
+        cfg.workspace_mount_path = cfg.workspace_base
 
     if cfg.workspace_mount_rewrite:  # and not config.workspace_mount_path:
         # TODO why do we need to check if workspace_mount_path is None?
diff --git a/openhands/runtime/builder/docker.py b/openhands/runtime/builder/docker.py
index 441a253fdd..3dcd7cb0a6 100644
--- a/openhands/runtime/builder/docker.py
+++ b/openhands/runtime/builder/docker.py
@@ -68,6 +68,10 @@ class DockerRuntimeBuilder(RuntimeBuilder):
         Returns:
             bool: Whether the Docker image exists in the registry or in the local store
         """
+        if not image_name:
+            logger.error(f'Invalid image name: `{image_name}`')
+            return False
+
         try:
             logger.info(f'Checking, if image exists locally:\n{image_name}')
             self.docker_client.images.get(image_name)
diff --git a/openhands/runtime/client/client.py b/openhands/runtime/client/client.py
index f26b739ca8..5d5f63993a 100644
--- a/openhands/runtime/client/client.py
+++ b/openhands/runtime/client/client.py
@@ -84,7 +84,6 @@ class RuntimeClient:
         self.lock = asyncio.Lock()
         self.plugins: dict[str, Plugin] = {}
         self.browser = BrowserEnv(browsergym_eval_env)
-        self._initial_pwd = work_dir
 
     @property
     def initial_pwd(self):
@@ -116,27 +115,85 @@ class RuntimeClient:
         logger.info('Runtime client initialized.')
 
     def _init_user(self, username: str, user_id: int) -> None:
-        """Create user if not exists."""
+        """Create working directory and user if not exists.
+        It performs the following steps effectively:
+        * Creates the Working Directory:
+            - Uses mkdir -p to create the directory.
+            - Sets ownership to username:root.
+            - Adjusts permissions to be readable and writable by group and others.
+        * User Verification and Creation:
+            - Checks if the user exists using id -u.
+            - If the user exists with the correct UID, it skips creation.
+            - If the UID differs, it logs a warning and updates self.user_id.
+            - If the user doesn't exist, it proceeds to create the user.
+        * Sudo Configuration:
+            - Appends %sudo ALL=(ALL) NOPASSWD:ALL to /etc/sudoers to grant
+              passwordless sudo access to the sudo group.
+            - Adds the user to the sudo group with the useradd command, handling
+              UID conflicts by incrementing the UID if necessary.
+        """
+
+        # First create the working directory, independent of the user
+        logger.info(f'Client working directory: {self.initial_pwd}')
+        command = f'umask 002; mkdir -p {self.initial_pwd}'
+        output = subprocess.run(command, shell=True, capture_output=True)
+        out_str = output.stdout.decode()
+
+        command = f'chown -R {username}:root {self.initial_pwd}'
+        output = subprocess.run(command, shell=True, capture_output=True)
+        out_str += output.stdout.decode()
+
+        command = f'chmod g+rw {self.initial_pwd}'
+        output = subprocess.run(command, shell=True, capture_output=True)
+        out_str += output.stdout.decode()
+        logger.debug(f'Created working directory. Output: [{out_str}]')
+
         # Skip root since it is already created
         if username == 'root':
             return
 
         # Check if the username already exists
+        existing_user_id = -1
         try:
-            subprocess.run(
+            result = subprocess.run(
                 f'id -u {username}', shell=True, check=True, capture_output=True
             )
-            logger.debug(f'User {username} already exists. Skipping creation.')
+            existing_user_id = int(result.stdout.decode().strip())
+
+            # The user ID already exists, skip setup
+            if existing_user_id == user_id:
+                logger.debug(
+                    f'User `{username}` already has the provided UID {user_id}. Skipping user setup.'
+                )
+            else:
+                logger.warning(
+                    f'User `{username}` already exists with UID {existing_user_id}. Skipping user setup.'
+                )
+                self.user_id = existing_user_id
             return
-        except subprocess.CalledProcessError:
-            pass  # User does not exist, continue with creation
+        except subprocess.CalledProcessError as e:
+            # Returncode 1 indicates, that the user does not exist yet
+            if e.returncode == 1:
+                logger.debug(
+                    f'User `{username}` does not exist. Proceeding with user creation.'
+                )
+            else:
+                logger.error(
+                    f'Error checking user `{username}`, skipping setup:\n{e}\n'
+                )
+                raise
 
         # Add sudoer
-        sudoer_line = r"echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers"
-        output = subprocess.run(sudoer_line, shell=True, capture_output=True)
-        if output.returncode != 0:
-            raise RuntimeError(f'Failed to add sudoer: {output.stderr.decode()}')
-        logger.debug(f'Added sudoer successfully. Output: [{output.stdout.decode()}]')
+        sudoer_line = r'%sudo ALL=(ALL) NOPASSWD:ALL\n'
+        sudoers_path = '/etc/sudoers.d/99_sudo'
+        if not Path(sudoers_path).exists():
+            with open(sudoers_path, 'w') as f:
+                f.write(sudoer_line)
+            output = subprocess.run(['chmod', '0440', sudoers_path])
+            if output.returncode != 0:
+                logger.error('Failed to chmod 99_sudo file!')
+            else:
+                logger.debug('Added sudoer successfully.')
 
         # Attempt to add the user, retrying with incremented user_id if necessary
         while True:
@@ -144,16 +201,10 @@ class RuntimeClient:
                 f'useradd -rm -d /home/{username} -s /bin/bash '
                 f'-g root -G sudo -u {user_id} {username}'
             )
-
-            if not os.path.exists(self.initial_pwd):
-                command += f' && mkdir -p {self.initial_pwd}'
-                command += f' && chown -R {username}:root {self.initial_pwd}'
-                command += f' && chmod g+s {self.initial_pwd}'
-
             output = subprocess.run(command, shell=True, capture_output=True)
             if output.returncode == 0:
                 logger.debug(
-                    f'Added user {username} successfully with UID {user_id}. Output: [{output.stdout.decode()}]'
+                    f'Added user `{username}` successfully with UID {user_id}. Output: [{output.stdout.decode()}]'
                 )
                 break
             elif f'UID {user_id} is not unique' in output.stderr.decode():
@@ -163,7 +214,7 @@ class RuntimeClient:
                 user_id += 1
             else:
                 raise RuntimeError(
-                    f'Failed to create user {username}: {output.stderr.decode()}'
+                    f'Failed to create user `{username}`! Output: [{output.stderr.decode()}]'
                 )
 
     def _init_bash_shell(self, work_dir: str, username: str) -> None:
@@ -181,8 +232,8 @@ class RuntimeClient:
 
         # This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed
         self.__bash_expect_regex = r'\[PEXPECT_BEGIN\]\s*(.*?)\s*([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+)\s*\[PEXPECT_END\]'
-
-        self.shell.sendline(f'export PS1="{self.__bash_PS1}"; export PS2=""')
+        # Set umask to allow group write permissions
+        self.shell.sendline(f'umask 002; export PS1="{self.__bash_PS1}"; export PS2=""')
         self.shell.expect(self.__bash_expect_regex)
 
         self.shell.sendline(
@@ -190,8 +241,11 @@ class RuntimeClient:
         )
         self.shell.expect(self.__bash_expect_regex)
         logger.debug(
-            f'Bash initialized. Working directory: {work_dir}. Output: {self.shell.before}'
+            f'Bash initialized. Working directory: {work_dir}. Output: [{self.shell.before}]'
         )
+        # Ensure the group has write permissions on the working directory
+        self.shell.sendline(f'chmod g+rw "{work_dir}"')
+        self.shell.expect(self.__bash_expect_regex)
 
     async def _init_bash_commands(self):
         logger.info(f'Initializing by running {len(INIT_COMMANDS)} bash commands...')
@@ -295,14 +349,14 @@ class RuntimeClient:
             bash_prompt = self._get_bash_prompt_and_update_pwd()
             if keep_prompt:
                 output += '\r\n' + bash_prompt
-            logger.debug(f'Command output: {output}')
+            # logger.debug(f'Command output:\n{output}')
         return output, exit_code
 
     async def run_action(self, action) -> Observation:
         action_type = action.action
-        logger.debug(f'Running action: {action}')
+        logger.debug(f'Running action:\n{action}')
         observation = await getattr(self, action_type)(action)
-        logger.debug(f'Action output: {observation}')
+        logger.debug(f'Action output:\n{observation}')
         return observation
 
     async def run(self, action: CmdRunAction) -> CmdOutputObservation:
@@ -355,10 +409,9 @@ class RuntimeClient:
             _jupyter_plugin: JupyterPlugin = self.plugins['jupyter']  # type: ignore
             # This is used to make AgentSkills in Jupyter aware of the
             # current working directory in Bash
-            if self.pwd != getattr(self, '_jupyter_pwd', None):
-                logger.debug(
-                    f"{self.pwd} != {getattr(self, '_jupyter_pwd', None)} -> reset Jupyter PWD"
-                )
+            jupyter_pwd = getattr(self, '_jupyter_pwd', None)
+            if self.pwd != jupyter_pwd:
+                logger.debug(f'{self.pwd} != {jupyter_pwd} -> reset Jupyter PWD')
                 reset_jupyter_pwd_code = f'import os; os.chdir("{self.pwd}")'
                 _aux_action = IPythonRunCellAction(code=reset_jupyter_pwd_code)
                 _reset_obs = await _jupyter_plugin.run(_aux_action)
@@ -450,7 +503,7 @@ class RuntimeClient:
                     os.chown(filepath, file_stat.st_uid, file_stat.st_gid)
                 else:
                     # set the new file permissions if the file is new
-                    os.chmod(filepath, 0o644)
+                    os.chmod(filepath, 0o664)
                     os.chown(filepath, self.user_id, self.user_id)
 
             except FileNotFoundError:
diff --git a/openhands/runtime/client/runtime.py b/openhands/runtime/client/runtime.py
index 5767632f37..73ce7d548d 100644
--- a/openhands/runtime/client/runtime.py
+++ b/openhands/runtime/client/runtime.py
@@ -38,8 +38,7 @@ from openhands.runtime.utils.runtime_build import build_runtime_image
 
 
 class LogBuffer:
-    """
-    Synchronous buffer for Docker container logs.
+    """Synchronous buffer for Docker container logs.
 
     This class provides a thread-safe way to collect, store, and retrieve logs
     from a Docker container. It uses a list to store log lines and provides methods
@@ -94,7 +93,7 @@ class LogBuffer:
             )
             self.close(timeout=5)
 
-    def close(self, timeout: float = 10.0):
+    def close(self, timeout: float = 5.0):
         self._stop_event.set()
         self.log_stream_thread.join(timeout)
 
@@ -102,6 +101,14 @@ class LogBuffer:
 class EventStreamRuntime(Runtime):
     """This runtime will subscribe the event stream.
     When receive an event, it will send the event to runtime-client which run inside the docker environment.
+    From the sid also an instance_id is generated in combination with a UID.
+
+    Args:
+        config (AppConfig): The application configuration.
+        event_stream (EventStream): The event stream to subscribe to.
+        sid (str, optional): The session ID. Defaults to 'default'.
+        plugins (list[PluginRequirement] | None, optional): List of plugin requirements. Defaults to None.
+        env_vars (dict[str, str] | None, optional): Environment variables to set. Defaults to None.
     """
 
     container_name_prefix = 'openhands-sandbox-'
@@ -115,13 +122,16 @@ class EventStreamRuntime(Runtime):
         env_vars: dict[str, str] | None = None,
     ):
         self.config = config
-        self._port = find_available_tcp_port()
-        self.api_url = f'http://{self.config.sandbox.api_hostname}:{self._port}'
+        self._host_port = 30000  # initial dummy value
+        self._container_port = 30001  # initial dummy value
+        self.api_url = (
+            f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
+        )
         self.session = requests.Session()
-
         self.instance_id = (
             sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
         )
+
         self.docker_client: docker.DockerClient = self._init_docker_client()
         self.base_container_image = self.config.sandbox.base_container_image
         self.runtime_container_image = self.config.sandbox.runtime_container_image
@@ -131,7 +141,7 @@ class EventStreamRuntime(Runtime):
         self.action_semaphore = threading.Semaphore(1)  # Ensure one action at a time
 
         self.runtime_builder = DockerRuntimeBuilder(self.docker_client)
-        logger.debug(f'EventStreamRuntime `{sid}`')
+        logger.debug(f'EventStreamRuntime `{self.instance_id}`')
 
         # Buffer for container logs
         self.log_buffer: LogBuffer | None = None
@@ -140,7 +150,9 @@ class EventStreamRuntime(Runtime):
             logger.info(
                 f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.runtime_extra_deps}'
             )
-
+        self.skip_container_logs = (
+            os.environ.get('SKIP_CONTAINER_LOGS', 'false').lower() == 'true'
+        )
         if self.runtime_container_image is None:
             if self.base_container_image is None:
                 raise ValueError(
@@ -152,19 +164,18 @@ class EventStreamRuntime(Runtime):
                 extra_deps=self.config.sandbox.runtime_extra_deps,
             )
         self.container = self._init_container(
-            self.sandbox_workspace_dir,
-            mount_dir=self.config.workspace_mount_path,
+            sandbox_workspace_dir=self.config.workspace_mount_path_in_sandbox,  # e.g. /workspace
+            mount_dir=self.config.workspace_mount_path,  # e.g. /opt/openhands/_test_workspace
             plugins=plugins,
         )
         # will initialize both the event stream and the env vars
         super().__init__(config, event_stream, sid, plugins, env_vars)
 
-        self._wait_until_alive()
-
         logger.info(
             f'Container initialized with plugins: {[plugin.name for plugin in self.plugins]}'
         )
         logger.info(f'Container initialized with env vars: {env_vars}')
+        time.sleep(1)
 
     @staticmethod
     def _init_docker_client() -> docker.DockerClient:
@@ -196,24 +207,48 @@ class EventStreamRuntime(Runtime):
                     f'--plugins {" ".join([plugin.name for plugin in plugins])} '
                 )
 
-            network_mode: str | None = None
-            port_mapping: dict[str, int] | None = None
-            if self.config.sandbox.use_host_network:
-                network_mode = 'host'
+            self._host_port = self._find_available_port()
+            self._container_port = (
+                self._host_port
+            )  # in future this might differ from host port
+            self.api_url = (
+                f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
+            )
+
+            use_host_network = self.config.sandbox.use_host_network
+            network_mode: str | None = 'host' if use_host_network else None
+            port_mapping: dict[str, list[dict[str, str]]] | None = (
+                None
+                if use_host_network
+                else {
+                    f'{self._container_port}/tcp': [{'HostPort': str(self._host_port)}]
+                }
+            )
+
+            if use_host_network:
                 logger.warn(
                     'Using host network mode. If you are using MacOS, please make sure you have the latest version of Docker Desktop and enabled host network feature: https://docs.docker.com/network/drivers/host/#docker-desktop'
                 )
-            else:
-                port_mapping = {f'{self._port}/tcp': self._port}
 
-            if mount_dir is not None:
+            # Combine environment variables
+            environment = {
+                'port': str(self._container_port),
+                'PYTHONUNBUFFERED': 1,
+            }
+            if self.config.debug:
+                environment['DEBUG'] = 'true'
+
+            logger.info(f'Workspace Base: {self.config.workspace_base}')
+            if mount_dir is not None and sandbox_workspace_dir is not None:
+                # e.g. result would be: {"/home/user/openhands/workspace": {'bind': "/workspace", 'mode': 'rw'}}
                 volumes = {mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}}
-                logger.info(f'Mount dir: {sandbox_workspace_dir}')
+                logger.info(f'Mount dir: {mount_dir}')
             else:
                 logger.warn(
-                    'Mount dir is not set, will not mount the workspace directory to the container.'
+                    'Warning: Mount dir is not set, will not mount the workspace directory to the container!\n'
                 )
                 volumes = None
+            logger.info(f'Sandbox workspace: {sandbox_workspace_dir}')
 
             if self.config.sandbox.browsergym_eval_env is not None:
                 browsergym_arg = (
@@ -225,9 +260,9 @@ class EventStreamRuntime(Runtime):
                 self.runtime_container_image,
                 command=(
                     f'/openhands/miniforge3/bin/mamba run --no-capture-output -n base '
-                    'PYTHONUNBUFFERED=1 poetry run '
-                    f'python -u -m openhands.runtime.client.client {self._port} '
-                    f'--working-dir {sandbox_workspace_dir} '
+                    f'poetry run '
+                    f'python -u -m openhands.runtime.client.client {self._container_port} '
+                    f'--working-dir "{sandbox_workspace_dir}" '
                     f'{plugin_arg}'
                     f'--username {"openhands" if self.config.run_as_openhands else "root"} '
                     f'--user-id {self.config.sandbox.user_id} '
@@ -235,24 +270,26 @@ class EventStreamRuntime(Runtime):
                 ),
                 network_mode=network_mode,
                 ports=port_mapping,
-                working_dir='/openhands/code/',
+                working_dir='/openhands/code/',  # do not change this!
                 name=self.container_name,
                 detach=True,
-                environment={'DEBUG': 'true'} if self.config.debug else None,
+                environment=environment,
                 volumes=volumes,
             )
             self.log_buffer = LogBuffer(container)
             logger.info(f'Container started. Server url: {self.api_url}')
             return container
         except Exception as e:
-            logger.error('Failed to start container')
+            logger.error(
+                f'Error: Instance {self.instance_id} FAILED to start container!\n'
+            )
             logger.exception(e)
             self.close(close_client=False)
             raise e
 
     @tenacity.retry(
         stop=tenacity.stop_after_attempt(10),
-        wait=tenacity.wait_exponential(multiplier=2, min=10, max=60),
+        wait=tenacity.wait_exponential(multiplier=2, min=1, max=20),
         reraise=(ConnectionRefusedError,),
     )
     def _wait_until_alive(self):
@@ -278,10 +315,11 @@ class EventStreamRuntime(Runtime):
             )
 
         if not self.log_buffer.client_ready:
+            time.sleep(1)
             attempts = 0
-            while not self.log_buffer.client_ready and attempts < 5:
+            while not self.log_buffer.client_ready and attempts < 4:
                 attempts += 1
-                time.sleep(1)
+                time.sleep(2)
                 logs = self.log_buffer.get_and_clear()
                 if logs:
                     formatted_logs = '\n'.join([f'    |{log}' for log in logs])
@@ -303,13 +341,8 @@ class EventStreamRuntime(Runtime):
             logger.error(msg)
             raise RuntimeError(msg)
 
-    @property
-    def sandbox_workspace_dir(self):
-        return self.config.workspace_mount_path_in_sandbox
-
     def close(self, close_client: bool = True, rm_all_containers: bool = True):
-        """
-        Closes the EventStreamRuntime and associated objects
+        """Closes the EventStreamRuntime and associated objects
 
         Parameters:
         - close_client (bool): Whether to close the DockerClient
@@ -322,23 +355,29 @@ class EventStreamRuntime(Runtime):
         if self.session:
             self.session.close()
 
-        containers = self.docker_client.containers.list(all=True)
-        for container in containers:
-            try:
-                # If the app doesn't shut down properly, it can leave runtime containers on the system. This ensures
-                # that all 'openhands-sandbox-' containers are removed as well.
-                if rm_all_containers and container.name.startswith(
-                    self.container_name_prefix
-                ):
-                    container.remove(force=True)
-                elif container.name == self.container_name:
-                    logs = container.logs(tail=1000).decode('utf-8')
-                    logger.debug(
-                        f'==== Container logs ====\n{logs}\n==== End of container logs ===='
-                    )
-                    container.remove(force=True)
-            except docker.errors.NotFound:
-                pass
+        try:
+            containers = self.docker_client.containers.list(all=True)
+            for container in containers:
+                try:
+                    # If the app doesn't shut down properly, it can leave runtime containers on the system. This ensures
+                    # that all 'openhands-sandbox-' containers are removed as well.
+                    if rm_all_containers and container.name.startswith(
+                        self.container_name_prefix
+                    ):
+                        container.remove(force=True)
+                    elif container.name == self.container_name:
+                        if not self.skip_container_logs:
+                            logs = container.logs(tail=1000).decode('utf-8')
+                            logger.debug(
+                                f'==== Container logs on close ====\n{logs}\n==== End of container logs ===='
+                            )
+                        container.remove(force=True)
+                except docker.errors.APIError:
+                    pass
+                except docker.errors.NotFound:
+                    pass
+        except docker.errors.NotFound:  # yes, this can happen!
+            pass
 
         if close_client:
             self.docker_client.close()
@@ -494,3 +533,20 @@ class EventStreamRuntime(Runtime):
             raise TimeoutError('List files operation timed out')
         except Exception as e:
             raise RuntimeError(f'List files operation failed: {str(e)}')
+
+    def _is_port_in_use_docker(self, port):
+        containers = self.docker_client.containers.list()
+        for container in containers:
+            container_ports = container.ports
+            if str(port) in str(container_ports):
+                return True
+        return False
+
+    def _find_available_port(self, max_attempts=5):
+        port = 39999
+        for _ in range(max_attempts):
+            port = find_available_tcp_port(30000, 39999)
+            if not self._is_port_in_use_docker(port):
+                return port
+        # If no port is found after max_attempts, return the last tried port
+        return port
diff --git a/openhands/runtime/plugins/jupyter/__init__.py b/openhands/runtime/plugins/jupyter/__init__.py
index 0feb5caef0..26eda35054 100644
--- a/openhands/runtime/plugins/jupyter/__init__.py
+++ b/openhands/runtime/plugins/jupyter/__init__.py
@@ -19,7 +19,7 @@ class JupyterPlugin(Plugin):
     name: str = 'jupyter'
 
     async def initialize(self, username: str, kernel_id: str = 'openhands-default'):
-        self.kernel_gateway_port = find_available_tcp_port()
+        self.kernel_gateway_port = find_available_tcp_port(40000, 49999)
         self.kernel_id = kernel_id
         self.gateway_process = subprocess.Popen(
             (
diff --git a/openhands/runtime/remote/runtime.py b/openhands/runtime/remote/runtime.py
index 6c3e755608..48862d28aa 100644
--- a/openhands/runtime/remote/runtime.py
+++ b/openhands/runtime/remote/runtime.py
@@ -142,7 +142,7 @@ class RemoteRuntime(Runtime):
                 f'/openhands/miniforge3/bin/mamba run --no-capture-output -n base '
                 'PYTHONUNBUFFERED=1 poetry run '
                 f'python -u -m openhands.runtime.client.client {self.port} '
-                f'--working-dir {self.sandbox_workspace_dir} '
+                f'--working-dir {self.config.workspace_mount_path_in_sandbox} '
                 f'{plugin_arg}'
                 f'--username {"openhands" if self.config.run_as_openhands else "root"} '
                 f'--user-id {self.config.sandbox.user_id} '
@@ -203,10 +203,6 @@ class RemoteRuntime(Runtime):
             logger.warning(msg)
             raise RuntimeError(msg)
 
-    @property
-    def sandbox_workspace_dir(self):
-        return self.config.workspace_mount_path_in_sandbox
-
     def close(self):
         if self.runtime_id:
             try:
diff --git a/openhands/runtime/runtime.py b/openhands/runtime/runtime.py
index 97b5464d49..ae54f4b177 100644
--- a/openhands/runtime/runtime.py
+++ b/openhands/runtime/runtime.py
@@ -67,7 +67,6 @@ class Runtime:
         self.config = copy.deepcopy(config)
         self.DEFAULT_ENV_VARS = _default_env_vars(config.sandbox)
         atexit.register(self.close)
-        logger.debug(f'Runtime `{sid}`')
 
         if self.DEFAULT_ENV_VARS:
             logger.debug(f'Adding default env vars: {self.DEFAULT_ENV_VARS}')
diff --git a/openhands/runtime/utils/__init__.py b/openhands/runtime/utils/__init__.py
index eca9436da6..71d9d4198c 100644
--- a/openhands/runtime/utils/__init__.py
+++ b/openhands/runtime/utils/__init__.py
@@ -1,4 +1,7 @@
 from openhands.runtime.utils.bash import split_bash_commands
-from openhands.runtime.utils.system import find_available_tcp_port
+from openhands.runtime.utils.system import (
+    display_number_matrix,
+    find_available_tcp_port,
+)
 
-__all__ = ['find_available_tcp_port', 'split_bash_commands']
+__all__ = ['display_number_matrix', 'find_available_tcp_port', 'split_bash_commands']
diff --git a/openhands/runtime/utils/runtime_build.py b/openhands/runtime/utils/runtime_build.py
index 2baa6732d0..83df63819f 100644
--- a/openhands/runtime/utils/runtime_build.py
+++ b/openhands/runtime/utils/runtime_build.py
@@ -55,7 +55,7 @@ def _put_source_code_to_dir(temp_dir: str):
         ' ', r'\ '
     )  # escape spaces in the project root
     result = subprocess.run(
-        f'python -m build -s -o {temp_dir} {_cleaned_project_root}',
+        f'python -m build -s -o "{temp_dir}" {_cleaned_project_root}',
         shell=True,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
@@ -142,13 +142,14 @@ def prep_docker_build_folder(
         skip_init=skip_init,
         extra_deps=extra_deps,
     )
-    logger.debug(
-        (
-            f'===== Dockerfile content start =====\n'
-            f'{dockerfile_content}\n'
-            f'===== Dockerfile content end ====='
+    if os.getenv('SKIP_CONTAINER_LOGS', 'false') != 'true':
+        logger.debug(
+            (
+                f'===== Dockerfile content start =====\n'
+                f'{dockerfile_content}\n'
+                f'===== Dockerfile content end ====='
+            )
         )
-    )
     with open(os.path.join(dir_path, 'Dockerfile'), 'w') as file:
         file.write(dockerfile_content)
 
diff --git a/openhands/runtime/utils/runtime_templates/Dockerfile.j2 b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
index 519760a2ef..1f49b9eb7c 100644
--- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
@@ -60,6 +60,7 @@ RUN cd /openhands/code && \
     {{ extra_deps }} {% if extra_deps %} && {% endif %} \
     /openhands/miniforge3/bin/mamba run -n base poetry cache clear --all . && \
     {% if not skip_init %}chmod -R g+rws /openhands/poetry && {% endif %} \
+    mkdir -p /openhands/workspace && chmod -R g+rws,o+rw /openhands/workspace && \
     apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     /openhands/miniforge3/bin/mamba clean --all
 
diff --git a/openhands/runtime/utils/system.py b/openhands/runtime/utils/system.py
index 09c8408add..921a8bf94b 100644
--- a/openhands/runtime/utils/system.py
+++ b/openhands/runtime/utils/system.py
@@ -1,17 +1,34 @@
+import random
 import socket
+import time
 
 
-def find_available_tcp_port() -> int:
-    """Find an available TCP port, return -1 if none available."""
-    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    try:
-        sock.bind(('localhost', 0))
-        port = sock.getsockname()[1]
-        return port
-    except Exception:
-        return -1
-    finally:
-        sock.close()
+def find_available_tcp_port(min_port=30000, max_port=39999, max_attempts=10) -> int:
+    """Find an available TCP port in a specified range.
+
+    Args:
+        min_port (int): The lower bound of the port range (default: 30000)
+        max_port (int): The upper bound of the port range (default: 39999)
+        max_attempts (int): Maximum number of attempts to find an available port (default: 10)
+
+    Returns:
+        int: An available port number, or -1 if none found after max_attempts
+    """
+    rng = random.SystemRandom()
+    ports = list(range(min_port, max_port + 1))
+    rng.shuffle(ports)
+
+    for port in ports[:max_attempts]:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        try:
+            sock.bind(('localhost', port))
+            return port
+        except OSError:
+            time.sleep(0.1)  # Short delay to further reduce chance of collisions
+            continue
+        finally:
+            sock.close()
+    return -1
 
 
 def display_number_matrix(number: int) -> str | None:
diff --git a/pytest.ini b/pytest.ini
index 1ceab94295..7a222b2d34 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,3 @@
 [pytest]
 addopts = -p no:warnings
+asyncio_default_fixture_loop_scope = function
diff --git a/tests/integration/regenerate.sh b/tests/integration/regenerate.sh
index 25bfad49f4..e88a203476 100755
--- a/tests/integration/regenerate.sh
+++ b/tests/integration/regenerate.sh
@@ -5,6 +5,8 @@ set -eo pipefail
 ##           CONSTANTS AND ENVIRONMENTAL VARIABLES          ##
 ##############################################################
 
+echo -e "\n\n============================================================"
+
 # unset environmental variables that might disturb testing
 unset OPENAI_API_KEY
 unset SANDBOX_ENV_OPENAI_API_KEY
@@ -16,7 +18,7 @@ get_script_dir() {
     local source="${BASH_SOURCE[0]}"
     while [ -h "$source" ]; do
         local dir="$( cd -P "$( dirname "$source" )" && pwd )"
-        source="$(readlink "$source")"
+        source="$(readlink -f "$source" 2>/dev/null || echo "$source")"
         [[ $source != /* ]] && source="$dir/$source"
     done
     echo "$( cd -P "$( dirname "$source" )" && pwd )"
@@ -27,9 +29,6 @@ TMP_FILE="${TMP_FILE:-tmp.log}"
 if [ -z "$WORKSPACE_BASE" ]; then
   WORKSPACE_BASE=$(pwd)
 fi
-if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
-  WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE
-fi
 
 DEBUG=true  # needed for llm logging to create mock files!
 
@@ -39,7 +38,7 @@ fi
 
 export SCRIPT_DIR=$(get_script_dir)
 export PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..")
-export LOG_DIR=$PROJECT_ROOT/logs
+export LOG_DIR="$PROJECT_ROOT/logs"
 echo "Current working directory: $(pwd)"
 echo "SCRIPT_DIR: $SCRIPT_DIR"
 echo "PROJECT_ROOT: $PROJECT_ROOT"
@@ -47,22 +46,29 @@ echo "LOG_DIR: $LOG_DIR"
 echo "LOG_TO_FILE: $LOG_TO_FILE"
 
 WORKSPACE_BASE=${WORKSPACE_BASE}/_test_workspace
-mkdir -p $WORKSPACE_BASE
-chmod -R 777 $WORKSPACE_BASE
-WORKSPACE_BASE=$(realpath $WORKSPACE_BASE)
+mkdir -p "$WORKSPACE_BASE"
+chmod -R 777 "$WORKSPACE_BASE"
+WORKSPACE_BASE=$(realpath "$WORKSPACE_BASE")
 
-WORKSPACE_MOUNT_PATH=${WORKSPACE_MOUNT_PATH}/_test_workspace
-mkdir -p $WORKSPACE_MOUNT_PATH
-chmod -R 777 $WORKSPACE_MOUNT_PATH
-WORKSPACE_MOUNT_PATH=$(realpath $WORKSPACE_MOUNT_PATH)
+if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
+  WORKSPACE_MOUNT_PATH="$WORKSPACE_BASE"
+else
+  WORKSPACE_MOUNT_PATH="${WORKSPACE_MOUNT_PATH}/_test_workspace"
+  mkdir -p "$WORKSPACE_MOUNT_PATH"
+  chmod -R 755 "$WORKSPACE_MOUNT_PATH"
+  WORKSPACE_MOUNT_PATH=$(realpath "$WORKSPACE_MOUNT_PATH")
+fi
+
+WORKSPACE_MOUNT_PATH_IN_SANDBOX="${WORKSPACE_MOUNT_PATH_IN_SANDBOX:-/workspace}"
 
 echo "WORKSPACE_BASE: $WORKSPACE_BASE"
 echo "WORKSPACE_MOUNT_PATH: $WORKSPACE_MOUNT_PATH"
+echo "WORKSPACE_MOUNT_PATH_IN_SANDBOX: $WORKSPACE_MOUNT_PATH_IN_SANDBOX"
 
 # Ensure we're in the correct directory
 cd "$PROJECT_ROOT" || exit 1
 
-mkdir -p $WORKSPACE_BASE
+mkdir -p "$WORKSPACE_BASE"
 
 # use environmental variable if exists
 TEST_RUNTIME="${TEST_RUNTIME:-eventstream}"
@@ -178,7 +184,7 @@ cleanup() {
     kill $HTTP_SERVER_PID || true
     unset HTTP_SERVER_PID
   fi
-  [ -f $TMP_FILE ] && rm $TMP_FILE
+  [ -f "$TMP_FILE" ] && rm "$TMP_FILE"
   echo "Cleanup done!"
 }
 
@@ -200,14 +206,14 @@ regenerate_without_llm() {
       PROJECT_ROOT="$PROJECT_ROOT" \
       WORKSPACE_BASE="$WORKSPACE_BASE" \
       WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
-      MAX_ITERATIONS=$MAX_ITERATIONS \
+      MAX_ITERATIONS="$MAX_ITERATIONS" \
       FORCE_APPLY_PROMPTS=true \
-      DEFAULT_AGENT=$agent \
+      DEFAULT_AGENT="$agent" \
       TEST_RUNTIME="$TEST_RUNTIME" \
-      LLM=$LLM \
-      DEBUG=$DEBUG \
-      LOG_TO_FILE=$LOG_TO_FILE \
-      FORCE_REGENERATE=$FORCE_REGENERATE \
+      LLM="$LLM" \
+      DEBUG="$DEBUG" \
+      LOG_TO_FILE="$LOG_TO_FILE" \
+      FORCE_REGENERATE="$FORCE_REGENERATE" \
       SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
       poetry run pytest -s "$SCRIPT_DIR/test_agent.py::$test_name"
   set +x
@@ -216,12 +222,12 @@ regenerate_without_llm() {
 regenerate_with_llm() {
   cd "$PROJECT_ROOT"
 
-  rm -rf $WORKSPACE_BASE/*
+  rm -rf "$WORKSPACE_BASE/*"
   if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
-    cp -r "$SCRIPT_DIR/workspace/$test_name"/* $WORKSPACE_BASE
+    cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
   fi
 
-  rm -rf logs
+  rm -rf "$LOG_DIR"
   rm -rf "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/*"
   # set -x to print the command being executed
   set -x
@@ -233,12 +239,12 @@ regenerate_with_llm() {
       DEFAULT_AGENT=$agent \
       RUNTIME="$TEST_RUNTIME" \
       SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
-      LLM=$LLM \
-      DEBUG=$DEBUG \
-      LOG_TO_FILE=$LOG_TO_FILE \
-      FORCE_REGENERATE=$FORCE_REGENERATE \
+      LLM="$LLM" \
+      DEBUG="$DEBUG" \
+      LOG_TO_FILE="$LOG_TO_FILE" \
+      FORCE_REGENERATE="$FORCE_REGENERATE" \
       poetry run python "$PROJECT_ROOT/openhands/core/main.py" \
-      -i $MAX_ITERATIONS \
+      -i "$MAX_ITERATIONS" \
       -t "$task Do not ask me for confirmation at any point." \
       -c $agent
   set +x
@@ -256,8 +262,8 @@ if [ "$num_of_tests" -ne "${#test_names[@]}" ]; then
   exit 1
 fi
 
-rm -rf logs
-rm -rf $WORKSPACE_BASE/*
+rm -rf "$LOG_DIR"
+rm -rf "$WORKSPACE_BASE/*"
 for ((i = 0; i < num_of_tests; i++)); do
   task=${tasks[i]}
   test_name=${test_names[i]}
@@ -286,9 +292,9 @@ for ((i = 0; i < num_of_tests; i++)); do
     cd "$PROJECT_ROOT/tests"
     cd "$PROJECT_ROOT"
 
-    rm -rf $WORKSPACE_BASE/*
+    rm -rf "$WORKSPACE_BASE/*"
     if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
-      cp -r "$SCRIPT_DIR/workspace/$test_name"/* $WORKSPACE_BASE
+      cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
     fi
 
     if [ "$TEST_ONLY" ]; then
@@ -395,7 +401,7 @@ for ((i = 0; i < num_of_tests; i++)); do
   fi
 done
 
-rm -rf logs
-rm -rf $WORKSPACE_BASE
+rm -rf "$LOG_DIR"
+rm -rf "$WORKSPACE_BASE"
 echo "Done!"
 cd "$PROJECT_ROOT"
diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py
index 9199943fcb..c89a8e8b7b 100644
--- a/tests/integration/test_agent.py
+++ b/tests/integration/test_agent.py
@@ -6,7 +6,7 @@ import subprocess
 import pytest
 
 from openhands.controller.state.state import State
-from openhands.core.config import AppConfig, SandboxConfig, load_from_env
+from openhands.core.config import load_app_config
 from openhands.core.main import run_controller
 from openhands.core.schema import AgentState
 from openhands.events.action import (
@@ -21,36 +21,23 @@ TEST_RUNTIME = os.getenv('TEST_RUNTIME')
 assert TEST_RUNTIME in ['eventstream', 'remote']
 _ = get_runtime_cls(TEST_RUNTIME)  # make sure it does not raise an error
 
-CONFIG = AppConfig(
-    max_iterations=int(os.getenv('MAX_ITERATIONS', 20)),
-    max_budget_per_task=int(os.getenv('MAX_BUDGET_PER_TASK', 15)),
-    runtime=TEST_RUNTIME,
-    default_agent=os.getenv('DEFAULT_AGENT'),
-    workspace_base=os.getenv('WORKSPACE_BASE'),
-    workspace_mount_path=os.getenv('WORKSPACE_MOUNT_PATH'),
-    sandbox=SandboxConfig(
-        use_host_network=True,
-    ),
+CONFIG = load_app_config()
+CONFIG.max_iterations = int(os.getenv('MAX_ITERATIONS', 20))
+CONFIG.max_budget_per_task = int(os.getenv('MAX_BUDGET_PER_TASK', 15))
+CONFIG.runtime = TEST_RUNTIME
+CONFIG.default_agent = os.getenv('DEFAULT_AGENT')
+CONFIG.workspace_base = os.getenv('WORKSPACE_BASE')
+CONFIG.workspace_mount_path = os.getenv('WORKSPACE_MOUNT_PATH')
+CONFIG.workspace_mount_path_in_sandbox = os.getenv(
+    'WORKSPACE_MOUNT_PATH_IN_SANDBOX', '/workspace'
 )
-load_from_env(CONFIG, os.environ)
+CONFIG.sandbox.use_host_network = True
 
 print('\nPaths used:')
 print(f'workspace_base: {CONFIG.workspace_base}')
 print(f'workspace_mount_path: {CONFIG.workspace_mount_path}')
 print(f'workspace_mount_path_in_sandbox: {CONFIG.workspace_mount_path_in_sandbox}')
 
-# Check if running in WSL environment
-if 'WSL_DISTRO_NAME' in os.environ:
-    if (
-        CONFIG.workspace_base
-        and CONFIG.workspace_mount_path
-        and CONFIG.workspace_base != CONFIG.workspace_mount_path
-    ):
-        print(
-            '\n**********\nWARNING: if WORKSPACE_MOUNT_PATH is set differently to'
-            '\nWORKSPACE_BASE some file operation tests may fail!\n**********\n'
-        )
-
 
 def get_number_of_prompts(test_name: str):
     mock_dir = os.path.join(
diff --git a/tests/runtime/conftest.py b/tests/runtime/conftest.py
index 6cec0f9c4b..6ce93256e4 100644
--- a/tests/runtime/conftest.py
+++ b/tests/runtime/conftest.py
@@ -1,11 +1,15 @@
 import os
 import random
+import shutil
+import stat
 import time
+from pathlib import Path
 
 import pytest
 from pytest import TempPathFactory
 
-from openhands.core.config import AppConfig, SandboxConfig, load_from_env
+from openhands.core.config import load_app_config
+from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventStream
 from openhands.runtime.client.runtime import EventStreamRuntime
 from openhands.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
@@ -13,19 +17,86 @@ from openhands.runtime.remote.runtime import RemoteRuntime
 from openhands.runtime.runtime import Runtime
 from openhands.storage import get_file_store
 
+TEST_IN_CI = os.getenv('TEST_IN_CI', 'False').lower() in ['true', '1', 'yes']
+TEST_RUNTIME = os.getenv('TEST_RUNTIME', 'eventstream').lower()
+RUN_AS_OPENHANDS = os.getenv('RUN_AS_OPENHANDS', 'True').lower() in ['true', '1', 'yes']
+test_mount_path = ''
+project_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
+sandbox_test_folder = '/openhands/workspace'
+
+
+def _get_runtime_sid(runtime: Runtime):
+    logger.debug(f'\nruntime.sid: {runtime.sid}')
+    return runtime.sid
+
+
+def _get_host_folder(runtime: Runtime):
+    return runtime.config.workspace_mount_path
+
+
+def _get_sandbox_folder(runtime: Runtime):
+    sid = _get_runtime_sid(runtime)
+    if sid:
+        return Path(os.path.join(sandbox_test_folder, sid))
+    return None
+
+
+def _remove_folder(folder: str) -> bool:
+    success = False
+    if folder and os.path.isdir(folder):
+        try:
+            os.rmdir(folder)
+            success = True
+        except OSError:
+            try:
+                shutil.rmtree(folder)
+                success = True
+            except OSError:
+                pass
+        logger.debug(f'\nCleanup: `{folder}`: ' + ('[OK]' if success else '[FAILED]'))
+    return success
+
+
+def _close_test_runtime(runtime: Runtime):
+    if isinstance(runtime, EventStreamRuntime):
+        runtime.close(rm_all_containers=False)
+    else:
+        runtime.close()
+    time.sleep(1)
+
+
+def _reset_pwd():
+    global project_dir
+    # Try to change back to project directory
+    try:
+        os.chdir(project_dir)
+        logger.info(f'Changed back to project directory `{project_dir}')
+    except Exception as e:
+        logger.error(f'Failed to change back to project directory: {e}')
+
+
+# *****************************************************************************
+# *****************************************************************************
+
 
 @pytest.fixture(autouse=True)
 def print_method_name(request):
-    print('\n########################################################################')
+    print(
+        '\n\n########################################################################'
+    )
     print(f'Running test: {request.node.name}')
-    print('########################################################################')
-    yield
+    print(
+        '########################################################################\n\n'
+    )
 
 
 @pytest.fixture
-def temp_dir(tmp_path_factory: TempPathFactory) -> str:
-    """
-    Creates a unique temporary directory
+def temp_dir(tmp_path_factory: TempPathFactory, request) -> str:
+    """Creates a unique temporary directory.
+    Upon finalization, the temporary directory and its content is removed.
+    The cleanup function is also called upon KeyboardInterrupt.
 
     Parameters:
     - tmp_path_factory (TempPathFactory): A TempPathFactory class
@@ -33,15 +104,23 @@ def temp_dir(tmp_path_factory: TempPathFactory) -> str:
     Returns:
     - str: The temporary directory path that was created
     """
-
-    unique_suffix = random.randint(10000, 99999)
-    temp_directory = tmp_path_factory.mktemp(
-        f'test_runtime_{unique_suffix}', numbered=False
+    temp_dir = tmp_path_factory.mktemp(
+        'rt_' + str(random.randint(100000, 999999)), numbered=False
     )
-    return str(temp_directory)
 
+    logger.info(f'\n*** {request.node.name}\n>> temp folder: {temp_dir}\n')
 
-TEST_RUNTIME = os.getenv('TEST_RUNTIME', 'eventstream')
+    # Set permissions to ensure the directory is writable and deletable
+    os.chmod(temp_dir, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)  # 0777 permissions
+
+    def cleanup():
+        global project_dir
+        os.chdir(project_dir)
+        _remove_folder(temp_dir)
+
+    request.addfinalizer(cleanup)
+
+    return str(temp_dir)
 
 
 # Depending on TEST_RUNTIME, feed the appropriate box class(es) to the test.
@@ -55,28 +134,47 @@ def get_box_classes():
         raise ValueError(f'Invalid runtime: {runtime}')
 
 
+def get_run_as_openhands():
+    print(
+        '\n\n########################################################################'
+    )
+    print('USER: ' + 'openhands' if RUN_AS_OPENHANDS else 'root')
+    print(
+        '########################################################################\n\n'
+    )
+    return [RUN_AS_OPENHANDS]
+
+
+@pytest.fixture(scope='module')  # for xdist
+def runtime_setup_module():
+    _reset_pwd()
+    yield
+    _reset_pwd()
+
+
+@pytest.fixture(scope='session')  # not for xdist
+def runtime_setup_session():
+    _reset_pwd()
+    yield
+    _reset_pwd()
+
+
 # This assures that all tests run together per runtime, not alternating between them,
 # which cause errors (especially outside GitHub actions).
 @pytest.fixture(scope='module', params=get_box_classes())
 def box_class(request):
-    time.sleep(2)
+    time.sleep(1)
     return request.param
 
 
 # TODO: We will change this to `run_as_user` when `ServerRuntime` is deprecated.
 # since `EventStreamRuntime` supports running as an arbitrary user.
-@pytest.fixture(scope='module', params=[True, False])
+@pytest.fixture(scope='module', params=get_run_as_openhands())
 def run_as_openhands(request):
     time.sleep(1)
     return request.param
 
 
-@pytest.fixture(scope='module', params=[True, False])
-def enable_auto_lint(request):
-    time.sleep(1)
-    return request.param
-
-
 @pytest.fixture(scope='module', params=None)
 def base_container_image(request):
     time.sleep(1)
@@ -96,21 +194,12 @@ def base_container_image(request):
         if request.param is None:
             request.param = pytest.param(
                 'nikolaik/python-nodejs:python3.11-nodejs22',
-                'python:3.11-bookworm',
-                'node:22-bookworm',
                 'golang:1.23-bookworm',
             )
     print(f'Container image: {request.param}')
     return request.param
 
 
-@pytest.fixture
-def runtime(temp_dir, box_class, run_as_openhands):
-    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
-    yield runtime
-    time.sleep(1)
-
-
 def _load_runtime(
     temp_dir,
     box_class,
@@ -118,29 +207,45 @@ def _load_runtime(
     enable_auto_lint: bool = False,
     base_container_image: str | None = None,
     browsergym_eval_env: str | None = None,
+    use_workspace: bool | None = None,
 ) -> Runtime:
-    sid = 'test'
-    cli_session = 'main_test'
+    sid = 'rt_' + str(random.randint(100000, 999999))
 
     # AgentSkills need to be initialized **before** Jupyter
     # otherwise Jupyter will not access the proper dependencies installed by AgentSkills
     plugins = [AgentSkillsRequirement(), JupyterRequirement()]
-    config = AppConfig(
-        workspace_base=temp_dir,
-        workspace_mount_path=temp_dir,
-        sandbox=SandboxConfig(
-            use_host_network=True,
-            browsergym_eval_env=browsergym_eval_env,
-        ),
-    )
-    load_from_env(config, os.environ)
+
+    config = load_app_config()
     config.run_as_openhands = run_as_openhands
+
+    # Folder where all tests create their own folder
+    global test_mount_path
+    if use_workspace:
+        test_mount_path = os.path.join(config.workspace_base, 'rt')
+    else:
+        test_mount_path = os.path.join(
+            temp_dir, sid
+        )  # need a subfolder to avoid conflicts
+    config.workspace_mount_path = test_mount_path
+
+    # Mounting folder specific for this test inside the sandbox
+    config.workspace_mount_path_in_sandbox = f'{sandbox_test_folder}/{sid}'
+    print('\nPaths used:')
+    print(f'use_host_network: {config.sandbox.use_host_network}')
+    print(f'workspace_base: {config.workspace_base}')
+    print(f'workspace_mount_path: {config.workspace_mount_path}')
+    print(
+        f'workspace_mount_path_in_sandbox: {config.workspace_mount_path_in_sandbox}\n'
+    )
+
+    config.sandbox.browsergym_eval_env = browsergym_eval_env
     config.sandbox.enable_auto_lint = enable_auto_lint
+
     if base_container_image is not None:
         config.sandbox.base_container_image = base_container_image
 
     file_store = get_file_store(config.file_store, config.file_store_path)
-    event_stream = EventStream(cli_session, file_store)
+    event_stream = EventStream(sid, file_store)
 
     runtime = box_class(
         config=config,
@@ -148,9 +253,14 @@ def _load_runtime(
         sid=sid,
         plugins=plugins,
     )
-    time.sleep(1)
+    time.sleep(2)
     return runtime
 
 
 # Export necessary function
-__all__ = ['_load_runtime']
+__all__ = [
+    '_load_runtime',
+    '_get_host_folder',
+    '_get_sandbox_folder',
+    '_remove_folder',
+]
diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py
index 02301ebb14..69ddfe8eca 100644
--- a/tests/runtime/test_bash.py
+++ b/tests/runtime/test_bash.py
@@ -1,11 +1,14 @@
 """Bash-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""
 
 import os
-import tempfile
-import time
 
 import pytest
-from conftest import _load_runtime
+from conftest import (
+    TEST_IN_CI,
+    _close_test_runtime,
+    _get_sandbox_folder,
+    _load_runtime,
+)
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import CmdRunAction
@@ -16,77 +19,63 @@ from openhands.events.observation import CmdOutputObservation
 # ============================================================================================================================
 
 
+def _run_cmd_action(runtime, custom_command: str, keep_prompt=True):
+    action = CmdRunAction(command=custom_command, keep_prompt=keep_prompt)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert isinstance(obs, CmdOutputObservation)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    return obs
+
+
 def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands):
     runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        # We set env var PS1="\u@\h:\w $"
+        # and construct the PEXCEPT prompt base on it.
+        # When run `env`, bad implementation of CmdRunAction will be pexcepted by this
+        # and failed to pexcept the right content, causing it fail to get error code.
+        obs = runtime.run_action(CmdRunAction(command='env'))
 
-    # We set env var PS1="\u@\h:\w $"
-    # and construct the PEXCEPT prompt base on it.
-    # When run `env`, bad implementation of CmdRunAction will be pexcepted by this
-    # and failed to pexcept the right content, causing it fail to get error code.
-    obs = runtime.run_action(CmdRunAction(command='env'))
+        # For example:
+        # 02:16:13 - openhands:DEBUG: client.py:78 - Executing command: env
+        # 02:16:13 - openhands:DEBUG: client.py:82 - Command output: PYTHONUNBUFFERED=1
+        # CONDA_EXE=/openhands/miniforge3/bin/conda
+        # [...]
+        # LC_CTYPE=C.UTF-8
+        # PS1=\u@\h:\w $
+        # 02:16:13 - openhands:DEBUG: client.py:89 - Executing command for exit code: env
+        # 02:16:13 - openhands:DEBUG: client.py:92 - Exit code Output:
+        # CONDA_DEFAULT_ENV=base
 
-    # For example:
-    # 02:16:13 - openhands:DEBUG: client.py:78 - Executing command: env
-    # 02:16:13 - openhands:DEBUG: client.py:82 - Command output: PYTHONUNBUFFERED=1
-    # CONDA_EXE=/openhands/miniforge3/bin/conda
-    # [...]
-    # LC_CTYPE=C.UTF-8
-    # PS1=\u@\h:\w $
-    # 02:16:13 - openhands:DEBUG: client.py:89 - Executing command for exit code: env
-    # 02:16:13 - openhands:DEBUG: client.py:92 - Exit code Output:
-    # CONDA_DEFAULT_ENV=base
-
-    # As long as the exit code is 0, the test will pass.
-    assert isinstance(
-        obs, CmdOutputObservation
-    ), 'The observation should be a CmdOutputObservation.'
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        # As long as the exit code is 0, the test will pass.
+        assert isinstance(
+            obs, CmdOutputObservation
+        ), 'The observation should be a CmdOutputObservation.'
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+    finally:
+        _close_test_runtime(runtime)
 
 
-def test_single_multiline_command(temp_dir, box_class):
+def test_multiline_commands(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
+    try:
+        # single multiline command
+        obs = _run_cmd_action(runtime, 'echo \\\n -e "foo"')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert 'foo' in obs.content
 
-    action = CmdRunAction(command='echo \\\n -e "foo"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert 'foo' in obs.content
+        # test multiline echo
+        obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert 'hello\r\nworld' in obs.content
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
-
-
-def test_multiline_echo(temp_dir, box_class):
-    runtime = _load_runtime(temp_dir, box_class)
-
-    action = CmdRunAction(command='echo -e "hello\nworld"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert 'hello\r\nworld' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
-
-
-def test_runtime_whitespace(temp_dir, box_class):
-    runtime = _load_runtime(temp_dir, box_class)
-
-    action = CmdRunAction(command='echo -e "\\n\\n\\n"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert '\r\n\r\n\r\n' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        # test whitespace
+        obs = _run_cmd_action(runtime, 'echo -e "\\n\\n\\n"')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert '\r\n\r\n\r\n' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_multiple_multiline_commands(temp_dir, box_class, run_as_openhands):
@@ -120,48 +109,36 @@ world "
     joined_cmds = '\n'.join(cmds)
 
     runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        obs = _run_cmd_action(runtime, joined_cmds)
+        assert obs.exit_code == 0, 'The exit code should be 0.'
 
-    action = CmdRunAction(command=joined_cmds)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-
-    assert 'total 0' in obs.content
-    assert 'hello\r\nworld' in obs.content
-    assert "hello it\\'s me" in obs.content
-    assert 'hello -v' in obs.content
-    assert 'hello\r\nworld\r\nare\r\nyou\r\nthere?' in obs.content
-    assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content
-    assert 'hello\r\nworld "\r\n' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        assert 'total 0' in obs.content
+        assert 'hello\r\nworld' in obs.content
+        assert "hello it\\'s me" in obs.content
+        assert 'hello -v' in obs.content
+        assert 'hello\r\nworld\r\nare\r\nyou\r\nthere?' in obs.content
+        assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content
+        assert 'hello\r\nworld "\r\n' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_no_ps2_in_output(temp_dir, box_class, run_as_openhands):
     """Test that the PS2 sign is not added to the output of a multiline command."""
     runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
 
-    action = CmdRunAction(command='echo -e "hello\nworld"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert 'hello\r\nworld' in obs.content
-    assert '>' not in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        assert 'hello\r\nworld' in obs.content
+        assert '>' not in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_multiline_command_loop(temp_dir, box_class):
     # https://github.com/All-Hands-AI/OpenHands/issues/3143
-
-    runtime = _load_runtime(temp_dir, box_class)
-
     init_cmd = """
 mkdir -p _modules && \
 for month in {01..04}; do
@@ -171,15 +148,6 @@ for month in {01..04}; do
 done
 echo "created files"
 """
-    action = CmdRunAction(command=init_cmd)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert 'created files' in obs.content
-
     follow_up_cmd = """
 for file in _modules/*.md; do
     new_date=$(echo $file | sed -E 's/2024-(01|02|03|04)-/2024-/;s/2024-01/2024-08/;s/2024-02/2024-09/;s/2024-03/2024-10/;s/2024-04/2024-11/')
@@ -187,153 +155,104 @@ for file in _modules/*.md; do
 done
 echo "success"
 """
-    action = CmdRunAction(command=follow_up_cmd)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    runtime = _load_runtime(temp_dir, box_class)
+    try:
+        obs = _run_cmd_action(runtime, init_cmd)
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert 'created files' in obs.content
 
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert 'success' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, follow_up_cmd)
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert 'success' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_cmd_run(temp_dir, box_class, run_as_openhands):
     runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        obs = _run_cmd_action(runtime, 'ls -l /openhands/workspace')
+        assert obs.exit_code == 0
 
-    action = CmdRunAction(command='ls -l')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'total 0' in obs.content
+        obs = _run_cmd_action(runtime, 'ls -l')
+        assert obs.exit_code == 0
+        assert 'total 0' in obs.content
 
-    action = CmdRunAction(command='mkdir test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        obs = _run_cmd_action(runtime, 'mkdir test')
+        assert obs.exit_code == 0
 
-    action = CmdRunAction(command='ls -l')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    if run_as_openhands:
-        assert 'openhands' in obs.content
-    else:
-        assert 'root' in obs.content
-    assert 'test' in obs.content
+        obs = _run_cmd_action(runtime, 'ls -l')
+        assert obs.exit_code == 0
+        if run_as_openhands:
+            assert 'openhands' in obs.content
+        else:
+            assert 'root' in obs.content
+        assert 'test' in obs.content
 
-    action = CmdRunAction(command='touch test/foo.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        obs = _run_cmd_action(runtime, 'touch test/foo.txt')
+        assert obs.exit_code == 0
 
-    action = CmdRunAction(command='ls -l test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'foo.txt' in obs.content
+        obs = _run_cmd_action(runtime, 'ls -l test')
+        assert obs.exit_code == 0
+        assert 'foo.txt' in obs.content
 
-    # clean up: this is needed, since CI will not be
-    # run as root, and this test may leave a file
-    # owned by root
-    action = CmdRunAction(command='rm -rf test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        # clean up: this is needed, since CI will not be
+        # run as root, and this test may leave a file
+        # owned by root
+        _run_cmd_action(runtime, 'rm -rf test')
+        assert obs.exit_code == 0
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_run_as_user_correct_home_dir(temp_dir, box_class, run_as_openhands):
     runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
-
-    action = CmdRunAction(command='cd ~ && pwd')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    if run_as_openhands:
-        assert '/home/openhands' in obs.content
-    else:
-        assert '/root' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    try:
+        obs = _run_cmd_action(runtime, 'cd ~ && pwd')
+        assert obs.exit_code == 0
+        if run_as_openhands:
+            assert '/home/openhands' in obs.content
+        else:
+            assert '/root' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_multi_cmd_run_in_single_line(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
-
-    action = CmdRunAction(command='pwd && ls -l')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert '/workspace' in obs.content
-    assert 'total 0' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    try:
+        obs = _run_cmd_action(runtime, 'pwd && ls -l')
+        assert obs.exit_code == 0
+        assert '/workspace' in obs.content
+        assert 'total 0' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_stateful_cmd(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
+    sandbox_dir = _get_sandbox_folder(runtime)
+    try:
+        obs = _run_cmd_action(runtime, 'mkdir -p test')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
 
-    action = CmdRunAction(command='mkdir test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
+        obs = _run_cmd_action(runtime, 'cd test')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
 
-    action = CmdRunAction(command='cd test')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-
-    action = CmdRunAction(command='pwd')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0, 'The exit code should be 0.'
-    assert '/workspace/test' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, 'pwd')
+        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert f'{sandbox_dir}/test' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_failed_cmd(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
-
-    action = CmdRunAction(command='non_existing_command')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code != 0, 'The exit code should not be 0 for a failed command.'
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    try:
+        obs = _run_cmd_action(runtime, 'non_existing_command')
+        assert obs.exit_code != 0, 'The exit code should not be 0 for a failed command.'
+    finally:
+        _close_test_runtime(runtime)
 
 
 def _create_test_file(host_temp_dir):
@@ -344,154 +263,121 @@ def _create_test_file(host_temp_dir):
 
 def test_copy_single_file(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)
+        sandbox_file = os.path.join(sandbox_dir, 'test_file.txt')
+        _create_test_file(temp_dir)
+        runtime.copy_to(os.path.join(temp_dir, 'test_file.txt'), sandbox_dir)
 
-    with tempfile.TemporaryDirectory() as host_temp_dir:
-        _create_test_file(host_temp_dir)
-        runtime.copy_to(os.path.join(host_temp_dir, 'test_file.txt'), '/workspace')
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
+        assert obs.exit_code == 0
+        assert 'test_file.txt' in obs.content
 
-    action = CmdRunAction(command='ls -alh /workspace')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'test_file.txt' in obs.content
-
-    action = CmdRunAction(command='cat /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'Hello, World!' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, f'cat {sandbox_file}')
+        assert obs.exit_code == 0
+        assert 'Hello, World!' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
-def _create_test_dir_with_files(host_temp_dir):
-    os.mkdir(os.path.join(host_temp_dir, 'test_dir'))
-    with open(os.path.join(host_temp_dir, 'test_dir', 'file1.txt'), 'w') as f:
+def _create_host_test_dir_with_files(test_dir):
+    logger.debug(f'creating `{test_dir}`')
+    if not os.path.isdir(test_dir):
+        os.makedirs(test_dir, exist_ok=True)
+    logger.debug('creating test files in `test_dir`')
+    with open(os.path.join(test_dir, 'file1.txt'), 'w') as f:
         f.write('File 1 content')
-    with open(os.path.join(host_temp_dir, 'test_dir', 'file2.txt'), 'w') as f:
+    with open(os.path.join(test_dir, 'file2.txt'), 'w') as f:
         f.write('File 2 content')
 
 
 def test_copy_directory_recursively(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
 
-    with tempfile.TemporaryDirectory() as host_temp_dir:
+    sandbox_dir = _get_sandbox_folder(runtime)
+    try:
+        temp_dir_copy = os.path.join(temp_dir, 'test_dir')
         # We need a separate directory, since temp_dir is mounted to /workspace
-        _create_test_dir_with_files(host_temp_dir)
-        runtime.copy_to(
-            os.path.join(host_temp_dir, 'test_dir'), '/workspace', recursive=True
-        )
+        _create_host_test_dir_with_files(temp_dir_copy)
 
-    action = CmdRunAction(command='ls -alh /workspace')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'test_dir' in obs.content
-    assert 'file1.txt' not in obs.content
-    assert 'file2.txt' not in obs.content
+        runtime.copy_to(temp_dir_copy, sandbox_dir, recursive=True)
 
-    action = CmdRunAction(command='ls -alh /workspace/test_dir')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'file1.txt' in obs.content
-    assert 'file2.txt' in obs.content
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
+        assert obs.exit_code == 0
+        assert 'test_dir' in obs.content
+        assert 'file1.txt' not in obs.content
+        assert 'file2.txt' not in obs.content
 
-    action = CmdRunAction(command='cat /workspace/test_dir/file1.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'File 1 content' in obs.content
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}/test_dir')
+        assert obs.exit_code == 0
+        assert 'file1.txt' in obs.content
+        assert 'file2.txt' in obs.content
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, f'cat {sandbox_dir}/test_dir/file1.txt')
+        assert obs.exit_code == 0
+        assert 'File 1 content' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_copy_to_non_existent_directory(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
-
-    with tempfile.TemporaryDirectory() as host_temp_dir:
-        _create_test_file(host_temp_dir)
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)
+        _create_test_file(temp_dir)
         runtime.copy_to(
-            os.path.join(host_temp_dir, 'test_file.txt'), '/workspace/new_dir'
+            os.path.join(temp_dir, 'test_file.txt'), f'{sandbox_dir}/new_dir'
         )
 
-    action = CmdRunAction(command='cat /workspace/new_dir/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'Hello, World!' in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, f'cat {sandbox_dir}/new_dir/test_file.txt')
+        assert obs.exit_code == 0
+        assert 'Hello, World!' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_overwrite_existing_file(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)
 
-    # touch a file in /workspace
-    action = CmdRunAction(command='touch /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
+        assert obs.exit_code == 0
 
-    action = CmdRunAction(command='cat /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'Hello, World!' not in obs.content
+        obs = _run_cmd_action(runtime, f'touch {sandbox_dir}/test_file.txt')
+        assert obs.exit_code == 0
 
-    with tempfile.TemporaryDirectory() as host_temp_dir:
-        _create_test_file(host_temp_dir)
-        runtime.copy_to(os.path.join(host_temp_dir, 'test_file.txt'), '/workspace')
+        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
+        assert obs.exit_code == 0
 
-    action = CmdRunAction(command='cat /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'Hello, World!' in obs.content
+        obs = _run_cmd_action(runtime, f'cat {sandbox_dir}/test_file.txt')
+        assert obs.exit_code == 0
+        assert 'Hello, World!' not in obs.content
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        _create_test_file(temp_dir)
+        runtime.copy_to(os.path.join(temp_dir, 'test_file.txt'), sandbox_dir)
+
+        obs = _run_cmd_action(runtime, f'cat {sandbox_dir}/test_file.txt')
+        assert obs.exit_code == 0
+        assert 'Hello, World!' in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_copy_non_existent_file(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)
+        with pytest.raises(FileNotFoundError):
+            runtime.copy_to(
+                os.path.join(sandbox_dir, 'non_existent_file.txt'),
+                f'{sandbox_dir}/should_not_exist.txt',
+            )
 
-    with pytest.raises(FileNotFoundError):
-        runtime.copy_to(
-            os.path.join(temp_dir, 'non_existent_file.txt'),
-            '/workspace/should_not_exist.txt',
-        )
-
-    action = CmdRunAction(command='ls /workspace/should_not_exist.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code != 0  # File should not exist
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(runtime, f'ls {sandbox_dir}/should_not_exist.txt')
+        assert obs.exit_code != 0  # File should not exist
+    finally:
+        _close_test_runtime(runtime)
 
 
 def test_keep_prompt(box_class, temp_dir):
@@ -500,27 +386,26 @@ def test_keep_prompt(box_class, temp_dir):
         box_class=box_class,
         run_as_openhands=False,
     )
+    try:
+        sandbox_dir = _get_sandbox_folder(runtime)
 
-    action = CmdRunAction(command='touch /workspace/test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'root@' in obs.content
+        obs = _run_cmd_action(runtime, f'touch {sandbox_dir}/test_file.txt')
+        assert obs.exit_code == 0
+        assert 'root@' in obs.content
 
-    action = CmdRunAction(command='cat /workspace/test_file.txt', keep_prompt=False)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    assert 'root@' not in obs.content
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        obs = _run_cmd_action(
+            runtime, f'cat {sandbox_dir}/test_file.txt', keep_prompt=False
+        )
+        assert obs.exit_code == 0
+        assert 'root@' not in obs.content
+    finally:
+        _close_test_runtime(runtime)
 
 
+@pytest.mark.skipif(
+    TEST_IN_CI != 'True',
+    reason='This test is not working in WSL (file ownership)',
+)
 def test_git_operation(box_class):
     # do not mount workspace, since workspace mount by tests will be owned by root
     # while the user_id we get via os.getuid() is different from root
@@ -531,69 +416,43 @@ def test_git_operation(box_class):
         # Need to use non-root user to expose issues
         run_as_openhands=True,
     )
-
     # this will happen if permission of runtime is not properly configured
     # fatal: detected dubious ownership in repository at '/workspace'
+    try:
+        # check the ownership of the current directory
+        obs = _run_cmd_action(runtime, 'ls -alh .')
+        assert obs.exit_code == 0
+        # drwx--S--- 2 openhands root   64 Aug  7 23:32 .
+        # drwxr-xr-x 1 root      root 4.0K Aug  7 23:33 ..
+        for line in obs.content.split('\r\n'):
+            if ' ..' in line:
+                # parent directory should be owned by root
+                assert 'root' in line
+                assert 'openhands' not in line
+            elif ' .' in line:
+                # current directory should be owned by openhands
+                # and its group should be root
+                assert 'openhands' in line
+                assert 'root' in line
 
-    # check the ownership of the current directory
-    action = CmdRunAction(command='ls -alh .')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-    # drwx--S--- 2 openhands root   64 Aug  7 23:32 .
-    # drwxr-xr-x 1 root      root 4.0K Aug  7 23:33 ..
-    for line in obs.content.split('\r\n'):
-        if ' ..' in line:
-            # parent directory should be owned by root
-            assert 'root' in line
-            assert 'openhands' not in line
-        elif ' .' in line:
-            # current directory should be owned by openhands
-            # and its group should be root
-            assert 'openhands' in line
-            assert 'root' in line
+        # make sure all git operations are allowed
+        obs = _run_cmd_action(runtime, 'git init')
+        assert obs.exit_code == 0
 
-    # make sure all git operations are allowed
-    action = CmdRunAction(command='git init')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        # create a file
+        obs = _run_cmd_action(runtime, 'echo "hello" > test_file.txt')
+        assert obs.exit_code == 0
 
-    # create a file
-    action = CmdRunAction(command='echo "hello" > test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        # git add
+        obs = _run_cmd_action(runtime, 'git add test_file.txt')
+        assert obs.exit_code == 0
 
-    # git add
-    action = CmdRunAction(command='git add test_file.txt')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
+        # git diff
+        obs = _run_cmd_action(runtime, 'git diff')
+        assert obs.exit_code == 0
 
-    # git diff
-    action = CmdRunAction(command='git diff')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-
-    # git commit
-    action = CmdRunAction(command='git commit -m "test commit"')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert isinstance(obs, CmdOutputObservation)
-    assert obs.exit_code == 0
-
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+        # git commit
+        obs = _run_cmd_action(runtime, 'git commit -m "test commit"')
+        assert obs.exit_code == 0
+    finally:
+        _close_test_runtime(runtime)
diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py
index 1108482262..c0ea3e1810 100644
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@@ -1,9 +1,8 @@
 """Browsing-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""
 
 import json
-import time
 
-from conftest import _load_runtime
+from conftest import _close_test_runtime, _load_runtime
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
@@ -66,8 +65,7 @@ def test_simple_browse(temp_dir, box_class, run_as_openhands):
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
 
 
 def test_browsergym_eval_env(box_class, temp_dir):
@@ -111,5 +109,4 @@ def test_browsergym_eval_env(box_class, temp_dir):
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert json.loads(obs.content) == [0.0]
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
diff --git a/tests/runtime/test_env_vars.py b/tests/runtime/test_env_vars.py
index 316bea63d2..c049404901 100644
--- a/tests/runtime/test_env_vars.py
+++ b/tests/runtime/test_env_vars.py
@@ -1,10 +1,9 @@
 """Env vars related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""
 
 import os
-import time
 from unittest.mock import patch
 
-from conftest import _load_runtime
+from conftest import _close_test_runtime, _load_runtime
 
 from openhands.events.action import CmdRunAction
 from openhands.events.observation import CmdOutputObservation
@@ -30,8 +29,7 @@ def test_env_vars_os_environ(temp_dir, box_class, run_as_openhands):
             obs.content.strip().split('\n\r')[0].strip() == 'BAZ'
         ), f'Output: [{obs.content}] for {box_class}'
 
-        runtime.close(rm_all_containers=False)
-        time.sleep(1)
+        _close_test_runtime(runtime)
 
 
 def test_env_vars_runtime_operations(temp_dir, box_class):
@@ -66,5 +64,4 @@ def test_env_vars_runtime_operations(temp_dir, box_class):
         and obs.content.strip().split('\r\n')[0].strip() == 'new_value'
     )
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
diff --git a/tests/runtime/test_images.py b/tests/runtime/test_images.py
index 4660b363ab..99ff28e969 100644
--- a/tests/runtime/test_images.py
+++ b/tests/runtime/test_images.py
@@ -1,9 +1,7 @@
 """Image-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""
 
-import time
-
 import pytest
-from conftest import _load_runtime
+from conftest import _close_test_runtime, _load_runtime
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import CmdRunAction
@@ -17,7 +15,6 @@ def test_bash_python_version(temp_dir, box_class, base_container_image):
     """Make sure Python is available in bash."""
     if base_container_image not in [
         'python:3.11-bookworm',
-        'nikolaik/python-nodejs:python3.11-nodejs22',
     ]:
         pytest.skip('This test is only for python-related images')
 
@@ -45,15 +42,13 @@ def test_bash_python_version(temp_dir, box_class, base_container_image):
     assert obs.exit_code == 0
     assert 'pip' in obs.content  # Check that pip is available
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
 
 
 def test_nodejs_22_version(temp_dir, box_class, base_container_image):
     """Make sure Node.js is available in bash."""
     if base_container_image not in [
         'node:22-bookworm',
-        'nikolaik/python-nodejs:python3.11-nodejs22',
     ]:
         pytest.skip('This test is only for nodejs-related images')
 
@@ -68,8 +63,7 @@ def test_nodejs_22_version(temp_dir, box_class, base_container_image):
     assert obs.exit_code == 0
     assert 'v22' in obs.content  # Check for specific version
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
 
 
 def test_go_version(temp_dir, box_class, base_container_image):
@@ -90,5 +84,4 @@ def test_go_version(temp_dir, box_class, base_container_image):
     assert obs.exit_code == 0
     assert 'go1.23' in obs.content  # Check for specific version
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
diff --git a/tests/runtime/test_ipython.py b/tests/runtime/test_ipython.py
index b648198a27..e818e2636f 100644
--- a/tests/runtime/test_ipython.py
+++ b/tests/runtime/test_ipython.py
@@ -1,8 +1,12 @@
 """Test the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""
 
-import time
-
-from conftest import _load_runtime
+import pytest
+from conftest import (
+    TEST_IN_CI,
+    _close_test_runtime,
+    _get_sandbox_folder,
+    _load_runtime,
+)
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
@@ -28,6 +32,8 @@ from openhands.runtime.client.runtime import EventStreamRuntime
 def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
     runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
 
+    sandbox_dir = _get_sandbox_folder(runtime)
+
     # Test run command
     action_cmd = CmdRunAction(command='ls -l')
     logger.info(action_cmd, extra={'msg_type': 'ACTION'})
@@ -48,7 +54,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.content.strip() == (
         'Hello, `World`!\n'
-        '[Jupyter current working directory: /workspace]\n'
+        f'[Jupyter current working directory: {sandbox_dir}]\n'
         '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
     )
 
@@ -69,7 +75,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
 
     assert obs.content == ''
     # event stream runtime will always use absolute path
-    assert obs.path == '/workspace/hello.sh'
+    assert obs.path == f'{sandbox_dir}/hello.sh'
 
     # Test read file (file should exist)
     action_read = FileReadAction(path='hello.sh')
@@ -81,7 +87,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
 
     assert obs.content == 'echo "Hello, World!"\n'
-    assert obs.path == '/workspace/hello.sh'
+    assert obs.path == f'{sandbox_dir}/hello.sh'
 
     # clean up
     action = CmdRunAction(command='rm -rf hello.sh')
@@ -90,10 +96,13 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands):
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
 
 
+@pytest.mark.skipif(
+    TEST_IN_CI != 'True',
+    reason='This test is not working in WSL (file ownership)',
+)
 def test_ipython_multi_user(temp_dir, box_class, run_as_openhands):
     runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
 
@@ -111,7 +120,7 @@ def test_ipython_multi_user(temp_dir, box_class, run_as_openhands):
     else:
         assert 'root' in obs.content
 
-    # print pwd
+    # print the current working directory
     test_code = 'import os; print(os.getcwd())'
     action_ipython = IPythonRunCellAction(code=test_code)
     logger.info(action_ipython, extra={'msg_type': 'ACTION'})
@@ -152,7 +161,6 @@ def test_ipython_multi_user(temp_dir, box_class, run_as_openhands):
     if run_as_openhands:
         # -rw-r--r-- 1 openhands root 13 Jul 28 03:53 test.txt
         assert 'openhands' in obs.content.split('\r\n')[0]
-        assert 'root' in obs.content.split('\r\n')[0]
     else:
         # -rw-r--r-- 1 root root 13 Jul 28 03:53 test.txt
         assert 'root' in obs.content.split('\r\n')[0]
@@ -164,12 +172,12 @@ def test_ipython_multi_user(temp_dir, box_class, run_as_openhands):
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
 
 
 def test_ipython_simple(temp_dir, box_class):
     runtime = _load_runtime(temp_dir, box_class)
+    sandbox_dir = _get_sandbox_folder(runtime)
 
     # Test run ipython
     # get username
@@ -183,20 +191,20 @@ def test_ipython_simple(temp_dir, box_class):
         obs.content.strip()
         == (
             '1\n'
-            '[Jupyter current working directory: /workspace]\n'
+            f'[Jupyter current working directory: {sandbox_dir}]\n'
             '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
         ).strip()
     )
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
 
 
 def _test_ipython_agentskills_fileop_pwd_impl(
     runtime: EventStreamRuntime, enable_auto_lint: bool
 ):
+    sandbox_dir = _get_sandbox_folder(runtime)
     # remove everything in /workspace
-    action = CmdRunAction(command='rm -rf /workspace/*')
+    action = CmdRunAction(command=f'rm -rf {sandbox_dir}/*')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -215,12 +223,12 @@ def _test_ipython_agentskills_fileop_pwd_impl(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert isinstance(obs, IPythonRunCellObservation)
     assert obs.content.replace('\r\n', '\n').strip().split('\n') == (
-        '[File: /workspace/hello.py (1 lines total)]\n'
+        f'[File: {sandbox_dir}/hello.py (1 lines total)]\n'
         '(this is the beginning of the file)\n'
         '1|\n'
         '(this is the end of the file)\n'
         '[File hello.py created.]\n'
-        '[Jupyter current working directory: /workspace]\n'
+        f'[Jupyter current working directory: {sandbox_dir}]\n'
         '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
     ).strip().split('\n')
 
@@ -239,12 +247,12 @@ def _test_ipython_agentskills_fileop_pwd_impl(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert isinstance(obs, IPythonRunCellObservation)
     assert obs.content.replace('\r\n', '\n').strip().split('\n') == (
-        '[File: /workspace/test/hello.py (1 lines total)]\n'
+        f'[File: {sandbox_dir}/test/hello.py (1 lines total)]\n'
         '(this is the beginning of the file)\n'
         '1|\n'
         '(this is the end of the file)\n'
         '[File hello.py created.]\n'
-        '[Jupyter current working directory: /workspace/test]\n'
+        f'[Jupyter current working directory: {sandbox_dir}/test]\n'
         '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
     ).strip().split('\n')
 
@@ -258,10 +266,10 @@ def _test_ipython_agentskills_fileop_pwd_impl(
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert isinstance(obs, IPythonRunCellObservation)
         assert obs.content.replace('\r\n', '\n').strip().split('\n') == (
-            """
+            f"""
 [Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]
 ERRORS:
-/workspace/test/hello.py:1:3: E999 IndentationError: unexpected indent
+{sandbox_dir}/test/hello.py:1:3: E999 IndentationError: unexpected indent
 [This is how your edit would have looked if applied]
 -------------------------------------------------
 (this is the beginning of the file)
@@ -278,7 +286,7 @@ ERRORS:
 Your changes have NOT been applied. Please fix your edit command and try again.
 You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.
 DO NOT re-run the same failed edit command. Running it again will lead to the same error.
-[Jupyter current working directory: /workspace/test]
+[Jupyter current working directory: {sandbox_dir}/test]
 [Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]
 """
         ).strip().split('\n')
@@ -292,39 +300,44 @@ DO NOT re-run the same failed edit command. Running it again will lead to the sa
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert isinstance(obs, IPythonRunCellObservation)
     assert obs.content.replace('\r\n', '\n').strip().split('\n') == (
-        """
-[File: /workspace/test/hello.py (1 lines total after edit)]
+        f"""
+[File: {sandbox_dir}/test/hello.py (1 lines total after edit)]
 (this is the beginning of the file)
 1|print("hello world")
 (this is the end of the file)
 [File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
-[Jupyter current working directory: /workspace/test]
+[Jupyter current working directory: {sandbox_dir}/test]
 [Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]
 """
     ).strip().split('\n')
 
-    action = CmdRunAction(command='rm -rf /workspace/*')
+    action = CmdRunAction(command=f'rm -rf {sandbox_dir}/*')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
 
-
-def test_ipython_agentskills_fileop_pwd(
-    temp_dir, box_class, run_as_openhands, enable_auto_lint
+def test_ipython_agentskills_fileop_pwd_with_lint(
+    temp_dir, box_class, run_as_openhands
 ):
-    """Make sure that cd in bash also update the current working directory in ipython."""
-
     runtime = _load_runtime(
-        temp_dir, box_class, run_as_openhands, enable_auto_lint=enable_auto_lint
+        temp_dir, box_class, run_as_openhands, enable_auto_lint=True
     )
-    _test_ipython_agentskills_fileop_pwd_impl(runtime, enable_auto_lint)
+    _test_ipython_agentskills_fileop_pwd_impl(runtime, True)
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
+
+
+def test_ipython_agentskills_fileop_pwd_without_lint(
+    temp_dir, box_class, run_as_openhands
+):
+    runtime = _load_runtime(
+        temp_dir, box_class, run_as_openhands, enable_auto_lint=False
+    )
+    _test_ipython_agentskills_fileop_pwd_impl(runtime, False)
+
+    _close_test_runtime(runtime)
 
 
 def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class):
@@ -392,13 +405,13 @@ def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class):
         '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
     ).strip().split('\n')
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)
 
 
 def test_ipython_package_install(temp_dir, box_class, run_as_openhands):
     """Make sure that cd in bash also update the current working directory in ipython."""
     runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    sandbox_dir = _get_sandbox_folder(runtime)
 
     # It should error out since pymsgbox is not installed
     action = IPythonRunCellAction(code='import pymsgbox')
@@ -424,9 +437,8 @@ def test_ipython_package_install(temp_dir, box_class, run_as_openhands):
     # import should not error out
     assert obs.content.strip() == (
         '[Code executed successfully with no output]\n'
-        '[Jupyter current working directory: /workspace]\n'
+        f'[Jupyter current working directory: {sandbox_dir}]\n'
         '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]'
     )
 
-    runtime.close(rm_all_containers=False)
-    time.sleep(1)
+    _close_test_runtime(runtime)