Add sysbox support to remote runtime for eval; add memory monitor and stress tests to help debug memory issues (#6684)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
2026-03-22 13:47:19 +08:00 · 2025-02-18 15:02:28 -05:00
parent 8d097efb4f
commit 1a7003a705
35 changed files with 687 additions and 419 deletions
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -9,6 +9,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -17,7 +18,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -60,17 +60,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=False,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -40,21 +40,15 @@ from openhands.utils.async_utils import call_async_from_sync
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-slim'
+
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-slim',
-            enable_auto_lint=True,
-            use_host_network=False,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -24,7 +25,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    load_from_toml,
    parse_arguments,
@@ -47,22 +47,14 @@ SKIP_NUM = (
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.11-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=100,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=1800,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -22,7 +23,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -57,18 +57,15 @@ def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
    BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE

    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -71,17 +71,15 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
+
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/browsing_delegation/run_infer.py
+++ b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -18,7 +19,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -36,17 +36,14 @@ def get_config(
    assert (
        metadata.max_iterations == 1
    ), 'max_iterations must be 1 for browsing delegation evaluation.'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=False,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        workspace_base=None,
        workspace_mount_path=None,
    )
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    assert_and_raise,
    codeact_user_response,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -105,9 +105,7 @@ def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
 ) -> AppConfig:
-    # COMMIT0_CONTAINER_IMAGE = 'wentingzhao/'
    assert USE_INSTANCE_IMAGE
-    # We use a different instance image for the each instance of commit0 eval
    repo_name = instance['repo'].split('/')[1]
    base_container_image = get_instance_docker_image(repo_name)
    logger.info(
@@ -115,28 +113,16 @@ def get_config(
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )
-    # else:
-    #     raise
-    # base_container_image = SWE_BENCH_CONTAINER_IMAGE
-    # logger.info(f'Using swe-bench container image: {base_container_image}')
+
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image

    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -62,17 +62,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -48,17 +48,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -19,7 +20,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -40,17 +40,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@@ -29,6 +29,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -37,7 +38,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -61,17 +61,14 @@ ACTION_FORMAT = """
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -22,6 +22,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -30,7 +31,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -82,17 +82,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -9,6 +9,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -17,7 +18,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -45,18 +45,18 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-logic-reasoning:v1.0'
+    sandbox_config.runtime_extra_deps = (
+        '$OH_INTERPRETER_PATH -m pip install scitools-pyke'
+    )
+
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-logic-reasoning:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/miniwob/run_infer.py
+++ b/evaluation/benchmarks/miniwob/run_infer.py
@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -55,23 +55,14 @@ def get_config(
    metadata: EvalMetadata,
    env_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-miniwob:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            remote_runtime_init_timeout=1800,
-            keep_runtime_alive=False,
-            timeout=120,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -22,7 +23,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -103,18 +103,18 @@ def load_incontext_example(task_name: str, with_tool: bool = True):
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-mint:v1.0'
+    sandbox_config.runtime_extra_deps = (
+        f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
+    )
+
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-mint:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -25,6 +25,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -33,7 +34,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
    load_app_config,
@@ -77,16 +77,14 @@ ID2CONDA = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='public.ecr.aws/i5g0m1f6/ml-bench',
-            enable_auto_lint=True,
-            use_host_network=False,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -20,7 +21,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -59,22 +59,17 @@ def get_config(
    metadata: EvalMetadata,
    instance_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = (
+        'docker.io/xingyaoww/openhands-eval-scienceagentbench'
+    )
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
        max_budget_per_task=4,
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench',
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -1,5 +1,6 @@
 import json
 import os
+import subprocess
 import tempfile
 import time
 from functools import partial
@@ -21,13 +22,14 @@ from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    get_default_sandbox_config_for_eval,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
 )
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
+    LLMConfig,
    get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
@@ -79,22 +81,16 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> AppConfig:
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance['instance_id'],
+    )
    config = AppConfig(
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=600,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            remote_runtime_init_timeout=3600,
-            remote_runtime_resource_factor=get_instance_resource_factor(
-                dataset_name=metadata.dataset,
-                instance_id=instance['instance_id'],
-            ),
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
@@ -415,13 +411,17 @@ if __name__ == '__main__':
    else:
        # Initialize with a dummy metadata when file doesn't exist
        metadata = EvalMetadata(
-            agent_class="dummy_agent",  # Placeholder agent class
-            llm_config=LLMConfig(model="dummy_model"),  # Minimal LLM config
+            agent_class='dummy_agent',  # Placeholder agent class
+            llm_config=LLMConfig(model='dummy_model'),  # Minimal LLM config
            max_iterations=1,  # Minimal iterations
-            eval_output_dir=os.path.dirname(args.input_file),  # Use input file dir as output dir
+            eval_output_dir=os.path.dirname(
+                args.input_file
+            ),  # Use input file dir as output dir
            start_time=time.strftime('%Y-%m-%d %H:%M:%S'),  # Current time
-            git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('utf-8').strip(),  # Current commit
-            dataset=args.dataset  # Dataset name from args
+            git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+            .decode('utf-8')
+            .strip(),  # Current commit
+            dataset=args.dataset,  # Dataset name from args
        )

    # The evaluation harness constrains the signature of `process_instance_func` but we need to
--- a/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json
+++ b/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json
@@ -1 +0,0 @@
-{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, "psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, "django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, 
"matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, "scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, "sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, 
"sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, "sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2}
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    assert_and_raise,
    codeact_user_response,
+    get_default_sandbox_config_for_eval,
    get_metrics,
    is_fatal_evaluation_error,
    make_metadata,
@@ -30,7 +31,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -122,30 +122,23 @@ def get_config(
        base_container_image = SWE_BENCH_CONTAINER_IMAGE
        logger.info(f'Using swe-bench container image: {base_container_image}')

+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.enable_auto_lint = True
+    sandbox_config.use_host_network = False
+    # Add platform to the sandbox config to solve issue 4401
+    sandbox_config.platform = 'linux/amd64'
+    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance['instance_id'],
+    )
+
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            # Add platform to the sandbox config to solve issue 4401
-            platform='linux/amd64',
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_api_timeout=120,
-            remote_runtime_resource_factor=get_instance_resource_factor(
-                dataset_name=metadata.dataset,
-                instance_id=instance['instance_id'],
-            ),
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
@@ -331,6 +324,22 @@ def complete_runtime(
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    if obs.exit_code == -1:
+        # The previous command is still running
+        # We need to kill previous command
+        logger.info('The previous command is still running, trying to kill it...')
+        action = CmdRunAction(command='C-c')
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Then run the command again
+        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
--- a/evaluation/benchmarks/the_agent_company/run_infer.py
+++ b/evaluation/benchmarks/the_agent_company/run_infer.py
@@ -13,11 +13,11 @@ from typing import List
 import yaml
 from browsing import pre_login

+from evaluation.utils.shared import get_default_sandbox_config_for_eval
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
    LLMConfig,
-    SandboxConfig,
    get_agent_config_arg,
    get_llm_config_arg,
    get_parser,
@@ -38,6 +38,8 @@ def get_config(
    llm_config: LLMConfig,
    agent_config: AgentConfig | None,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
    config = AppConfig(
        run_as_openhands=False,
        max_budget_per_task=4,
@@ -45,16 +47,7 @@ def get_config(
        save_trajectory_path=os.path.join(
            mount_path_on_host, f'traj_{task_short_name}.json'
        ),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            # using host network to access the host machine from the container
-            use_host_network=True,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # we mount trajectories path so that trajectories, generated by OpenHands
        # controller, can be accessible to the evaluator file in the runtime container
        workspace_mount_path=mount_path_on_host,
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -18,7 +19,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    get_parser,
 )
@@ -41,17 +41,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/visualwebarena/run_infer.py
+++ b/evaluation/benchmarks/visualwebarena/run_infer.py
@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -20,7 +21,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -55,32 +55,29 @@ def get_config(
    assert base_url is not None, 'VISUALWEBARENA_BASE_URL must be set'
    assert openai_api_key is not None, 'OPENAI_API_KEY must be set'
    assert openai_base_url is not None, 'OPENAI_BASE_URL must be set'
+
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
+    sandbox_config.browsergym_eval_env = env_id
+    sandbox_config.runtime_startup_env_vars = {
+        'BASE_URL': base_url,
+        'OPENAI_API_KEY': openai_api_key,
+        'OPENAI_BASE_URL': openai_base_url,
+        'VWA_CLASSIFIEDS': f'{base_url}:9980',
+        'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
+        'VWA_SHOPPING': f'{base_url}:7770',
+        'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
+        'VWA_REDDIT': f'{base_url}:9999',
+        'VWA_GITLAB': f'{base_url}:8023',
+        'VWA_WIKIPEDIA': f'{base_url}:8888',
+        'VWA_HOMEPAGE': f'{base_url}:4399',
+    }
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            runtime_startup_env_vars={
-                'BASE_URL': base_url,
-                'OPENAI_API_KEY': openai_api_key,
-                'OPENAI_BASE_URL': openai_base_url,
-                'VWA_CLASSIFIEDS': f'{base_url}:9980',
-                'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
-                'VWA_SHOPPING': f'{base_url}:7770',
-                'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
-                'VWA_REDDIT': f'{base_url}:9999',
-                'VWA_GITLAB': f'{base_url}:8023',
-                'VWA_WIKIPEDIA': f'{base_url}:8888',
-                'VWA_HOMEPAGE': f'{base_url}:4399',
-            },
-            timeout=300,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/webarena/run_infer.py
+++ b/evaluation/benchmarks/webarena/run_infer.py
@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -19,7 +20,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -50,29 +50,26 @@ def get_config(
    assert base_url is not None, 'WEBARENA_BASE_URL must be set'
    assert openai_api_key is not None, 'OPENAI_API_KEY must be set'

+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
+    sandbox_config.browsergym_eval_env = env_id
+    sandbox_config.runtime_startup_env_vars = {
+        'BASE_URL': base_url,
+        'OPENAI_API_KEY': openai_api_key,
+        'SHOPPING': f'{base_url}:7770/',
+        'SHOPPING_ADMIN': f'{base_url}:7780/admin',
+        'REDDIT': f'{base_url}:9999',
+        'GITLAB': f'{base_url}:8023',
+        'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
+        'MAP': f'{base_url}:3000',
+        'HOMEPAGE': f'{base_url}:4399',
+    }
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            runtime_startup_env_vars={
-                'BASE_URL': base_url,
-                'OPENAI_API_KEY': openai_api_key,
-                'SHOPPING': f'{base_url}:7770/',
-                'SHOPPING_ADMIN': f'{base_url}:7780/admin',
-                'REDDIT': f'{base_url}:9999',
-                'GITLAB': f'{base_url}:8023',
-                'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
-                'MAP': f'{base_url}:3000',
-                'HOMEPAGE': f'{base_url}:4399',
-            },
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -8,6 +8,7 @@ from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestRes
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    AgentConfig,
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -43,23 +43,14 @@ def get_config(
    metadata: EvalMetadata,
    instance_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.platform = 'linux/amd64'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            # use default base_container_image
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=300,
-            # Add platform to the sandbox config to solve issue 4401
-            platform='linux/amd64',
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -16,7 +16,7 @@ from pydantic import BaseModel
 from tqdm import tqdm

 from openhands.controller.state.state import State
-from openhands.core.config import LLMConfig
+from openhands.core.config import LLMConfig, SandboxConfig
 from openhands.core.config.agent_config import AgentConfig
 from openhands.core.config.condenser_config import (
    CondenserConfig,
@@ -555,3 +555,18 @@ def get_metrics(state: State) -> dict[str, Any]:
    metrics = state.metrics.get() if state.metrics else {}
    metrics['condenser'] = get_condensation_metadata(state)
    return metrics
+
+
+def get_default_sandbox_config_for_eval() -> SandboxConfig:
+    return SandboxConfig(
+        use_host_network=False,
+        # large enough timeout, since some testcases take very long to run
+        timeout=300,
+        api_key=os.environ.get('ALLHANDS_API_KEY', None),
+        remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+        keep_runtime_alive=False,
+        remote_runtime_init_timeout=3600,
+        remote_runtime_api_timeout=120,
+        remote_runtime_enable_retries=True,
+        remote_runtime_class='sysbox',
+    )
--- a/openhands/core/config/sandbox_config.py
+++ b/openhands/core/config/sandbox_config.py
@@ -52,6 +52,9 @@ class SandboxConfig(BaseModel):
    remote_runtime_init_timeout: int = Field(default=180)
    remote_runtime_api_timeout: int = Field(default=10)
    remote_runtime_enable_retries: bool = Field(default=False)
+    remote_runtime_class: str | None = Field(
+        default='sysbox'
+    )  # None or "gvisor" (both map to gvisor) or "sysbox" (the default; supports docker inside runtime + more stable)
    enable_auto_lint: bool = Field(
        default=False  # once enabled, OpenHands would lint files after editing
    )
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -57,6 +57,7 @@ from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin
 from openhands.runtime.utils.bash import BashSession
 from openhands.runtime.utils.files import insert_lines, read_lines
+from openhands.runtime.utils.memory_monitor import MemoryMonitor
 from openhands.runtime.utils.runtime_init import init_user_and_working_directory
 from openhands.runtime.utils.system_stats import get_system_stats
 from openhands.utils.async_utils import call_sync_from_async, wait_all
@@ -171,12 +172,19 @@ class ActionExecutor:
        else:
            logger.info('No max memory limit set, using all available system memory')

+        self.memory_monitor = MemoryMonitor(
+            enable=os.environ.get('RUNTIME_MEMORY_MONITOR', 'False').lower()
+            in ['true', '1', 'yes']
+        )
+        self.memory_monitor.start_monitoring()
+
    @property
    def initial_cwd(self):
        return self._initial_cwd

    async def ainit(self):
        # bash needs to be initialized first
+        logger.debug('Initializing bash session')
        self.bash_session = BashSession(
            work_dir=self._initial_cwd,
            username=self.username,
@@ -186,15 +194,18 @@ class ActionExecutor:
            max_memory_mb=self.max_memory_gb * 1024 if self.max_memory_gb else None,
        )
        self.bash_session.initialize()
+        logger.debug('Bash session initialized')

        await wait_all(
            (self._init_plugin(plugin) for plugin in self.plugins_to_load),
            timeout=30,
        )
+        logger.debug('All plugins initialized')

        # This is a temporary workaround
        # TODO: refactor AgentSkills to be part of JupyterPlugin
        # AFTER ServerRuntime is deprecated
+        logger.debug('Initializing AgentSkills')
        if 'agent_skills' in self.plugins and 'jupyter' in self.plugins:
            obs = await self.run_ipython(
                IPythonRunCellAction(
@@ -203,6 +214,7 @@ class ActionExecutor:
            )
            logger.debug(f'AgentSkills initialized: {obs}')

+        logger.debug('Initializing bash commands')
        await self._init_bash_commands()
        logger.debug('Runtime client initialized.')
        self._initialized = True
@@ -447,6 +459,7 @@ class ActionExecutor:
        return await browse(action, self.browser)

    def close(self):
+        self.memory_monitor.stop_monitoring()
        if self.bash_session is not None:
            self.bash_session.close()
        self.browser.close()
--- a/openhands/runtime/impl/docker/docker_runtime.py
+++ b/openhands/runtime/impl/docker/docker_runtime.py
@@ -255,7 +255,6 @@ class DockerRuntime(ActionExecutionClient):
            server_port=self._container_port,
            plugins=self.plugins,
            app_config=self.config,
-            use_nice_for_root=False,
        )

        try:
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -75,6 +75,8 @@ class RemoteRuntime(ActionExecutionClient):
                'remote_runtime_api_url is required in the remote runtime.'
            )

+        assert self.config.sandbox.remote_runtime_class in (None, 'sysbox', 'gvisor')
+
        self.runtime_builder = RemoteRuntimeBuilder(
            self.config.sandbox.remote_runtime_api_url,
            self.config.sandbox.api_key,
@@ -225,6 +227,9 @@ class RemoteRuntime(ActionExecutionClient):
            'session_id': self.sid,
            'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
        }
+        if self.config.sandbox.remote_runtime_class == 'sysbox':
+            start_request['runtime_class'] = 'sysbox-runc'
+        # We ignore other runtime classes for now, because both None and 'gvisor' map to 'gvisor'

        # Start the sandbox using the /start endpoint
        try:
--- a/openhands/runtime/utils/command.py
+++ b/openhands/runtime/utils/command.py
@@ -16,7 +16,6 @@ def get_action_execution_server_startup_command(
    plugins: list[PluginRequirement],
    app_config: AppConfig,
    python_prefix: list[str] = DEFAULT_PYTHON_PREFIX,
-    use_nice_for_root: bool = True,
    override_user_id: int | None = None,
    override_username: str | None = None,
 ):
@@ -40,7 +39,6 @@ def get_action_execution_server_startup_command(
    user_id = override_user_id or (
        sandbox_config.user_id if app_config.run_as_openhands else 0
    )
-    is_root = bool(username == 'root')

    base_cmd = [
        *python_prefix,
@@ -59,17 +57,4 @@ def get_action_execution_server_startup_command(
        *browsergym_args,
    ]

-    if is_root and use_nice_for_root:
-        # If running as root, set highest priority and lowest OOM score
-        cmd_str = ' '.join(base_cmd)
-        return [
-            'nice',
-            '-n',
-            '-20',  # Highest priority
-            'sh',
-            '-c',
-            f'echo -1000 > /proc/self/oom_score_adj && exec {cmd_str}',
-        ]
-    else:
-        # If not root OR not using nice for root, run with normal priority
-        return base_cmd
+    return base_cmd
--- a/openhands/runtime/utils/memory_monitor.py
+++ b/openhands/runtime/utils/memory_monitor.py
@@ -0,0 +1,66 @@
+"""Memory monitoring utilities for the runtime."""
+
+import threading
+
+from memory_profiler import memory_usage
+
+from openhands.core.logger import openhands_logger as logger
+
+
+class LogStream:
+    """Stream-like object that redirects writes to a logger."""
+
+    def write(self, message):
+        if message and not message.isspace():
+            logger.info(f'[Memory usage] {message.strip()}')
+
+    def flush(self):
+        pass
+
+
+class MemoryMonitor:
+    def __init__(self, enable: bool = False):
+        """Memory monitor for the runtime."""
+        self._monitoring_thread: threading.Thread | None = None
+        self._stop_monitoring = threading.Event()
+        self.log_stream = LogStream()
+        self.enable = enable
+
+    def start_monitoring(self):
+        """Start monitoring memory usage."""
+        if not self.enable:
+            return
+
+        if self._monitoring_thread is not None:
+            return
+
+        def monitor_process():
+            try:
+                # Use memory_usage's built-in monitoring loop
+                mem_usage = memory_usage(
+                    -1,  # Monitor current process
+                    interval=0.1,  # Sample every 0.1 seconds
+                    timeout=3600,  # Stop sampling after 3600 seconds (1 hour)
+                    max_usage=False,  # Get continuous readings
+                    include_children=True,  # Include child processes
+                    multiprocess=True,  # Monitor all processes
+                    stream=self.log_stream,  # Redirect output to logger
+                    backend='psutil_pss',
+                )
+                logger.info(f'Memory usage across time: {mem_usage}')
+            except Exception as e:
+                logger.error(f'Memory monitoring failed: {e}')
+
+        self._monitoring_thread = threading.Thread(target=monitor_process, daemon=True)
+        self._monitoring_thread.start()
+        logger.info('Memory monitoring started')
+
+    def stop_monitoring(self):
+        """Stop monitoring memory usage."""
+        if not self.enable:
+            return
+
+        if self._monitoring_thread is not None:
+            self._stop_monitoring.set()
+            self._monitoring_thread = None
+            logger.info('Memory monitoring stopped')
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -4909,6 +4909,21 @@ files = [
    {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
 ]

+[[package]]
+name = "memory-profiler"
+version = "0.61.0"
+description = "A module for monitoring memory usage of a python program"
+optional = false
+python-versions = ">=3.5"
+groups = ["main"]
+files = [
+    {file = "memory_profiler-0.61.0-py3-none-any.whl", hash = "sha256:400348e61031e3942ad4d4109d18753b2fb08c2f6fb8290671c5513a34182d84"},
+    {file = "memory_profiler-0.61.0.tar.gz", hash = "sha256:4e5b73d7864a1d1292fb76a03e82a3e78ef934d06828a698d9dada76da2067b0"},
+]
+
+[package.dependencies]
+psutil = "*"
+
 [[package]]
 name = "minio"
 version = "7.2.15"
@@ -10787,4 +10802,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.12"
-content-hash = "63c0a6d2f0c382f9e8010ab167df76d3275945acf4fba3da7611d68be8241429"
+content-hash = "a663ed31b71b4307c9f9665a8af4d5fbb8e1a4f0a5a562055df5ec981e5bdc16"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,9 +71,11 @@ openhands-aci = "^0.2.3"
 python-socketio = "^5.11.4"
 redis = "^5.2.0"
 sse-starlette = "^2.1.3"
+psutil = "*"
 stripe = "^11.5.0"
 ipywidgets = "^8.1.5"
 qtconsole = "^5.6.1"
+memory-profiler = "^0.61.0"

 [tool.poetry.group.llama-index.dependencies]
 llama-index = "*"
--- a/tests/runtime/test_stress_remote_runtime.py
+++ b/tests/runtime/test_stress_remote_runtime.py
@@ -1,8 +1,21 @@
-"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
+"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.
+
+Example usage:
+
+```bash
+export ALLHANDS_API_KEY="YOUR_API_KEY"
+export RUNTIME=remote
+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.staging.all-hands.dev"
+poetry run pytest -vvxss tests/runtime/test_stress_remote_runtime.py
+```
+
+"""

 import asyncio
 import os
 import tempfile
+import time
+from datetime import datetime
 from unittest.mock import MagicMock

 import pandas as pd
@@ -30,7 +43,12 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.action import (
+    CmdRunAction,
+    FileEditAction,
+    FileWriteAction,
+    MessageAction,
+)
 from openhands.events.observation import CmdOutputObservation
 from openhands.events.serialization.event import event_to_dict
 from openhands.llm import LLM
@@ -42,20 +60,10 @@ AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
 }


-def get_config(
-    metadata: EvalMetadata,
-) -> AppConfig:
-    assert (
-        os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL') is not None
-    ), 'SANDBOX_REMOTE_RUNTIME_API_URL must be set.'
-    assert (
-        os.environ.get('ALLHANDS_API_KEY') is not None
-    ), 'ALLHANDS_API_KEY must be set.'
+def get_config() -> AppConfig:
    config = AppConfig(
-        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
-        runtime='remote',
+        runtime=os.environ.get('RUNTIME', 'remote'),
        sandbox=SandboxConfig(
            base_container_image='python:3.11-bookworm',
            enable_auto_lint=True,
@@ -63,8 +71,11 @@ def get_config(
            # large enough timeout, since some testcases take very long to run
            timeout=300,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            remote_runtime_api_url=os.environ.get(
+                'SANDBOX_REMOTE_RUNTIME_API_URL', None
+            ),
            keep_runtime_alive=False,
+            remote_runtime_resource_factor=1,
        ),
        # do not mount workspace
        workspace_base=None,
@@ -79,132 +90,130 @@ def get_config(
    return config


-def initialize_runtime(
-    runtime: Runtime,
-):
-    """Initialize the runtime for the agent.
-
-    This function is called before the runtime is used to run the agent.
-    """
-    logger.info('-' * 30)
-    logger.info('BEGIN Runtime Initialization Fn')
-    logger.info('-' * 30)
-    obs: CmdOutputObservation
-
-    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
-
-    action = CmdRunAction(command='mkdir -p /dummy_dir')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0,
-        f'Failed to create /dummy_dir: {str(obs)}',
-    )
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Construct the full path for the desired file name within the temporary directory
-        temp_file_path = os.path.join(temp_dir, 'dummy_file')
-        # Write to the file with the desired name within the temporary directory
-        with open(temp_file_path, 'w') as f:
-            f.write('dummy content')
-
-        # Copy the file to the desired location
-        runtime.copy_to(temp_file_path, '/dummy_dir/')
-
-    logger.info('-' * 30)
-    logger.info('END Runtime Initialization Fn')
-    logger.info('-' * 30)
-
-
-def process_instance(
-    instance: pd.Series,
-    metadata: EvalMetadata,
-    reset_logger: bool = True,
-) -> EvalOutput:
-    config = get_config(metadata)
-
-    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-    if reset_logger:
-        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
-        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
-    else:
-        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
-
-    runtime = create_runtime(config, headless_mode=False)
-    call_async_from_sync(runtime.connect)
-
-    try:
-        initialize_runtime(runtime)
-
-        instruction = 'dummy instruction'
-        agent = Agent.get_cls(metadata.agent_class)(
-            llm=LLM(config=metadata.llm_config),
-            config=config.get_agent_config(metadata.agent_class),
-        )
-
-        def next_command(*args, **kwargs):
-            return CmdRunAction(command='ls -lah')
-
-        agent.step = MagicMock(side_effect=next_command)
-
-        # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State | None = asyncio.run(
-            run_controller(
-                config=config,
-                initial_user_action=MessageAction(content=instruction),
-                runtime=runtime,
-                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                    metadata.agent_class
-                ],
-                agent=agent,
-            )
-        )
-
-        # if fatal error, throw EvalError to trigger re-run
-        if (
-            state.last_error
-            and 'fatal error during agent execution' in state.last_error
-            and 'stuck in a loop' not in state.last_error
-        ):
-            raise EvalException('Fatal error detected: ' + state.last_error)
-
-    finally:
-        runtime.close()
-
-    test_result = {}
-    if state is None:
-        raise ValueError('State should not be None.')
-    histories = [event_to_dict(event) for event in state.history]
-    metrics = state.metrics.get() if state.metrics else None
-
-    # Save the output
-    output = EvalOutput(
-        instance_id=instance.instance_id,
-        instruction=instruction,
-        instance=instance.to_dict(),  # SWE Bench specific
-        test_result=test_result,
-        metadata=metadata,
-        history=histories,
-        metrics=metrics,
-        error=state.last_error if state and state.last_error else None,
-    )
-    return output
-
-
@pytest.mark.skipif(
    TEST_IN_CI,
    reason='This test should only be run locally, not in CI.',
 )
-def test_stress_remote_runtime(n_eval_workers: int = 64):
+def test_stress_remote_runtime_eval(n_eval_workers: int = 64):
    """Mimic evaluation setting to test remote runtime in a multi-processing setting."""

+    def _initialize_runtime(
+        runtime: Runtime,
+    ):
+        """Prepare a freshly connected runtime before the agent uses it.
+
+        Runs two setup shell commands (exporting USER and creating /dummy_dir),
+        checking each exit code with `assert_and_raise`, and then copies a small
+        file written on the host into /dummy_dir inside the runtime.
+        """
+        banner = '-' * 30
+        logger.info(banner)
+        logger.info('BEGIN Runtime Initialization Fn')
+        logger.info(banner)
+
+        # (shell command, message prefix used when the command exits non-zero)
+        setup_steps = (
+            ("""export USER=$(whoami); echo USER=${USER} """, 'Failed to export USER'),
+            ('mkdir -p /dummy_dir', 'Failed to create /dummy_dir'),
+        )
+        for command, failure_prefix in setup_steps:
+            step = CmdRunAction(command=command)
+            step.set_hard_timeout(600)
+            logger.info(step, extra={'msg_type': 'ACTION'})
+            result: CmdOutputObservation = runtime.run_action(step)
+            logger.info(result, extra={'msg_type': 'OBSERVATION'})
+            assert_and_raise(
+                result.exit_code == 0,
+                f'{failure_prefix}: {str(result)}',
+            )
+
+        # Write the file under its final name in a scratch directory on the host,
+        # then push it into the runtime while the scratch directory still exists.
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            local_file = os.path.join(scratch_dir, 'dummy_file')
+            with open(local_file, 'w') as handle:
+                handle.write('dummy content')
+            runtime.copy_to(local_file, '/dummy_dir/')
+
+        logger.info(banner)
+        logger.info('END Runtime Initialization Fn')
+        logger.info(banner)
+
+    def _process_instance(
+        instance: pd.Series,
+        metadata: EvalMetadata,
+        reset_logger: bool = True,
+    ) -> EvalOutput:
+        """Run one dummy evaluation instance against a freshly created runtime.
+
+        The agent's `step` is replaced by a mock that always returns an
+        `ls -lah` command, so every agent step is the same fixed shell command.
+
+        Args:
+            instance: Row describing the instance; `instance_id` is read from it.
+            metadata: Evaluation metadata; the agent class, LLM config and
+                `eval_output_dir` are read from it.
+            reset_logger: When True, call `reset_logger_for_multiprocessing` with
+                a log directory under `<eval_output_dir>/infer_logs`; otherwise
+                only log that the instance is starting.
+
+        Returns:
+            An EvalOutput carrying the serialized history, metrics and last error.
+
+        Raises:
+            ValueError: If the controller returns no state.
+            EvalException: If the final state reports a fatal agent error that is
+                not a "stuck in a loop" error.
+        """
+        config = get_config()
+
+        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+        if reset_logger:
+            log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+            reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+        else:
+            logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+        instruction = 'dummy instruction'
+
+        runtime = create_runtime(config, headless_mode=True)
+        call_async_from_sync(runtime.connect)
+
+        try:
+            _initialize_runtime(runtime)
+
+            agent = Agent.get_cls(metadata.agent_class)(
+                llm=LLM(config=metadata.llm_config),
+                config=config.get_agent_config(metadata.agent_class),
+            )
+
+            def next_command(*args, **kwargs):
+                return CmdRunAction(command='ls -lah')
+
+            agent.step = MagicMock(side_effect=next_command)
+
+            # Here's how you can run the agent (similar to the `main` function) and get the final task state
+            state: State | None = asyncio.run(
+                run_controller(
+                    config=config,
+                    initial_user_action=MessageAction(content=instruction),
+                    runtime=runtime,
+                    fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                        metadata.agent_class
+                    ],
+                    agent=agent,
+                )
+            )
+
+            # Guard against a missing state *before* touching its attributes;
+            # otherwise a None state surfaces as an AttributeError below.
+            if state is None:
+                raise ValueError('State should not be None.')
+
+            # if fatal error, throw EvalError to trigger re-run
+            if (
+                state.last_error
+                and 'fatal error during agent execution' in state.last_error
+                and 'stuck in a loop' not in state.last_error
+            ):
+                raise EvalException('Fatal error detected: ' + state.last_error)
+
+        finally:
+            runtime.close()
+
+        test_result = {}
+        histories = [event_to_dict(event) for event in state.history]
+        metrics = state.metrics.get() if state.metrics else None
+
+        # Save the output
+        output = EvalOutput(
+            instance_id=instance.instance_id,
+            instruction=instruction,
+            instance=instance.to_dict(),  # SWE Bench specific
+            test_result=test_result,
+            metadata=metadata,
+            history=histories,
+            metrics=metrics,
+            error=state.last_error or None,
+        )
+        return output
+
    llm_config = LLMConfig()
    metadata = make_metadata(
        llm_config,
@@ -228,4 +237,247 @@ def test_stress_remote_runtime(n_eval_workers: int = 64):
        dummy_instance, output_file, eval_n_limit=len(dummy_instance)
    )

-    run_evaluation(instances, metadata, output_file, n_eval_workers, process_instance)
+    run_evaluation(instances, metadata, output_file, n_eval_workers, _process_instance)
+
+
+@pytest.mark.skipif(
+    TEST_IN_CI,
+    reason='This test should only be run locally, not in CI.',
+)
+def test_stress_remote_runtime_long_output_with_soft_and_hard_timeout():
+    """Stress the runtime with interactive input, long output and hard timeouts.
+
+    Each of the 10 iterations first records memory statistics from inside the
+    sandbox (overall memory, top processes, tmux, action_execution_server) and
+    then checks that:
+
+    * a command waiting on stdin reports exit code -1 and completes once the
+      input is sent;
+    * an output-heavy, long-running command is cut off by a 2 second hard
+      timeout;
+    * the terminal rejects new commands until Ctrl+C is sent, and works again
+      afterwards.
+    """
+    config = get_config()
+
+    # Create the runtime before entering `try`: if creation itself fails there is
+    # nothing to close, and `finally` must not trip over an unbound `runtime`.
+    runtime = create_runtime(config, headless_mode=True)
+    try:
+        call_async_from_sync(runtime.connect)
+
+        # Run a command that generates long output multiple times
+        for i in range(10):
+            start_time = time.time()
+            iteration_stats = {
+                'iteration': i,
+                'timestamp': time.time(),
+            }
+
+            # Check overall system memory usage
+            mem_action = CmdRunAction(
+                'free -k | grep "Mem:" | awk \'{printf "Total: %8.1f MB, Used: %8.1f MB, Free: %8.1f MB, Available: %8.1f MB\\n", $2/1024, $3/1024, $4/1024, $7/1024}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'System memory usage (iteration {i}): {mem_obs.content.strip()}'
+            )
+            # Parse memory values from output ("Total: 1.0 MB, Used: ...")
+            mem_parts = mem_obs.content.strip().split(',')
+            for part in mem_parts:
+                key, value = part.strip().split(':')
+                iteration_stats[f'memory_{key.lower()}'] = float(
+                    value.replace('MB', '').strip()
+                )
+
+            # Check top memory-consuming processes
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f MB  %s\\n", $6/1024, $0}\' | sort -nr | head -n 5'
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            _top_processes = [
+                line.strip() for line in mem_obs.content.strip().split('\n')
+            ]
+            # Build the bullet list outside the f-string: a backslash inside an
+            # f-string expression is a SyntaxError before Python 3.12.
+            top_processes_report = '\n'.join(f'- {line}' for line in _top_processes)
+            logger.info(
+                f'Top 5 memory-consuming processes (iteration {i}):\n{top_processes_report}'
+            )
+            iteration_stats['top_processes'] = _top_processes
+
+            # Check tmux memory usage (the awk stage prints $6/1024 labelled MB)
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f MB  %s\\n", $6/1024, $0}\' | sort -nr | grep "/usr/bin/tmux" | grep -v grep | awk \'{print $1}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'Tmux memory usage (iteration {i}): {mem_obs.content.strip()} MB'
+            )
+            try:
+                iteration_stats['tmux_memory_mb'] = float(mem_obs.content.strip())
+            except (ValueError, AttributeError):
+                # No single numeric value (e.g. no match or several matches).
+                iteration_stats['tmux_memory_mb'] = None
+
+            # Check action_execution_server mem
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f MB  %s\\n", $6/1024, $0}\' | sort -nr | grep "action_execution_server" | grep "/openhands/poetry" | grep -v grep | awk \'{print $1}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'Action execution server memory usage (iteration {i}): {mem_obs.content.strip()} MB'
+            )
+            try:
+                iteration_stats['action_server_memory_mb'] = float(
+                    mem_obs.content.strip()
+                )
+            except (ValueError, AttributeError):
+                iteration_stats['action_server_memory_mb'] = None
+
+            # Test soft timeout
+            action = CmdRunAction(
+                'read -p "Do you want to continue? [Y/n] " answer; if [[ $answer == "Y" ]]; then echo "Proceeding with operation..."; echo "Operation completed successfully!"; else echo "Operation cancelled."; exit 1; fi'
+            )
+            obs = runtime.run_action(action)
+            assert 'Do you want to continue?' in obs.content
+            assert obs.exit_code == -1  # Command is still running, waiting for input
+
+            # Send the confirmation
+            action = CmdRunAction('Y', is_input=True)
+            obs = runtime.run_action(action)
+            assert 'Proceeding with operation...' in obs.content
+            assert 'Operation completed successfully!' in obs.content
+            assert obs.exit_code == 0
+            assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+
+            # Test hard timeout w/ long output
+            # Generate long output with 1000 asterisks per line
+            action = CmdRunAction(
+                f'export i={i}; for j in $(seq 1 100); do echo "Line $j - Iteration $i - $(printf \'%1000s\' | tr " " "*")"; sleep 1; done'
+            )
+            action.set_hard_timeout(2)
+            obs = runtime.run_action(action)
+
+            # Verify the output
+            assert obs.exit_code == -1
+            assert f'Line 1 - Iteration {i}' in obs.content
+
+            # Because the hard timeout was triggered, the terminal is left in a
+            # state where it will not accept any new commands.
+            obs = runtime.run_action(CmdRunAction('ls'))
+            assert obs.exit_code == -1
+            assert 'The previous command is still running' in obs.metadata.suffix
+
+            # We need to send a Ctrl+C to reset the terminal.
+            obs = runtime.run_action(CmdRunAction('C-c', is_input=True))
+            assert obs.exit_code == 130
+
+            # Now make sure the terminal is in a good state
+            obs = runtime.run_action(CmdRunAction('ls'))
+            assert obs.exit_code == 0
+
+            duration = time.time() - start_time
+            iteration_stats['duration'] = duration
+            logger.info(f'Completed iteration {i} in {duration:.2f} seconds')
+            # Emit the collected numbers so they are available when debugging
+            # memory growth across iterations.
+            logger.info(f'Iteration {i} stats: {iteration_stats}')
+
+    finally:
+        runtime.close()
+
+
+@pytest.mark.skipif(
+    TEST_IN_CI,
+    reason='This test should only be run locally, not in CI.',
+)
+def test_stress_runtime_memory_limits():
+    """Test runtime behavior under resource constraints.
+
+    The runtime is started with RUNTIME_MAX_MEMORY_GB=3 and
+    RUNTIME_MEMORY_MONITOR=true (plus a 4G container limit when the configured
+    runtime is Docker). A `stress-ng` worker asking for 6G is then expected to
+    abort with exit code 3 and an out-of-resources message.
+    """
+    config = get_config()
+
+    # For Docker runtime, add resource constraints
+    if config.runtime == 'docker':
+        config.sandbox.docker_runtime_kwargs = {
+            'cpu_period': 100000,  # 100ms
+            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
+            'mem_limit': '4G',  # 4 GB of memory
+            'memswap_limit': '0',  # No swap
+            'mem_swappiness': 0,  # Disable swapping
+            'oom_kill_disable': False,  # Enable OOM killer
+        }
+    config.sandbox.runtime_startup_env_vars = {
+        'RUNTIME_MAX_MEMORY_GB': '3',
+        'RUNTIME_MEMORY_MONITOR': 'true',
+    }
+
+    # Create the runtime before entering `try`: if creation itself fails there is
+    # nothing to close, and `finally` must not trip over an unbound `runtime`.
+    runtime = create_runtime(config, headless_mode=True)
+    try:
+        call_async_from_sync(runtime.connect)
+
+        # Install stress-ng
+        action = CmdRunAction(
+            command='sudo apt-get update && sudo apt-get install -y stress-ng'
+        )
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+        # Ask for more memory (6G) than the configured limits allow.
+        action = CmdRunAction(
+            command='stress-ng --vm 1 --vm-bytes 6G --timeout 1m --metrics'
+        )
+        action.set_hard_timeout(120)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'aborted early, out of system resources' in obs.content
+        assert obs.exit_code == 3  # OOM killed!
+
+    finally:
+        runtime.close()
+
+
+@pytest.mark.skipif(
+    TEST_IN_CI,
+    reason='This test should only be run locally, not in CI.',
+)
+def test_stress_runtime_memory_limits_with_repeated_file_edit():
+    """Test runtime behavior under resource constraints with repeated file edits.
+
+    Writes a 1000-line file (`content_000` .. `content_999`) and then issues
+    1000 `str_replace` edits, one per line, asserting after each that the edit
+    was reported as applied. Finally views the file and checks the last line
+    was rewritten.
+    """
+    config = get_config()
+
+    # For Docker runtime, add resource constraints
+    if config.runtime == 'docker':
+        config.sandbox.docker_runtime_kwargs = {
+            'cpu_period': 100000,  # 100ms
+            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
+            'mem_limit': '4G',  # 4 GB of memory
+            'memswap_limit': '0',  # No swap
+            'mem_swappiness': 0,  # Disable swapping
+            'oom_kill_disable': False,  # Enable OOM killer
+        }
+    config.sandbox.runtime_startup_env_vars = {
+        'RUNTIME_MAX_MEMORY_GB': '3',
+        'RUNTIME_MEMORY_MONITOR': 'true',
+    }
+
+    # Create the runtime before entering `try`: if creation itself fails there is
+    # nothing to close, and `finally` must not trip over an unbound `runtime`.
+    runtime = create_runtime(config, headless_mode=True)
+    try:
+        call_async_from_sync(runtime.connect)
+
+        # Create initial test file with base content; every line is distinct so
+        # each `str_replace` below targets exactly one `content_NNN` token.
+        test_file = '/tmp/test_file.txt'
+        base_content = ''.join(f'content_{i:03d}\n' for i in range(1000))
+
+        # Use FileWriteAction to create initial file
+        write_action = FileWriteAction(path=test_file, content=base_content)
+        runtime.run_action(write_action)
+
+        # Perform repeated file edits
+        for i in range(1000):
+            # Use FileEditAction with str_replace instead of IPythonRunCellAction
+            edit_action = FileEditAction(
+                command='str_replace',
+                path=test_file,
+                old_str=f'content_{i:03d}',
+                new_str=f'-content_{i:03d}',
+            )
+            obs = runtime.run_action(edit_action)
+            assert (
+                f'The file {test_file} has been edited' in obs.content
+            ), f'Edit failed at iteration {i}'
+            logger.info(f'finished iteration {i}')
+
+        # Verify final file state using FileEditAction view command
+        action = FileEditAction(command='view', path=test_file)
+        obs = runtime.run_action(action)
+        assert '-content_999' in obs.content, 'Final content verification failed'
+        logger.info('Final file content verified successfully')
+
+    finally:
+        runtime.close()
				`@@ -1 +0,0 @@`
				{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, "psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, "django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, 
"matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, "scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, "sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, 
"sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, "sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2}