From 12dd3352c542e26f8df12effbe39915e90f6144c Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 26 Nov 2024 08:45:49 -0500
Subject: [PATCH] Add remote runtime support to agent_bench (#5280)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
---
 evaluation/benchmarks/agent_bench/README.md    | 18 ++++++++++++++++++
 evaluation/benchmarks/agent_bench/run_infer.py |  8 ++++++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/evaluation/benchmarks/agent_bench/README.md b/evaluation/benchmarks/agent_bench/README.md
index e8a1e3dc95..ea7da04e9f 100644
--- a/evaluation/benchmarks/agent_bench/README.md
+++ b/evaluation/benchmarks/agent_bench/README.md
@@ -36,3 +36,21 @@ You can update the arguments in the script `evaluation/benchmarks/agent_bench/sc
 ```bash
 ./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
 ```
+
+## Run with Remote Runtime (experimental)
+
+You can run the evaluation using a remote runtime instead of a local Docker container. This is useful when you want to run the evaluation in a cloud environment or when you don't have Docker installed locally.
+
+To use the remote runtime, set the following environment variables and then run the evaluation script:
+
+```bash
+# Required environment variables
+export ALLHANDS_API_KEY="your-api-key"  # Contact the team to get an API key
+export RUNTIME=remote  # Overrides the default 'eventstream' runtime
+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
+
+# Run the evaluation
+./evaluation/benchmarks/agent_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 1
+```
+
+The remote runtime builds a container image and runs the evaluation in a cloud environment. Results are saved locally, exactly as they are when running with a local runtime.
diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
index 693718357a..2fb7213ce8 100644
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -43,12 +43,16 @@ def get_config(
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
         max_iterations=metadata.max_iterations,
         sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
+            base_container_image='python:3.12-slim',
             enable_auto_lint=True,
             use_host_network=False,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=3600,
         ),
         # do not mount workspace
         workspace_base=None,