From 12dd3352c542e26f8df12effbe39915e90f6144c Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Tue, 26 Nov 2024 08:45:49 -0500 Subject: [PATCH] Add remote runtime support to agent_bench (#5280) Co-authored-by: openhands Co-authored-by: Engel Nyst --- evaluation/benchmarks/agent_bench/README.md | 18 ++++++++++++++++++ evaluation/benchmarks/agent_bench/run_infer.py | 8 ++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/evaluation/benchmarks/agent_bench/README.md b/evaluation/benchmarks/agent_bench/README.md index e8a1e3dc95..ea7da04e9f 100644 --- a/evaluation/benchmarks/agent_bench/README.md +++ b/evaluation/benchmarks/agent_bench/README.md @@ -36,3 +36,21 @@ You can update the arguments in the script `evaluation/benchmarks/agent_bench/sc ```bash ./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 ``` + +## Run with Remote Runtime (experimental) + +You can run the evaluation using a remote runtime instead of a local Docker container. This is useful when you want to run the evaluation in a cloud environment or when you don't have Docker installed locally. + +To use the remote runtime, set the following environment variables: + +```bash +# Required environment variables +export ALLHANDS_API_KEY="your-api-key" # Contact the team to get an API key +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" + +# Run the evaluation +./evaluation/benchmarks/agent_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 1 +``` + +The remote runtime will build a container image and run the evaluation in a cloud environment. The results will be saved locally in the same way as when running with a local runtime. diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py index 693718357a..2fb7213ce8 100644 --- a/evaluation/benchmarks/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -43,12 +43,16 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime=os.environ.get('RUNTIME', 'eventstream'), max_iterations=metadata.max_iterations, sandbox=SandboxConfig( - base_container_image='python:3.12-bookworm', + base_container_image='python:3.12-slim', enable_auto_lint=True, use_host_network=False, + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_runtime_alive=False, + remote_runtime_init_timeout=3600, ), # do not mount workspace workspace_base=None,