diff --git a/containers/app/Dockerfile b/containers/app/Dockerfile index 2ff2acd9c2..c1ca90e8b2 100644 --- a/containers/app/Dockerfile +++ b/containers/app/Dockerfile @@ -37,7 +37,7 @@ ARG OPENHANDS_BUILD_VERSION #re-declare for this section ENV RUN_AS_OPENHANDS=true # A random number--we need this to be different from the user's UID on the host machine ENV OPENHANDS_USER_ID=42420 -ENV SANDBOX_API_HOSTNAME=host.docker.internal +ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal ENV USE_HOST_NETWORK=false ENV WORKSPACE_BASE=/opt/workspace_base ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION diff --git a/evaluation/swe_bench/README.md b/evaluation/swe_bench/README.md index 597c3cf6df..f3c8207a3e 100644 --- a/evaluation/swe_bench/README.md +++ b/evaluation/swe_bench/README.md @@ -69,7 +69,7 @@ This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash # ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] -ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \ +ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test # This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel ``` @@ -163,7 +163,8 @@ This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash # ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] -ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test" +ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ +evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test" # This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel ``` diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index 8372c30ca0..525bc17e97 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -81,6 +81,7 @@ def get_config(instance: pd.Series) -> AppConfig: # large enough timeout, since some testcases take very long to run timeout=1800, api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), ), # do not mount workspace workspace_base=None, diff --git a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh old mode 100644 new mode 100755 index d061e0d73c..77b1b7bdeb --- a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh +++ b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh @@ -2,10 +2,10 @@ # API base URL -BASE_URL="https://api.all-hands.dev/v0" +BASE_URL="https://runtime.eval.all-hands.dev" # Get the 
list of runtimes -response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \ +response=$(curl --silent --location --request GET "${BASE_URL}/list" \ --header "X-API-Key: ${ALLHANDS_API_KEY}") n_runtimes=$(echo $response | jq -r '.total') @@ -16,7 +16,7 @@ runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id') counter=1 for runtime_id in $runtime_ids; do echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}" - curl --silent --location --request POST "${BASE_URL}/runtime/stop" \ + curl --silent --location --request POST "${BASE_URL}/stop" \ --header "X-API-Key: ${ALLHANDS_API_KEY}" \ --header "Content-Type: application/json" \ --data-raw "{\"runtime_id\": \"${runtime_id}\"}" diff --git a/openhands/core/config.py b/openhands/core/config.py index a4ed49a9c1..b5b441c8df 100644 --- a/openhands/core/config.py +++ b/openhands/core/config.py @@ -183,7 +183,8 @@ class SandboxConfig: """Configuration for the sandbox. Attributes: - api_hostname: The hostname for the EventStream Runtime API. + remote_runtime_api_url: The base URL (including scheme) of the Remote Runtime API. + local_runtime_url: The base URL (scheme and host, without port) of the local runtime; the container port is appended to it. You may want to change it to http://host.docker.internal for DIND environments. base_container_image: The base container image from which to build the runtime image. runtime_container_image: The runtime container image to use. user_id: The user ID for the sandbox. @@ -204,7 +205,8 @@ class SandboxConfig: Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples. 
""" - api_hostname: str = 'localhost' + remote_runtime_api_url: str = 'http://localhost:8000' + local_runtime_url: str = 'http://localhost' api_key: str | None = None base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22' # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime runtime_container_image: str | None = None @@ -755,6 +757,18 @@ def get_parser() -> argparse.ArgumentParser: type=str, help='The comma-separated list (in quotes) of IDs of the instances to evaluate', ) + # Map-reduce arguments for evaluation + parser.add_argument( + '--eval-map-reduce-write-inputs', + action='store_true', + help='write inputs to output_dir/mr_inputs', + ) + parser.add_argument( + '--eval-map-reduce-read-input-file', + type=str, + default=None, + help='read input (arguments for process_instance) from this file, run it, and write output to output_dir/mr_outputs', + ) return parser diff --git a/openhands/runtime/client/runtime.py b/openhands/runtime/client/runtime.py index 9d8c9c6f16..7447deb85c 100644 --- a/openhands/runtime/client/runtime.py +++ b/openhands/runtime/client/runtime.py @@ -124,9 +124,7 @@ class EventStreamRuntime(Runtime): self.config = config self._host_port = 30000 # initial dummy value self._container_port = 30001 # initial dummy value - self.api_url = ( - f'http://{self.config.sandbox.api_hostname}:{self._container_port}' - ) + self.api_url = f'{self.config.sandbox.local_runtime_url}:{self._container_port}' self.session = requests.Session() self.instance_id = ( sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4()) @@ -212,7 +210,7 @@ class EventStreamRuntime(Runtime): self._host_port ) # in future this might differ from host port self.api_url = ( - f'http://{self.config.sandbox.api_hostname}:{self._container_port}' + f'{self.config.sandbox.local_runtime_url}:{self._container_port}' ) use_host_network = self.config.sandbox.use_host_network diff --git a/openhands/runtime/remote/runtime.py 
b/openhands/runtime/remote/runtime.py index e1d93a953e..34300fb1fb 100644 --- a/openhands/runtime/remote/runtime.py +++ b/openhands/runtime/remote/runtime.py @@ -57,13 +57,6 @@ class RemoteRuntime(Runtime): env_vars: dict[str, str] | None = None, ): self.config = config - if self.config.sandbox.api_hostname == 'localhost': - self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime' - logger.info( - 'Using localhost as the API hostname is not supported in the RemoteRuntime. Please set a proper hostname.\n' - 'Setting it to default value: api.all-hands.dev/v0/runtime' - ) - self.api_url = f'https://{self.config.sandbox.api_hostname.rstrip("/")}' if self.config.sandbox.api_key is None: raise ValueError( @@ -80,7 +73,7 @@ class RemoteRuntime(Runtime): ) self.runtime_builder = RemoteRuntimeBuilder( - self.api_url, self.config.sandbox.api_key + self.config.sandbox.remote_runtime_api_url, self.config.sandbox.api_key ) self.runtime_id: str | None = None self.runtime_url: str | None = None @@ -95,7 +88,11 @@ class RemoteRuntime(Runtime): self.container_image: str = self.config.sandbox.base_container_image self.container_name = 'oh-remote-runtime-' + self.instance_id logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}') - response = send_request(self.session, 'GET', f'{self.api_url}/registry_prefix') + response = send_request( + self.session, + 'GET', + f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix', + ) response_json = response.json() registry_prefix = response_json['registry_prefix'] os.environ['OH_RUNTIME_RUNTIME_IMAGE_REPO'] = ( @@ -121,7 +118,7 @@ class RemoteRuntime(Runtime): response = send_request( self.session, 'GET', - f'{self.api_url}/image_exists', + f'{self.config.sandbox.remote_runtime_api_url}/image_exists', params={'image': self.container_image}, ) if response.status_code != 200 or not response.json()['exists']: @@ -155,7 +152,10 @@ class RemoteRuntime(Runtime): # Start the sandbox using the /start endpoint response = 
send_request( - self.session, 'POST', f'{self.api_url}/start', json=start_request + self.session, + 'POST', + f'{self.config.sandbox.remote_runtime_api_url}/start', + json=start_request, ) if response.status_code != 201: raise RuntimeError(f'Failed to start sandbox: {response.text}') @@ -211,7 +211,7 @@ class RemoteRuntime(Runtime): response = send_request( self.session, 'POST', - f'{self.api_url}/stop', + f'{self.config.sandbox.remote_runtime_api_url}/stop', json={'runtime_id': self.runtime_id}, ) if response.status_code != 200: diff --git a/poetry.lock b/poetry.lock index bc0a0d85f3..ea0b06aee1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3761,13 +3761,13 @@ types-tqdm = "*" [[package]] name = "litellm" -version = "1.46.1" +version = "1.48.6" description = "Library to easily interface with LLM API providers" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" files = [ - {file = "litellm-1.46.1-py3-none-any.whl", hash = "sha256:f6b78278cf21a38da0d10a8b3e7b1084b6410012552c0a413774d1c43706e5ba"}, - {file = "litellm-1.46.1.tar.gz", hash = "sha256:993c23d6f5e1d0f070b250d858a6ee87750a032e38f460f8c82385be854bc45f"}, + {file = "litellm-1.48.6-py3-none-any.whl", hash = "sha256:7f6e0f787790d29c4464123bae92712ceb2dd1e05eef1ea90182663c4e4762a3"}, + {file = "litellm-1.48.6.tar.gz", hash = "sha256:44584867d115ba0c1bb5f39efbc8a6131642e63d078e6a9cf2e7abe969d5edf6"}, ] [package.dependencies] @@ -9675,4 +9675,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "5acb0e1ac5538c10add8f72b0f5c2762bea1a08cce7548deccd263934f043cfb" +content-hash = "96a302abea5291a44d97c2e4c813a8db2e6f3b1327b1c4f7dbf6d00eb8e19560" diff --git a/pyproject.toml b/pyproject.toml index a3d94c1404..a77d2ccd28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ packages = [ python = "^3.11" datasets = "*" pandas = "*" -litellm = "*" +litellm = 
"^1.48.6" google-generativeai = "*" # To use litellm with Gemini Pro API termcolor = "*" seaborn = "*"