bump to new runtime w/o parallel

2026-03-22 13:47:19 +08:00 · 2024-10-01 17:03:57 +00:00
parent cc03b59238
commit e5c5e1c4e5
9 changed files with 43 additions and 29 deletions
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -37,7 +37,7 @@ ARG OPENHANDS_BUILD_VERSION #re-declare for this section
 ENV RUN_AS_OPENHANDS=true
 # A random number--we need this to be different from the user's UID on the host machine
 ENV OPENHANDS_USER_ID=42420
-ENV SANDBOX_API_HOSTNAME=host.docker.internal
+ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
 ENV USE_HOST_NETWORK=false
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -69,7 +69,7 @@ This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
 # ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
 # This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
 ```
@@ -163,7 +163,8 @@ This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
 # ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
+evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
 # This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
 ```

--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -81,6 +81,7 @@ def get_config(instance: pd.Series) -> AppConfig:
            # large enough timeout, since some testcases take very long to run
            timeout=1800,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+++ b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
@@ -2,10 +2,10 @@


 # API base URL
-BASE_URL="https://api.all-hands.dev/v0"
+BASE_URL="https://runtime.eval.all-hands.dev"

 # Get the list of runtimes
-response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
+response=$(curl --silent --location --request GET "${BASE_URL}/list" \
  --header "X-API-Key: ${ALLHANDS_API_KEY}")

 n_runtimes=$(echo $response | jq -r '.total')
@@ -16,7 +16,7 @@ runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
 counter=1
 for runtime_id in $runtime_ids; do
  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
-  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
+  curl --silent --location --request POST "${BASE_URL}/stop" \
    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
    --header "Content-Type: application/json" \
    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
--- a/openhands/core/config.py
+++ b/openhands/core/config.py
@@ -183,7 +183,8 @@ class SandboxConfig:
    """Configuration for the sandbox.

    Attributes:
-        api_hostname: The hostname for the EventStream Runtime API.
+        remote_runtime_api_url: The base URL (scheme, host and optional port, e.g. http://localhost:8000) of the Remote Runtime API; endpoint paths are appended to it.
+        local_runtime_url: The base URL (scheme and host, without a port) of the local runtime; the runtime's port is appended to it. You may want to change it to http://host.docker.internal for DIND environments.
        base_container_image: The base container image from which to build the runtime image.
        runtime_container_image: The runtime container image to use.
        user_id: The user ID for the sandbox.
@@ -204,7 +205,8 @@ class SandboxConfig:
            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
    """

-    api_hostname: str = 'localhost'
+    remote_runtime_api_url: str = 'http://localhost:8000'
+    local_runtime_url: str = 'http://localhost'
    api_key: str | None = None
    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
    runtime_container_image: str | None = None
@@ -755,6 +757,18 @@ def get_parser() -> argparse.ArgumentParser:
        type=str,
        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
    )
+    # Map-reduce arguments for evaluation
+    parser.add_argument(
+        '--eval-map-reduce-write-inputs',
+        action='store_true',
+        help='write inputs to output_dir/mr_inputs',
+    )
+    parser.add_argument(
+        '--eval-map-reduce-read-input-file',
+        type=str,
+        default=None,
+        help='read input (arguments for process_instance) from this file, run it, and write output to output_dir/mr_outputs',
+    )
    return parser


--- a/openhands/runtime/client/runtime.py
+++ b/openhands/runtime/client/runtime.py
@@ -124,9 +124,7 @@ class EventStreamRuntime(Runtime):
        self.config = config
        self._host_port = 30000  # initial dummy value
        self._container_port = 30001  # initial dummy value
-        self.api_url = (
-            f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
-        )
+        self.api_url = f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
        self.session = requests.Session()
        self.instance_id = (
            sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
@@ -212,7 +210,7 @@ class EventStreamRuntime(Runtime):
                self._host_port
            )  # in future this might differ from host port
            self.api_url = (
-                f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
+                f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
            )

            use_host_network = self.config.sandbox.use_host_network
--- a/openhands/runtime/remote/runtime.py
+++ b/openhands/runtime/remote/runtime.py
@@ -57,13 +57,6 @@ class RemoteRuntime(Runtime):
        env_vars: dict[str, str] | None = None,
    ):
        self.config = config
-        if self.config.sandbox.api_hostname == 'localhost':
-            self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime'
-            logger.info(
-                'Using localhost as the API hostname is not supported in the RemoteRuntime. Please set a proper hostname.\n'
-                'Setting it to default value: api.all-hands.dev/v0/runtime'
-            )
-        self.api_url = f'https://{self.config.sandbox.api_hostname.rstrip("/")}'

        if self.config.sandbox.api_key is None:
            raise ValueError(
@@ -80,7 +73,7 @@ class RemoteRuntime(Runtime):
            )

        self.runtime_builder = RemoteRuntimeBuilder(
-            self.api_url, self.config.sandbox.api_key
+            self.config.sandbox.remote_runtime_api_url, self.config.sandbox.api_key
        )
        self.runtime_id: str | None = None
        self.runtime_url: str | None = None
@@ -95,7 +88,11 @@ class RemoteRuntime(Runtime):
        self.container_image: str = self.config.sandbox.base_container_image
        self.container_name = 'oh-remote-runtime-' + self.instance_id
        logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}')
-        response = send_request(self.session, 'GET', f'{self.api_url}/registry_prefix')
+        response = send_request(
+            self.session,
+            'GET',
+            f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
+        )
        response_json = response.json()
        registry_prefix = response_json['registry_prefix']
        os.environ['OH_RUNTIME_RUNTIME_IMAGE_REPO'] = (
@@ -121,7 +118,7 @@ class RemoteRuntime(Runtime):
        response = send_request(
            self.session,
            'GET',
-            f'{self.api_url}/image_exists',
+            f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
            params={'image': self.container_image},
        )
        if response.status_code != 200 or not response.json()['exists']:
@@ -155,7 +152,10 @@ class RemoteRuntime(Runtime):

        # Start the sandbox using the /start endpoint
        response = send_request(
-            self.session, 'POST', f'{self.api_url}/start', json=start_request
+            self.session,
+            'POST',
+            f'{self.config.sandbox.remote_runtime_api_url}/start',
+            json=start_request,
        )
        if response.status_code != 201:
            raise RuntimeError(f'Failed to start sandbox: {response.text}')
@@ -211,7 +211,7 @@ class RemoteRuntime(Runtime):
                response = send_request(
                    self.session,
                    'POST',
-                    f'{self.api_url}/stop',
+                    f'{self.config.sandbox.remote_runtime_api_url}/stop',
                    json={'runtime_id': self.runtime_id},
                )
                if response.status_code != 200:
--- a/poetry.lock
+++ b/poetry.lock
@@ -3761,13 +3761,13 @@ types-tqdm = "*"

 [[package]]
 name = "litellm"
-version = "1.46.1"
+version = "1.48.6"
 description = "Library to easily interface with LLM API providers"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
 files = [
-    {file = "litellm-1.46.1-py3-none-any.whl", hash = "sha256:f6b78278cf21a38da0d10a8b3e7b1084b6410012552c0a413774d1c43706e5ba"},
-    {file = "litellm-1.46.1.tar.gz", hash = "sha256:993c23d6f5e1d0f070b250d858a6ee87750a032e38f460f8c82385be854bc45f"},
+    {file = "litellm-1.48.6-py3-none-any.whl", hash = "sha256:7f6e0f787790d29c4464123bae92712ceb2dd1e05eef1ea90182663c4e4762a3"},
+    {file = "litellm-1.48.6.tar.gz", hash = "sha256:44584867d115ba0c1bb5f39efbc8a6131642e63d078e6a9cf2e7abe969d5edf6"},
 ]

 [package.dependencies]
@@ -9675,4 +9675,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "5acb0e1ac5538c10add8f72b0f5c2762bea1a08cce7548deccd263934f043cfb"
+content-hash = "96a302abea5291a44d97c2e4c813a8db2e6f3b1327b1c4f7dbf6d00eb8e19560"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ packages = [
 python = "^3.11"
 datasets = "*"
 pandas = "*"
-litellm = "*"
+litellm = "^1.48.6"
 google-generativeai = "*" # To use litellm with Gemini Pro API
 termcolor = "*"
 seaborn = "*"