fix: improve remote runtime reliability on large-scale evaluation (#4869)

2026-03-22 13:47:19 +08:00 · 2024-11-09 14:17:10 -06:00
parent be82832eb1
commit a07e8272da
4 changed files with 18 additions and 8 deletions
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -83,6 +83,7 @@ def get_config(instance: pd.Series) -> AppConfig:
            timeout=1800,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            remote_runtime_init_timeout=1800,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -146,6 +146,7 @@ def get_config(
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_remote_runtime_alive=False,
+            remote_runtime_init_timeout=1800,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/openhands/core/config/sandbox_config.py
+++ b/openhands/core/config/sandbox_config.py
@@ -14,7 +14,8 @@ class SandboxConfig:
        base_container_image: The base container image from which to build the runtime image.
        runtime_container_image: The runtime container image to use.
        user_id: The user ID for the sandbox.
-        timeout: The timeout for the sandbox.
+        timeout: The timeout for the default sandbox action execution.
+        remote_runtime_init_timeout: The maximum time, in seconds, to wait for the remote runtime to start.
        enable_auto_lint: Whether to enable auto-lint.
        use_host_network: Whether to use the host network.
        initialize_plugins: Whether to initialize plugins.
@@ -41,6 +42,7 @@ class SandboxConfig:
    runtime_container_image: str | None = None
    user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
    timeout: int = 120
+    remote_runtime_init_timeout: int = 180
    enable_auto_lint: bool = (
        False  # once enabled, OpenHands would lint files after editing
    )
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -1,7 +1,7 @@
 import os
-from pathlib import Path
 import tempfile
 import threading
+from pathlib import Path
 from typing import Callable, Optional
 from zipfile import ZipFile

@@ -260,13 +260,19 @@ class RemoteRuntime(Runtime):
                {'X-Session-API-Key': start_response['session_api_key']}
            )

-    @tenacity.retry(
-        stop=tenacity.stop_after_delay(180) | stop_if_should_exit(),
-        reraise=True,
-        retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
-        wait=tenacity.wait_fixed(2),
-    )
    def _wait_until_alive(self):
+        retry_decorator = tenacity.retry(
+            stop=tenacity.stop_after_delay(
+                self.config.sandbox.remote_runtime_init_timeout
+            )
+            | stop_if_should_exit(),
+            reraise=True,
+            retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
+            wait=tenacity.wait_fixed(2),
+        )
+        return retry_decorator(self._wait_until_alive_impl)()
+
+    def _wait_until_alive_impl(self):
        self.log('debug', f'Waiting for runtime to be alive at url: {self.runtime_url}')
        runtime_info_response = self._send_request(
            'GET',