fix: improve remote runtime reliability on large-scale evaluation (#4869)

This commit is contained in:
Xingyao Wang
2024-11-09 14:17:10 -06:00
committed by GitHub
parent be82832eb1
commit a07e8272da
4 changed files with 18 additions and 8 deletions

View File

@@ -83,6 +83,7 @@ def get_config(instance: pd.Series) -> AppConfig:
timeout=1800,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,

View File

@@ -146,6 +146,7 @@ def get_config(
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_remote_runtime_alive=False,
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,

View File

@@ -14,7 +14,8 @@ class SandboxConfig:
base_container_image: The base container image from which to build the runtime image.
runtime_container_image: The runtime container image to use.
user_id: The user ID for the sandbox.
timeout: The timeout for the sandbox.
timeout: The timeout for the default sandbox action execution.
remote_runtime_init_timeout: The timeout for the remote runtime to start.
enable_auto_lint: Whether to enable auto-lint.
use_host_network: Whether to use the host network.
initialize_plugins: Whether to initialize plugins.
@@ -41,6 +42,7 @@ class SandboxConfig:
runtime_container_image: str | None = None
user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
timeout: int = 120
remote_runtime_init_timeout: int = 180
enable_auto_lint: bool = (
False # once enabled, OpenHands would lint files after editing
)

View File

@@ -1,7 +1,7 @@
import os
from pathlib import Path
import tempfile
import threading
from pathlib import Path
from typing import Callable, Optional
from zipfile import ZipFile
@@ -260,13 +260,19 @@ class RemoteRuntime(Runtime):
{'X-Session-API-Key': start_response['session_api_key']}
)
@tenacity.retry(
stop=tenacity.stop_after_delay(180) | stop_if_should_exit(),
reraise=True,
retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
wait=tenacity.wait_fixed(2),
)
def _wait_until_alive(self):
retry_decorator = tenacity.retry(
stop=tenacity.stop_after_delay(
self.config.sandbox.remote_runtime_init_timeout
)
| stop_if_should_exit(),
reraise=True,
retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
wait=tenacity.wait_fixed(2),
)
return retry_decorator(self._wait_until_alive_impl)()
def _wait_until_alive_impl(self):
self.log('debug', f'Waiting for runtime to be alive at url: {self.runtime_url}')
runtime_info_response = self._send_request(
'GET',