mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
[Runtime] Audit HTTP Retry timeouts (#4282)
This commit is contained in:
parent
be5f9772e6
commit
5097c4fe71
@ -7,7 +7,7 @@ import requests
|
||||
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.runtime.builder import RuntimeBuilder
|
||||
from openhands.runtime.utils.request import send_request
|
||||
from openhands.runtime.utils.request import send_request_with_retry
|
||||
from openhands.runtime.utils.shutdown_listener import (
|
||||
should_continue,
|
||||
sleep_if_should_continue,
|
||||
@ -44,9 +44,13 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
|
||||
for tag in tags[1:]:
|
||||
files.append(('tags', (None, tag)))
|
||||
|
||||
# Send the POST request to /build
|
||||
response = send_request(
|
||||
self.session, 'POST', f'{self.api_url}/build', files=files
|
||||
# Send the POST request to /build (Begins the build process)
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.api_url}/build',
|
||||
files=files,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
if response.status_code != 202:
|
||||
@ -65,11 +69,12 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
|
||||
logger.error('Build timed out after 30 minutes')
|
||||
raise RuntimeError('Build timed out after 30 minutes')
|
||||
|
||||
status_response = send_request(
|
||||
status_response = send_request_with_retry(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.api_url}/build_status',
|
||||
params={'build_id': build_id},
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
if status_response.status_code != 200:
|
||||
@ -106,8 +111,12 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
|
||||
def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
|
||||
"""Checks if an image exists in the remote registry using the /image_exists endpoint."""
|
||||
params = {'image': image_name}
|
||||
response = send_request(
|
||||
self.session, 'GET', f'{self.api_url}/image_exists', params=params
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.api_url}/image_exists',
|
||||
params=params,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
|
||||
@ -35,9 +35,7 @@ from openhands.runtime.builder import DockerRuntimeBuilder
|
||||
from openhands.runtime.plugins import PluginRequirement
|
||||
from openhands.runtime.runtime import Runtime
|
||||
from openhands.runtime.utils import find_available_tcp_port
|
||||
from openhands.runtime.utils.request import (
|
||||
send_request,
|
||||
)
|
||||
from openhands.runtime.utils.request import send_request_with_retry
|
||||
from openhands.runtime.utils.runtime_build import build_runtime_image
|
||||
from openhands.utils.tenacity_stop import stop_if_should_exit
|
||||
|
||||
@ -336,11 +334,12 @@ class EventStreamRuntime(Runtime):
|
||||
if not (self.log_buffer and self.log_buffer.client_ready):
|
||||
raise RuntimeError('Runtime client is not ready.')
|
||||
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.api_url}/alive',
|
||||
retry_exceptions=[ConnectionRefusedError],
|
||||
timeout=300, # 5 minutes gives the container time to be alive 🧟♂️
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return
|
||||
@ -424,7 +423,7 @@ class EventStreamRuntime(Runtime):
|
||||
assert action.timeout is not None
|
||||
|
||||
try:
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.api_url}/execute_action',
|
||||
@ -505,12 +504,13 @@ class EventStreamRuntime(Runtime):
|
||||
|
||||
params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
|
||||
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.api_url}/upload_file',
|
||||
files=upload_data,
|
||||
params=params,
|
||||
timeout=300,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return
|
||||
@ -539,11 +539,12 @@ class EventStreamRuntime(Runtime):
|
||||
if path is not None:
|
||||
data['path'] = path
|
||||
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.api_url}/list_files',
|
||||
json=data,
|
||||
timeout=30, # 30 seconds because the container should already be alive
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response_json = response.json()
|
||||
|
||||
@ -39,7 +39,7 @@ from openhands.runtime.runtime import Runtime
|
||||
from openhands.runtime.utils.request import (
|
||||
DEFAULT_RETRY_EXCEPTIONS,
|
||||
is_404_error,
|
||||
send_request,
|
||||
send_request_with_retry,
|
||||
)
|
||||
from openhands.runtime.utils.runtime_build import build_runtime_image
|
||||
from openhands.utils.tenacity_stop import stop_if_should_exit
|
||||
@ -97,10 +97,11 @@ class RemoteRuntime(Runtime):
|
||||
f'Building remote runtime with base image: {self.config.sandbox.base_container_image}'
|
||||
)
|
||||
logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}')
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
|
||||
timeout=30,
|
||||
)
|
||||
response_json = response.json()
|
||||
registry_prefix = response_json['registry_prefix']
|
||||
@ -125,11 +126,12 @@ class RemoteRuntime(Runtime):
|
||||
force_rebuild=self.config.sandbox.force_rebuild_runtime,
|
||||
)
|
||||
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
|
||||
params={'image': self.container_image},
|
||||
timeout=30,
|
||||
)
|
||||
if response.status_code != 200 or not response.json()['exists']:
|
||||
raise RuntimeError(
|
||||
@ -164,11 +166,12 @@ class RemoteRuntime(Runtime):
|
||||
|
||||
self.send_status_message('STATUS$WAITING_FOR_CLIENT')
|
||||
# Start the sandbox using the /start endpoint
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/start',
|
||||
json=start_request,
|
||||
timeout=300,
|
||||
)
|
||||
if response.status_code != 201:
|
||||
raise RuntimeError(f'Failed to start sandbox: {response.text}')
|
||||
@ -216,7 +219,7 @@ class RemoteRuntime(Runtime):
|
||||
)
|
||||
def _wait_until_alive(self):
|
||||
logger.info(f'Waiting for runtime to be alive at url: {self.runtime_url}')
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.runtime_url}/alive',
|
||||
@ -235,7 +238,7 @@ class RemoteRuntime(Runtime):
|
||||
def close(self, timeout: int = 10):
|
||||
if self.runtime_id:
|
||||
try:
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/stop',
|
||||
@ -273,7 +276,7 @@ class RemoteRuntime(Runtime):
|
||||
logger.info('Executing action')
|
||||
request_body = {'action': event_to_dict(action)}
|
||||
logger.debug(f'Request body: {request_body}')
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.runtime_url}/execute_action',
|
||||
@ -351,7 +354,7 @@ class RemoteRuntime(Runtime):
|
||||
|
||||
params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
|
||||
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.runtime_url}/upload_file',
|
||||
@ -360,6 +363,7 @@ class RemoteRuntime(Runtime):
|
||||
retry_exceptions=list(
|
||||
filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS)
|
||||
),
|
||||
timeout=300,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
logger.info(
|
||||
@ -385,7 +389,7 @@ class RemoteRuntime(Runtime):
|
||||
if path is not None:
|
||||
data['path'] = path
|
||||
|
||||
response = send_request(
|
||||
response = send_request_with_retry(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.runtime_url}/list_files',
|
||||
@ -393,6 +397,7 @@ class RemoteRuntime(Runtime):
|
||||
retry_exceptions=list(
|
||||
filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS)
|
||||
),
|
||||
timeout=30, # The runtime sbould already be running here
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response_json = response.json()
|
||||
|
||||
@ -33,13 +33,13 @@ DEFAULT_RETRY_EXCEPTIONS = [
|
||||
]
|
||||
|
||||
|
||||
def send_request(
|
||||
def send_request_with_retry(
|
||||
session: requests.Session,
|
||||
method: str,
|
||||
url: str,
|
||||
timeout: int,
|
||||
retry_exceptions: list[Type[Exception]] | None = None,
|
||||
retry_fns: list[Callable[[Exception], bool]] | None = None,
|
||||
timeout: int = 120,
|
||||
**kwargs: Any,
|
||||
) -> requests.Response:
|
||||
exceptions_to_catch = retry_exceptions or DEFAULT_RETRY_EXCEPTIONS
|
||||
@ -54,7 +54,7 @@ def send_request(
|
||||
|
||||
@retry(
|
||||
stop=stop_after_delay(timeout) | stop_if_should_exit(),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=60),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=20),
|
||||
retry=retry_condition,
|
||||
reraise=True,
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user