[Runtime] Audit HTTP Retry timeouts (#4282)

This commit is contained in:
tofarr 2024-10-08 19:31:25 -06:00 committed by GitHub
parent be5f9772e6
commit 5097c4fe71
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 41 additions and 26 deletions

View File

@ -7,7 +7,7 @@ import requests
from openhands.core.logger import openhands_logger as logger
from openhands.runtime.builder import RuntimeBuilder
from openhands.runtime.utils.request import send_request
from openhands.runtime.utils.request import send_request_with_retry
from openhands.runtime.utils.shutdown_listener import (
should_continue,
sleep_if_should_continue,
@ -44,9 +44,13 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
for tag in tags[1:]:
files.append(('tags', (None, tag)))
# Send the POST request to /build
response = send_request(
self.session, 'POST', f'{self.api_url}/build', files=files
# Send the POST request to /build (Begins the build process)
response = send_request_with_retry(
self.session,
'POST',
f'{self.api_url}/build',
files=files,
timeout=30,
)
if response.status_code != 202:
@ -65,11 +69,12 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
logger.error('Build timed out after 30 minutes')
raise RuntimeError('Build timed out after 30 minutes')
status_response = send_request(
status_response = send_request_with_retry(
self.session,
'GET',
f'{self.api_url}/build_status',
params={'build_id': build_id},
timeout=30,
)
if status_response.status_code != 200:
@ -106,8 +111,12 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
"""Checks if an image exists in the remote registry using the /image_exists endpoint."""
params = {'image': image_name}
response = send_request(
self.session, 'GET', f'{self.api_url}/image_exists', params=params
response = send_request_with_retry(
self.session,
'GET',
f'{self.api_url}/image_exists',
params=params,
timeout=30,
)
if response.status_code != 200:

View File

@ -35,9 +35,7 @@ from openhands.runtime.builder import DockerRuntimeBuilder
from openhands.runtime.plugins import PluginRequirement
from openhands.runtime.runtime import Runtime
from openhands.runtime.utils import find_available_tcp_port
from openhands.runtime.utils.request import (
send_request,
)
from openhands.runtime.utils.request import send_request_with_retry
from openhands.runtime.utils.runtime_build import build_runtime_image
from openhands.utils.tenacity_stop import stop_if_should_exit
@ -336,11 +334,12 @@ class EventStreamRuntime(Runtime):
if not (self.log_buffer and self.log_buffer.client_ready):
raise RuntimeError('Runtime client is not ready.')
response = send_request(
response = send_request_with_retry(
self.session,
'GET',
f'{self.api_url}/alive',
retry_exceptions=[ConnectionRefusedError],
timeout=300, # 5 minutes gives the container time to be alive 🧟‍♂️
)
if response.status_code == 200:
return
@ -424,7 +423,7 @@ class EventStreamRuntime(Runtime):
assert action.timeout is not None
try:
response = send_request(
response = send_request_with_retry(
self.session,
'POST',
f'{self.api_url}/execute_action',
@ -505,12 +504,13 @@ class EventStreamRuntime(Runtime):
params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
response = send_request(
response = send_request_with_retry(
self.session,
'POST',
f'{self.api_url}/upload_file',
files=upload_data,
params=params,
timeout=300,
)
if response.status_code == 200:
return
@ -539,11 +539,12 @@ class EventStreamRuntime(Runtime):
if path is not None:
data['path'] = path
response = send_request(
response = send_request_with_retry(
self.session,
'POST',
f'{self.api_url}/list_files',
json=data,
timeout=30, # 30 seconds because the container should already be alive
)
if response.status_code == 200:
response_json = response.json()

View File

@ -39,7 +39,7 @@ from openhands.runtime.runtime import Runtime
from openhands.runtime.utils.request import (
DEFAULT_RETRY_EXCEPTIONS,
is_404_error,
send_request,
send_request_with_retry,
)
from openhands.runtime.utils.runtime_build import build_runtime_image
from openhands.utils.tenacity_stop import stop_if_should_exit
@ -97,10 +97,11 @@ class RemoteRuntime(Runtime):
f'Building remote runtime with base image: {self.config.sandbox.base_container_image}'
)
logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}')
response = send_request(
response = send_request_with_retry(
self.session,
'GET',
f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
timeout=30,
)
response_json = response.json()
registry_prefix = response_json['registry_prefix']
@ -125,11 +126,12 @@ class RemoteRuntime(Runtime):
force_rebuild=self.config.sandbox.force_rebuild_runtime,
)
response = send_request(
response = send_request_with_retry(
self.session,
'GET',
f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
params={'image': self.container_image},
timeout=30,
)
if response.status_code != 200 or not response.json()['exists']:
raise RuntimeError(
@ -164,11 +166,12 @@ class RemoteRuntime(Runtime):
self.send_status_message('STATUS$WAITING_FOR_CLIENT')
# Start the sandbox using the /start endpoint
response = send_request(
response = send_request_with_retry(
self.session,
'POST',
f'{self.config.sandbox.remote_runtime_api_url}/start',
json=start_request,
timeout=300,
)
if response.status_code != 201:
raise RuntimeError(f'Failed to start sandbox: {response.text}')
@ -216,7 +219,7 @@ class RemoteRuntime(Runtime):
)
def _wait_until_alive(self):
logger.info(f'Waiting for runtime to be alive at url: {self.runtime_url}')
response = send_request(
response = send_request_with_retry(
self.session,
'GET',
f'{self.runtime_url}/alive',
@ -235,7 +238,7 @@ class RemoteRuntime(Runtime):
def close(self, timeout: int = 10):
if self.runtime_id:
try:
response = send_request(
response = send_request_with_retry(
self.session,
'POST',
f'{self.config.sandbox.remote_runtime_api_url}/stop',
@ -273,7 +276,7 @@ class RemoteRuntime(Runtime):
logger.info('Executing action')
request_body = {'action': event_to_dict(action)}
logger.debug(f'Request body: {request_body}')
response = send_request(
response = send_request_with_retry(
self.session,
'POST',
f'{self.runtime_url}/execute_action',
@ -351,7 +354,7 @@ class RemoteRuntime(Runtime):
params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
response = send_request(
response = send_request_with_retry(
self.session,
'POST',
f'{self.runtime_url}/upload_file',
@ -360,6 +363,7 @@ class RemoteRuntime(Runtime):
retry_exceptions=list(
filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS)
),
timeout=300,
)
if response.status_code == 200:
logger.info(
@ -385,7 +389,7 @@ class RemoteRuntime(Runtime):
if path is not None:
data['path'] = path
response = send_request(
response = send_request_with_retry(
self.session,
'POST',
f'{self.runtime_url}/list_files',
@ -393,6 +397,7 @@ class RemoteRuntime(Runtime):
retry_exceptions=list(
filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS)
),
timeout=30, # The runtime sbould already be running here
)
if response.status_code == 200:
response_json = response.json()

View File

@ -33,13 +33,13 @@ DEFAULT_RETRY_EXCEPTIONS = [
]
def send_request(
def send_request_with_retry(
session: requests.Session,
method: str,
url: str,
timeout: int,
retry_exceptions: list[Type[Exception]] | None = None,
retry_fns: list[Callable[[Exception], bool]] | None = None,
timeout: int = 120,
**kwargs: Any,
) -> requests.Response:
exceptions_to_catch = retry_exceptions or DEFAULT_RETRY_EXCEPTIONS
@ -54,7 +54,7 @@ def send_request(
@retry(
stop=stop_after_delay(timeout) | stop_if_should_exit(),
wait=wait_exponential(multiplier=1, min=4, max=60),
wait=wait_exponential(multiplier=1, min=4, max=20),
retry=retry_condition,
reraise=True,
)