From 5097c4fe71013d03174d3175a9c23e8cfa69f31c Mon Sep 17 00:00:00 2001 From: tofarr Date: Tue, 8 Oct 2024 19:31:25 -0600 Subject: [PATCH] [Runtime] Audit HTTP Retry timeouts (#4282) --- openhands/runtime/builder/remote.py | 23 ++++++++++++++++------- openhands/runtime/client/runtime.py | 15 ++++++++------- openhands/runtime/remote/runtime.py | 23 ++++++++++++++--------- openhands/runtime/utils/request.py | 6 +++--- 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/openhands/runtime/builder/remote.py b/openhands/runtime/builder/remote.py index 801496e8ff..0057b1a2e6 100644 --- a/openhands/runtime/builder/remote.py +++ b/openhands/runtime/builder/remote.py @@ -7,7 +7,7 @@ import requests from openhands.core.logger import openhands_logger as logger from openhands.runtime.builder import RuntimeBuilder -from openhands.runtime.utils.request import send_request +from openhands.runtime.utils.request import send_request_with_retry from openhands.runtime.utils.shutdown_listener import ( should_continue, sleep_if_should_continue, @@ -44,9 +44,13 @@ class RemoteRuntimeBuilder(RuntimeBuilder): for tag in tags[1:]: files.append(('tags', (None, tag))) - # Send the POST request to /build - response = send_request( - self.session, 'POST', f'{self.api_url}/build', files=files + # Send the POST request to /build (Begins the build process) + response = send_request_with_retry( + self.session, + 'POST', + f'{self.api_url}/build', + files=files, + timeout=30, ) if response.status_code != 202: @@ -65,11 +69,12 @@ class RemoteRuntimeBuilder(RuntimeBuilder): logger.error('Build timed out after 30 minutes') raise RuntimeError('Build timed out after 30 minutes') - status_response = send_request( + status_response = send_request_with_retry( self.session, 'GET', f'{self.api_url}/build_status', params={'build_id': build_id}, + timeout=30, ) if status_response.status_code != 200: @@ -106,8 +111,12 @@ class RemoteRuntimeBuilder(RuntimeBuilder): def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool: """Checks if an image exists in the remote registry using the /image_exists endpoint.""" params = {'image': image_name} - response = send_request( - self.session, 'GET', f'{self.api_url}/image_exists', params=params + response = send_request_with_retry( + self.session, + 'GET', + f'{self.api_url}/image_exists', + params=params, + timeout=30, ) if response.status_code != 200: diff --git a/openhands/runtime/client/runtime.py b/openhands/runtime/client/runtime.py index cb2a9ce025..8c46b01803 100644 --- a/openhands/runtime/client/runtime.py +++ b/openhands/runtime/client/runtime.py @@ -35,9 +35,7 @@ from openhands.runtime.builder import DockerRuntimeBuilder from openhands.runtime.plugins import PluginRequirement from openhands.runtime.runtime import Runtime from openhands.runtime.utils import find_available_tcp_port -from openhands.runtime.utils.request import ( - send_request, -) +from openhands.runtime.utils.request import send_request_with_retry from openhands.runtime.utils.runtime_build import build_runtime_image from openhands.utils.tenacity_stop import stop_if_should_exit @@ -336,11 +334,12 @@ class EventStreamRuntime(Runtime): if not (self.log_buffer and self.log_buffer.client_ready): raise RuntimeError('Runtime client is not ready.') - response = send_request( + response = send_request_with_retry( self.session, 'GET', f'{self.api_url}/alive', retry_exceptions=[ConnectionRefusedError], + timeout=300, # 5 minutes gives the container time to be alive 🧟‍♂️ ) if response.status_code == 200: return @@ -424,7 +423,7 @@ class EventStreamRuntime(Runtime): assert action.timeout is not None try: - response = send_request( + response = send_request_with_retry( self.session, 'POST', f'{self.api_url}/execute_action', @@ -505,12 +504,13 @@ class EventStreamRuntime(Runtime): params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()} - response = send_request( + response = send_request_with_retry( self.session, 'POST', f'{self.api_url}/upload_file', files=upload_data, params=params, + timeout=300, ) if response.status_code == 200: return @@ -539,11 +539,12 @@ class EventStreamRuntime(Runtime): if path is not None: data['path'] = path - response = send_request( + response = send_request_with_retry( self.session, 'POST', f'{self.api_url}/list_files', json=data, + timeout=30, # 30 seconds because the container should already be alive ) if response.status_code == 200: response_json = response.json() diff --git a/openhands/runtime/remote/runtime.py b/openhands/runtime/remote/runtime.py index 1104270b39..072968e41c 100644 --- a/openhands/runtime/remote/runtime.py +++ b/openhands/runtime/remote/runtime.py @@ -39,7 +39,7 @@ from openhands.runtime.runtime import Runtime from openhands.runtime.utils.request import ( DEFAULT_RETRY_EXCEPTIONS, is_404_error, - send_request, + send_request_with_retry, ) from openhands.runtime.utils.runtime_build import build_runtime_image from openhands.utils.tenacity_stop import stop_if_should_exit @@ -97,10 +97,11 @@ class RemoteRuntime(Runtime): f'Building remote runtime with base image: {self.config.sandbox.base_container_image}' ) logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}') - response = send_request( + response = send_request_with_retry( self.session, 'GET', f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix', + timeout=30, ) response_json = response.json() registry_prefix = response_json['registry_prefix'] @@ -125,11 +126,12 @@ class RemoteRuntime(Runtime): force_rebuild=self.config.sandbox.force_rebuild_runtime, ) - response = send_request( + response = send_request_with_retry( self.session, 'GET', f'{self.config.sandbox.remote_runtime_api_url}/image_exists', params={'image': self.container_image}, + timeout=30, ) if response.status_code != 200 or not response.json()['exists']: raise RuntimeError( @@ -164,11 +166,12 @@ class RemoteRuntime(Runtime): self.send_status_message('STATUS$WAITING_FOR_CLIENT') # Start the sandbox using the /start endpoint - response = send_request( + response = send_request_with_retry( self.session, 'POST', f'{self.config.sandbox.remote_runtime_api_url}/start', json=start_request, + timeout=300, ) if response.status_code != 201: raise RuntimeError(f'Failed to start sandbox: {response.text}') @@ -216,7 +219,7 @@ class RemoteRuntime(Runtime): ) def _wait_until_alive(self): logger.info(f'Waiting for runtime to be alive at url: {self.runtime_url}') - response = send_request( + response = send_request_with_retry( self.session, 'GET', f'{self.runtime_url}/alive', @@ -235,7 +238,7 @@ class RemoteRuntime(Runtime): def close(self, timeout: int = 10): if self.runtime_id: try: - response = send_request( + response = send_request_with_retry( self.session, 'POST', f'{self.config.sandbox.remote_runtime_api_url}/stop', @@ -273,7 +276,7 @@ class RemoteRuntime(Runtime): logger.info('Executing action') request_body = {'action': event_to_dict(action)} logger.debug(f'Request body: {request_body}') - response = send_request( + response = send_request_with_retry( self.session, 'POST', f'{self.runtime_url}/execute_action', @@ -351,7 +354,7 @@ class RemoteRuntime(Runtime): params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()} - response = send_request( + response = send_request_with_retry( self.session, 'POST', f'{self.runtime_url}/upload_file', @@ -360,6 +363,7 @@ class RemoteRuntime(Runtime): retry_exceptions=list( filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS) ), + timeout=300, ) if response.status_code == 200: logger.info( @@ -385,7 +389,7 @@ class RemoteRuntime(Runtime): if path is not None: data['path'] = path - response = send_request( + response = send_request_with_retry( self.session, 'POST', f'{self.runtime_url}/list_files', @@ -393,6 +397,7 @@ class RemoteRuntime(Runtime): retry_exceptions=list( filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS) ), + timeout=30, # The runtime sbould already be running here ) if response.status_code == 200: response_json = response.json() diff --git a/openhands/runtime/utils/request.py b/openhands/runtime/utils/request.py index 7d68df8d8e..63789824a0 100644 --- a/openhands/runtime/utils/request.py +++ b/openhands/runtime/utils/request.py @@ -33,13 +33,13 @@ DEFAULT_RETRY_EXCEPTIONS = [ ] -def send_request( +def send_request_with_retry( session: requests.Session, method: str, url: str, + timeout: int, retry_exceptions: list[Type[Exception]] | None = None, retry_fns: list[Callable[[Exception], bool]] | None = None, - timeout: int = 120, **kwargs: Any, ) -> requests.Response: exceptions_to_catch = retry_exceptions or DEFAULT_RETRY_EXCEPTIONS @@ -54,7 +54,7 @@ def send_request( @retry( stop=stop_after_delay(timeout) | stop_if_should_exit(), - wait=wait_exponential(multiplier=1, min=4, max=60), + wait=wait_exponential(multiplier=1, min=4, max=20), retry=retry_condition, reraise=True, )