fix(RemoteRuntime): add retry for pod status after /start (#4825)

This commit is contained in:
Xingyao Wang 2024-11-07 10:22:47 -06:00 committed by GitHub
parent 5615d54f81
commit f9fa1d95cb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -90,9 +90,8 @@ class RemoteRuntime(Runtime):
self.runtime_url: str | None = None
async def connect(self):
await call_sync_from_async(self._start_or_attach_to_runtime)
try:
await call_sync_from_async(self._wait_until_alive)
await call_sync_from_async(self._start_or_attach_to_runtime)
except RuntimeNotReadyError:
self.log('error', 'Runtime failed to start, timed out before ready')
raise
@ -277,6 +276,14 @@ class RemoteRuntime(Runtime):
assert runtime_data['runtime_id'] == self.runtime_id
assert 'pod_status' in runtime_data
pod_status = runtime_data['pod_status']
# FIXME: We should fix it at the backend of /start endpoint, make sure
# the pod is created before returning the response.
# Retry a period of time to give the cluster time to start the pod
if pod_status == 'Not Found':
raise RuntimeNotReadyError(
f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
)
if pod_status == 'Ready':
try:
self._send_request(
@ -291,7 +298,7 @@ class RemoteRuntime(Runtime):
f'Runtime /alive failed to respond with 200: {e}'
)
return
if pod_status in ('Failed', 'Unknown', 'Not Found'):
if pod_status in ('Failed', 'Unknown'):
# clean up the runtime
self.close()
raise RuntimeError(