fix(RemoteRuntime): add retry for pod status after /start (#4825)

2025-12-26 05:48:36 +08:00 · 2024-11-07 10:22:47 -06:00 · 2024-11-07 10:22:47 -06:00 · f9fa1d95cb
commit f9fa1d95cb
parent 5615d54f81
1 changed files with 10 additions and 3 deletions
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@ -90,9 +90,8 @@ class RemoteRuntime(Runtime):
        self.runtime_url: str | None = None

    async def connect(self):
-        await call_sync_from_async(self._start_or_attach_to_runtime)
        try:
-            await call_sync_from_async(self._wait_until_alive)
+            await call_sync_from_async(self._start_or_attach_to_runtime)
        except RuntimeNotReadyError:
            self.log('error', 'Runtime failed to start, timed out before ready')
            raise
@ -277,6 +276,14 @@ class RemoteRuntime(Runtime):
        assert runtime_data['runtime_id'] == self.runtime_id
        assert 'pod_status' in runtime_data
        pod_status = runtime_data['pod_status']
+
+        # FIXME: We should fix it at the backend of /start endpoint, make sure
+        # the pod is created before returning the response.
+        # Retry a period of time to give the cluster time to start the pod
+        if pod_status == 'Not Found':
+            raise RuntimeNotReadyError(
+                f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
+            )
        if pod_status == 'Ready':
            try:
                self._send_request(
@ -291,7 +298,7 @@ class RemoteRuntime(Runtime):
                    f'Runtime /alive failed to respond with 200: {e}'
                )
            return
-        if pod_status in ('Failed', 'Unknown', 'Not Found'):
+        if pod_status in ('Failed', 'Unknown'):
            # clean up the runtime
            self.close()
            raise RuntimeError(