Fix for issue where retries continue on a closed runtime (#6564)

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
2025-12-26 05:48:36 +08:00 · 2025-02-03 08:44:09 -07:00 · 2025-02-03 08:44:09 -07:00 · bbfdc62139
commit bbfdc62139
parent 622fc5213d
24 changed files with 38 additions and 7 deletions
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@ -69,6 +69,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=False,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@ -53,6 +53,7 @@ def get_config(
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_runtime_alive=False,
            remote_runtime_init_timeout=3600,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@ -61,6 +61,7 @@ def get_config(
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_runtime_alive=False,
            remote_runtime_init_timeout=1800,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@ -67,6 +67,7 @@ def get_config(
            base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
            enable_auto_lint=True,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@ -80,6 +80,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/browsing_delegation/run_infer.py
+++ b/evaluation/benchmarks/browsing_delegation/run_infer.py
@ -45,6 +45,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=False,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        workspace_base=None,
        workspace_mount_path=None,
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@ -135,6 +135,7 @@ def get_config(
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_runtime_alive=False,
            remote_runtime_init_timeout=3600,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@ -71,6 +71,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@ -56,6 +56,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@ -49,6 +49,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@ -70,6 +70,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@ -91,6 +91,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@ -55,6 +55,7 @@ def get_config(
            enable_auto_lint=True,
            use_host_network=False,
            runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/miniwob/run_infer.py
+++ b/evaluation/benchmarks/miniwob/run_infer.py
@ -70,6 +70,7 @@ def get_config(
            remote_runtime_init_timeout=1800,
            keep_runtime_alive=False,
            timeout=120,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@ -113,6 +113,7 @@ def get_config(
            enable_auto_lint=True,
            use_host_network=False,
            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@ -73,6 +73,7 @@ def get_config(
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_runtime_alive=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@ -144,6 +144,7 @@ def get_config(
                dataset_name=metadata.dataset,
                instance_id=instance['instance_id'],
            ),
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/the_agent_company/run_infer.py
+++ b/evaluation/benchmarks/the_agent_company/run_infer.py
@ -50,6 +50,7 @@ def get_config(
            # large enough timeout, since some testcases take very long to run
            timeout=300,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_enable_retries=True,
        ),
        # we mount trajectories path so that trajectories, generated by OpenHands
        # controller, can be accessible to the evaluator file in the runtime container
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@ -50,6 +50,7 @@ def get_config(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/visualwebarena/run_infer.py
+++ b/evaluation/benchmarks/visualwebarena/run_infer.py
@ -79,6 +79,7 @@ def get_config(
                'VWA_HOMEPAGE': f'{base_url}:4399',
            },
            timeout=300,
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/benchmarks/webarena/run_infer.py
+++ b/evaluation/benchmarks/webarena/run_infer.py
@ -71,6 +71,7 @@ def get_config(
                'MAP': f'{base_url}:3000',
                'HOMEPAGE': f'{base_url}:4399',
            },
+            remote_runtime_enable_retries=True,
        ),
        # do not mount workspace
        workspace_base=None,
--- a/openhands/core/config/sandbox_config.py
+++ b/openhands/core/config/sandbox_config.py
@ -51,6 +51,7 @@ class SandboxConfig(BaseModel):
    timeout: int = Field(default=120)
    remote_runtime_init_timeout: int = Field(default=180)
    remote_runtime_api_timeout: int = Field(default=10)
+    remote_runtime_enable_retries: bool = Field(default=False)
    enable_auto_lint: bool = Field(
        default=False  # once enabled, OpenHands would lint files after editing
    )
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@ -291,7 +291,7 @@ class RemoteRuntime(ActionExecutionClient):
            stop=tenacity.stop_after_delay(
                self.config.sandbox.remote_runtime_init_timeout
            )
-            | stop_if_should_exit(),
+            | stop_if_should_exit() | self._stop_if_closed,
            reraise=True,
            retry=tenacity.retry_if_exception_type(AgentRuntimeNotReadyError),
            wait=tenacity.wait_fixed(2),
@ -388,12 +388,18 @@ class RemoteRuntime(ActionExecutionClient):
            )
            raise

-    @tenacity.retry(
-        retry=tenacity.retry_if_exception_type(ConnectionError),
-        stop=tenacity.stop_after_attempt(3) | stop_if_should_exit(),
-        wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
-    )
    def _send_action_server_request(self, method, url, **kwargs):
+        if not self.config.sandbox.remote_runtime_enable_retries:
+            return self._send_action_server_request_impl(method, url, **kwargs)
+        # Retries are opt-in: re-send on ConnectionError (max 3 attempts), but stop once the runtime is closed.
+        retry_decorator = tenacity.retry(
+            retry=tenacity.retry_if_exception_type(ConnectionError),
+            stop=tenacity.stop_after_attempt(3) | stop_if_should_exit() | self._stop_if_closed,
+            wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
+        )
+        return retry_decorator(self._send_action_server_request_impl)(method, url, **kwargs)
+
+    def _send_action_server_request_impl(self, method, url, **kwargs):
        try:
            return super()._send_action_server_request(method, url, **kwargs)
        except requests.Timeout:
@ -424,3 +430,6 @@ class RemoteRuntime(ActionExecutionClient):
                    ) from e
            else:
                raise e
+
+    def _stop_if_closed(self, retry_state: tenacity.RetryCallState) -> bool:
+        return self._runtime_closed  # tenacity stop condition; retry_state is unused
--- a/openhands/server/conversation_manager/standalone_conversation_manager.py
+++ b/openhands/server/conversation_manager/standalone_conversation_manager.py
@ -150,7 +150,7 @@ class StandaloneConversationManager(ConversationManager):
                )
                return
            except Exception as e:
-                logger.warning(f'error_cleaning_stale: {str(e)}')
+                logger.error(f'error_cleaning_stale: {str(e)}')
                await asyncio.sleep(_CLEANUP_INTERVAL)

    async def get_running_agent_loops(