From bbfdc62139d73e3b573f988b5b3870519ab8651e Mon Sep 17 00:00:00 2001
From: tofarr <tofarr@gmail.com>
Date: Mon, 3 Feb 2025 08:44:09 -0700
Subject: [PATCH] Fix for issue where retries continue on a closed runtime
 (#6564)

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
---
 evaluation/benchmarks/EDA/run_infer.py        |  1 +
 .../benchmarks/agent_bench/run_infer.py       |  1 +
 .../benchmarks/aider_bench/run_infer.py       |  1 +
 evaluation/benchmarks/biocoder/run_infer.py   |  1 +
 evaluation/benchmarks/bird/run_infer.py       |  1 +
 .../browsing_delegation/run_infer.py          |  1 +
 .../benchmarks/commit0_bench/run_infer.py     |  1 +
 .../benchmarks/discoverybench/run_infer.py    |  1 +
 evaluation/benchmarks/gaia/run_infer.py       |  1 +
 evaluation/benchmarks/gorilla/run_infer.py    |  1 +
 evaluation/benchmarks/gpqa/run_infer.py       |  1 +
 .../benchmarks/humanevalfix/run_infer.py      |  1 +
 .../benchmarks/logic_reasoning/run_infer.py   |  1 +
 evaluation/benchmarks/miniwob/run_infer.py    |  1 +
 evaluation/benchmarks/mint/run_infer.py       |  1 +
 .../benchmarks/scienceagentbench/run_infer.py |  1 +
 evaluation/benchmarks/swe_bench/run_infer.py  |  1 +
 .../benchmarks/the_agent_company/run_infer.py |  1 +
 evaluation/benchmarks/toolqa/run_infer.py     |  1 +
 .../benchmarks/visualwebarena/run_infer.py    |  1 +
 evaluation/benchmarks/webarena/run_infer.py   |  1 +
 openhands/core/config/sandbox_config.py       |  1 +
 .../runtime/impl/remote/remote_runtime.py     | 21 +++++++++++++------
 .../standalone_conversation_manager.py        |  2 +-
 24 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py
index 26756d3ea5..2d65e19438 100644
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -69,6 +69,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=False,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
index fb221e15ba..8c1f08b377 100644
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -53,6 +53,7 @@ def get_config(
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
             remote_runtime_init_timeout=3600,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index 926f4634ed..8045f948d3 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -61,6 +61,7 @@ def get_config(
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
             remote_runtime_init_timeout=1800,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
index dc6dc8fd3b..20f3dc4870 100644
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -67,6 +67,7 @@ def get_config(
             base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
index 8570e07b66..02d92aa3ee 100644
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -80,6 +80,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py
index 33d6ef4805..164e117e26 100644
--- a/evaluation/benchmarks/browsing_delegation/run_infer.py
+++ b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -45,6 +45,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=False,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         workspace_base=None,
         workspace_mount_path=None,
diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
index e690952ab9..2e0fc528f7 100644
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -135,6 +135,7 @@ def get_config(
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
             remote_runtime_init_timeout=3600,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py
index 30af2d19d4..fc5d74b135 100644
--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -71,6 +71,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
index b4c704e497..a8b4428192 100644
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -56,6 +56,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py
index e97be5ed83..d107151fc5 100644
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -49,6 +49,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py
index cf4106b971..b92a30b859 100644
--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@@ -70,6 +70,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
index fec040079c..c2cccf90c7 100644
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -91,6 +91,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py
index acd07edef2..e37c5b4ab0 100644
--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -55,6 +55,7 @@ def get_config(
             enable_auto_lint=True,
             use_host_network=False,
             runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py
index acc1431c81..023cbe9cab 100644
--- a/evaluation/benchmarks/miniwob/run_infer.py
+++ b/evaluation/benchmarks/miniwob/run_infer.py
@@ -70,6 +70,7 @@ def get_config(
             remote_runtime_init_timeout=1800,
             keep_runtime_alive=False,
             timeout=120,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
index ddfef0ea68..4c356f26d9 100644
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -113,6 +113,7 @@ def get_config(
             enable_auto_lint=True,
             use_host_network=False,
             runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py
index 2ca5c2b403..09619fb718 100644
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -73,6 +73,7 @@ def get_config(
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index e1a1764f21..5e3f0e6a5b 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -144,6 +144,7 @@ def get_config(
                 dataset_name=metadata.dataset,
                 instance_id=instance['instance_id'],
             ),
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py
index 376df6c47c..5cd7c027e2 100644
--- a/evaluation/benchmarks/the_agent_company/run_infer.py
+++ b/evaluation/benchmarks/the_agent_company/run_infer.py
@@ -50,6 +50,7 @@ def get_config(
             # large enough timeout, since some testcases take very long to run
             timeout=300,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_enable_retries=True,
         ),
         # we mount trajectories path so that trajectories, generated by OpenHands
         # controller, can be accessible to the evaluator file in the runtime container
diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
index 8306292d8f..45b9febed2 100644
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -50,6 +50,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/visualwebarena/run_infer.py b/evaluation/benchmarks/visualwebarena/run_infer.py
index 5010daa42e..8986d3ab8f 100644
--- a/evaluation/benchmarks/visualwebarena/run_infer.py
+++ b/evaluation/benchmarks/visualwebarena/run_infer.py
@@ -79,6 +79,7 @@ def get_config(
                 'VWA_HOMEPAGE': f'{base_url}:4399',
             },
             timeout=300,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py
index 79b7fc4371..ad846190d8 100644
--- a/evaluation/benchmarks/webarena/run_infer.py
+++ b/evaluation/benchmarks/webarena/run_infer.py
@@ -71,6 +71,7 @@ def get_config(
                 'MAP': f'{base_url}:3000',
                 'HOMEPAGE': f'{base_url}:4399',
             },
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/openhands/core/config/sandbox_config.py b/openhands/core/config/sandbox_config.py
index 55ff384e0c..12edbbd4d9 100644
--- a/openhands/core/config/sandbox_config.py
+++ b/openhands/core/config/sandbox_config.py
@@ -51,6 +51,7 @@ class SandboxConfig(BaseModel):
     timeout: int = Field(default=120)
     remote_runtime_init_timeout: int = Field(default=180)
     remote_runtime_api_timeout: int = Field(default=10)
+    remote_runtime_enable_retries: bool = Field(default=False)
     enable_auto_lint: bool = Field(
         default=False  # once enabled, OpenHands would lint files after editing
     )
diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py
index 02e31fa364..eb1b58440b 100644
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -291,7 +291,7 @@ class RemoteRuntime(ActionExecutionClient):
             stop=tenacity.stop_after_delay(
                 self.config.sandbox.remote_runtime_init_timeout
             )
-            | stop_if_should_exit(),
+            | stop_if_should_exit() | self._stop_if_closed,
             reraise=True,
             retry=tenacity.retry_if_exception_type(AgentRuntimeNotReadyError),
             wait=tenacity.wait_fixed(2),
@@ -388,12 +388,18 @@ class RemoteRuntime(ActionExecutionClient):
             )
             raise
 
-    @tenacity.retry(
-        retry=tenacity.retry_if_exception_type(ConnectionError),
-        stop=tenacity.stop_after_attempt(3) | stop_if_should_exit(),
-        wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
-    )
     def _send_action_server_request(self, method, url, **kwargs):
+        if not self.config.sandbox.remote_runtime_enable_retries:
+            return self._send_action_server_request_impl(method, url, **kwargs)
+        # Opt-in retries: build the policy per call so it can stop once this runtime is closed.
+        retry_decorator = tenacity.retry(
+            retry=tenacity.retry_if_exception_type(ConnectionError),
+            stop=tenacity.stop_after_attempt(3) | stop_if_should_exit() | self._stop_if_closed,
+            wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
+        )
+        return retry_decorator(self._send_action_server_request_impl)(method, url, **kwargs)
+
+    def _send_action_server_request_impl(self, method, url, **kwargs):
         try:
             return super()._send_action_server_request(method, url, **kwargs)
         except requests.Timeout:
@@ -424,3 +430,6 @@ class RemoteRuntime(ActionExecutionClient):
                     ) from e
             else:
                 raise e
+
+    def _stop_if_closed(self, retry_state: tenacity.RetryCallState) -> bool:
+        return self._runtime_closed  # used as a tenacity stop condition: True ends retrying
diff --git a/openhands/server/conversation_manager/standalone_conversation_manager.py b/openhands/server/conversation_manager/standalone_conversation_manager.py
index 078f012a53..a1748038d6 100644
--- a/openhands/server/conversation_manager/standalone_conversation_manager.py
+++ b/openhands/server/conversation_manager/standalone_conversation_manager.py
@@ -150,7 +150,7 @@ class StandaloneConversationManager(ConversationManager):
                 )
                 return
             except Exception as e:
-                logger.warning(f'error_cleaning_stale: {str(e)}')
+                logger.error(f'error_cleaning_stale: {str(e)}')
                 await asyncio.sleep(_CLEANUP_INTERVAL)
 
     async def get_running_agent_loops(