fix: add exponential backoff retry for env var export when bash session is busy (#12748)

Co-authored-by: mkdev11 <MkDev11@users.noreply.github.com>
2026-03-22 05:37:20 +08:00 · 2026-02-06 04:07:17 -08:00
parent dd8be12809
commit 4fa2ad0f47
2 changed files with 291 additions and 16 deletions
--- a/openhands/runtime/base.py
+++ b/openhands/runtime/base.py
@@ -8,6 +8,7 @@ import shlex
 import shutil
 import string
 import tempfile
+import time
 from abc import abstractmethod
 from pathlib import Path
 from types import MappingProxyType
@@ -78,6 +79,11 @@ from openhands.utils.async_utils import (

 DISABLE_VSCODE_PLUGIN = os.getenv('DISABLE_VSCODE_PLUGIN', 'false').lower() == 'true'

+# Command retry config for bash session busy race condition (issue #12265)
+CMD_RETRY_MAX_ATTEMPTS = 3
+CMD_RETRY_BASE_DELAY_SECONDS = 1.0
+CMD_RETRY_TIMEOUT_EXIT_CODE = -1
+

 def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
    ret = {}
@@ -242,11 +248,68 @@ class Runtime(FileEditRuntimeMixin):
        self, runtime_status: RuntimeStatus, msg: str = '', level: str = 'info'
    ):
        """Sends a status message if the callback function was provided."""
+
        self.runtime_status = runtime_status
        if self.status_callback:
            self.status_callback(level, runtime_status, msg)

-    # ====================================================================
+    def _is_bash_session_timeout(self, obs: Observation) -> bool:
+        return (
+            isinstance(obs, CmdOutputObservation)
+            and obs.exit_code == CMD_RETRY_TIMEOUT_EXIT_CODE
+        )
+
+    def _calculate_retry_delay(self, attempt: int) -> float:
+        return CMD_RETRY_BASE_DELAY_SECONDS * (2**attempt)
+
+    def _run_cmd_with_retry(
+        self,
+        cmd: str,
+        error_context: str,
+        max_retries: int = CMD_RETRY_MAX_ATTEMPTS,
+    ) -> CmdOutputObservation:
+        """Run command with exponential backoff retry on bash session timeout."""
+
+        if not cmd or not cmd.strip():
+            raise ValueError('Command cannot be empty')
+        if max_retries < 1:
+            raise ValueError('max_retries must be at least 1')
+
+        last_obs: Observation | None = None
+
+        for attempt in range(max_retries):
+            obs = self.run(CmdRunAction(cmd))
+
+            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
+                if attempt > 0:
+                    logger.info(f'Command succeeded after {attempt + 1} attempts')
+                return obs
+
+            last_obs = obs
+            is_retryable = self._is_bash_session_timeout(obs)
+
+            if is_retryable and attempt < max_retries - 1:
+                delay = self._calculate_retry_delay(attempt)
+                logger.warning(
+                    f'Bash session busy, retrying in {delay:.1f}s '
+                    f'(attempt {attempt + 1}/{max_retries})'
+                )
+                time.sleep(delay)
+                continue
+
+            break
+
+        error_content = self._extract_error_content(last_obs)
+        raise RuntimeError(f'{error_context}: {error_content}')
+
+    def _extract_error_content(self, obs: Observation | None) -> str:
+        if obs is None:
+            return 'No observation received'
+        if isinstance(obs, CmdOutputObservation):
+            return f'{obs.content} (exit_code={obs.exit_code})'
+        if isinstance(obs, ErrorObservation):
+            return obs.content
+        return str(obs)

    def add_env_vars(self, env_vars: dict[str, str]) -> None:
        env_vars = {key.upper(): value for key, value in env_vars.items()}
@@ -282,11 +345,9 @@ class Runtime(FileEditRuntimeMixin):
            cmd = cmd.strip()
            logger.debug('Adding env vars to PowerShell')  # don't log the values

-            obs = self.run(CmdRunAction(cmd))
-            if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
-                raise RuntimeError(
-                    f'Failed to add env vars [{env_vars.keys()}] to environment: {obs.content}'
-                )
+            self._run_cmd_with_retry(
+                cmd, f'Failed to add env vars [{env_vars.keys()}] to environment'
+            )

            # We don't add to profile persistence on Windows as it's more complex
            # and varies between PowerShell versions
@@ -308,20 +369,16 @@ class Runtime(FileEditRuntimeMixin):
            cmd = cmd.strip()
            logger.debug('Adding env vars to bash')  # don't log the values

-            obs = self.run(CmdRunAction(cmd))
-            if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
-                raise RuntimeError(
-                    f'Failed to add env vars [{env_vars.keys()}] to environment: {obs.content}'
-                )
+            self._run_cmd_with_retry(
+                cmd, f'Failed to add env vars [{env_vars.keys()}] to environment'
+            )

            # Add to .bashrc for persistence
            bashrc_cmd = bashrc_cmd.strip()
            logger.debug(f'Adding env var to .bashrc: {env_vars.keys()}')
-            obs = self.run(CmdRunAction(bashrc_cmd))
-            if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
-                raise RuntimeError(
-                    f'Failed to add env vars [{env_vars.keys()}] to .bashrc: {obs.content}'
-                )
+            self._run_cmd_with_retry(
+                bashrc_cmd, f'Failed to add env vars [{env_vars.keys()}] to .bashrc'
+            )

    def on_event(self, event: Event) -> None:
        if isinstance(event, Action):
--- a/tests/unit/runtime/test_cmd_retry.py
+++ b/tests/unit/runtime/test_cmd_retry.py
@@ -0,0 +1,218 @@
+"""Tests for command retry logic with exponential backoff.
+
+Tests the fix for GitHub issue #12265 where GitHub token export fails
+when bash session is busy (race condition).
+"""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from openhands.events.observation import CmdOutputObservation, ErrorObservation
+from openhands.runtime.base import (
+    CMD_RETRY_BASE_DELAY_SECONDS,
+    CMD_RETRY_MAX_ATTEMPTS,
+    CMD_RETRY_TIMEOUT_EXIT_CODE,
+)
+
+
+class TestCmdRetryHelpers:
+    """Tests for the helper methods used in command retry logic."""
+
+    @pytest.fixture
+    def mock_runtime(self):
+        """Create a mock runtime with the retry methods."""
+        from openhands.runtime.base import Runtime
+
+        # Create a minimal mock that has the methods we need to test
+        runtime = MagicMock(spec=Runtime)
+
+        # Bind the actual methods to our mock
+        runtime._is_bash_session_timeout = Runtime._is_bash_session_timeout.__get__(
+            runtime, Runtime
+        )
+        runtime._calculate_retry_delay = Runtime._calculate_retry_delay.__get__(
+            runtime, Runtime
+        )
+        runtime._extract_error_content = Runtime._extract_error_content.__get__(
+            runtime, Runtime
+        )
+        runtime._run_cmd_with_retry = Runtime._run_cmd_with_retry.__get__(
+            runtime, Runtime
+        )
+
+        return runtime
+
+    def test_is_bash_session_timeout_with_timeout_exit_code(self, mock_runtime):
+        """Test that timeout exit code (-1) is correctly identified."""
+        obs = CmdOutputObservation(content='', command='test', exit_code=-1)
+        assert mock_runtime._is_bash_session_timeout(obs) is True
+
+    def test_is_bash_session_timeout_with_success(self, mock_runtime):
+        """Test that successful commands are not identified as timeout."""
+        obs = CmdOutputObservation(content='output', command='test', exit_code=0)
+        assert mock_runtime._is_bash_session_timeout(obs) is False
+
+    def test_is_bash_session_timeout_with_other_error(self, mock_runtime):
+        """Test that other error codes are not identified as timeout."""
+        obs = CmdOutputObservation(content='error', command='test', exit_code=1)
+        assert mock_runtime._is_bash_session_timeout(obs) is False
+
+    def test_is_bash_session_timeout_with_error_observation(self, mock_runtime):
+        """Test that ErrorObservation is not identified as timeout."""
+        obs = ErrorObservation(content='some error')
+        assert mock_runtime._is_bash_session_timeout(obs) is False
+
+    def test_calculate_retry_delay_exponential(self, mock_runtime):
+        """Test exponential backoff delay calculation."""
+        assert mock_runtime._calculate_retry_delay(0) == CMD_RETRY_BASE_DELAY_SECONDS
+        assert (
+            mock_runtime._calculate_retry_delay(1) == CMD_RETRY_BASE_DELAY_SECONDS * 2
+        )
+        assert (
+            mock_runtime._calculate_retry_delay(2) == CMD_RETRY_BASE_DELAY_SECONDS * 4
+        )
+
+    def test_extract_error_content_from_none(self, mock_runtime):
+        """Test error extraction from None observation."""
+        assert mock_runtime._extract_error_content(None) == 'No observation received'
+
+    def test_extract_error_content_from_cmd_output(self, mock_runtime):
+        """Test error extraction from CmdOutputObservation."""
+        obs = CmdOutputObservation(
+            content='command failed', command='test', exit_code=1
+        )
+        result = mock_runtime._extract_error_content(obs)
+        assert 'command failed' in result
+        assert 'exit_code=1' in result
+
+    def test_extract_error_content_from_error_observation(self, mock_runtime):
+        """Test error extraction from ErrorObservation."""
+        obs = ErrorObservation(content='something went wrong')
+        assert mock_runtime._extract_error_content(obs) == 'something went wrong'
+
+
+class TestRunCmdWithRetry:
+    """Tests for the main _run_cmd_with_retry method."""
+
+    @pytest.fixture
+    def mock_runtime(self):
+        """Create a mock runtime for retry testing."""
+        from openhands.runtime.base import Runtime
+
+        runtime = MagicMock(spec=Runtime)
+
+        # Bind actual methods
+        runtime._is_bash_session_timeout = Runtime._is_bash_session_timeout.__get__(
+            runtime, Runtime
+        )
+        runtime._calculate_retry_delay = Runtime._calculate_retry_delay.__get__(
+            runtime, Runtime
+        )
+        runtime._extract_error_content = Runtime._extract_error_content.__get__(
+            runtime, Runtime
+        )
+        runtime._run_cmd_with_retry = Runtime._run_cmd_with_retry.__get__(
+            runtime, Runtime
+        )
+
+        return runtime
+
+    def test_success_on_first_attempt(self, mock_runtime):
+        """Test successful command execution on first try."""
+        success_obs = CmdOutputObservation(
+            content='success', command='echo test', exit_code=0
+        )
+        mock_runtime.run = MagicMock(return_value=success_obs)
+
+        result = mock_runtime._run_cmd_with_retry('echo test', 'Test error')
+
+        assert result == success_obs
+        assert mock_runtime.run.call_count == 1
+
+    @patch('openhands.runtime.base.time.sleep')
+    def test_retry_on_timeout_then_success(self, mock_sleep, mock_runtime):
+        """Test retry behavior when first attempt times out."""
+        timeout_obs = CmdOutputObservation(
+            content='', command='export VAR=value', exit_code=-1
+        )
+        success_obs = CmdOutputObservation(
+            content='', command='export VAR=value', exit_code=0
+        )
+
+        mock_runtime.run = MagicMock(side_effect=[timeout_obs, success_obs])
+
+        result = mock_runtime._run_cmd_with_retry(
+            'export VAR=value', 'Failed to export'
+        )
+
+        assert result == success_obs
+        assert mock_runtime.run.call_count == 2
+        mock_sleep.assert_called_once()
+
+    @patch('openhands.runtime.base.time.sleep')
+    def test_retry_exhaustion_raises_error(self, mock_sleep, mock_runtime):
+        """Test that RuntimeError is raised after all retries fail."""
+        timeout_obs = CmdOutputObservation(
+            content='timeout', command='cmd', exit_code=-1
+        )
+        mock_runtime.run = MagicMock(return_value=timeout_obs)
+
+        with pytest.raises(RuntimeError) as exc_info:
+            mock_runtime._run_cmd_with_retry('cmd', 'Command failed', max_retries=3)
+
+        assert 'Command failed' in str(exc_info.value)
+        assert mock_runtime.run.call_count == 3
+        assert mock_sleep.call_count == 2  # Called between retries, not after last
+
+    def test_non_timeout_error_fails_immediately(self, mock_runtime):
+        """Test that non-timeout errors don't trigger retry."""
+        error_obs = CmdOutputObservation(
+            content='permission denied', command='cmd', exit_code=1
+        )
+        mock_runtime.run = MagicMock(return_value=error_obs)
+
+        with pytest.raises(RuntimeError) as exc_info:
+            mock_runtime._run_cmd_with_retry('cmd', 'Command failed')
+
+        assert 'Command failed' in str(exc_info.value)
+        assert mock_runtime.run.call_count == 1  # No retries for non-timeout
+
+    def test_empty_command_raises_value_error(self, mock_runtime):
+        """Test that empty command raises ValueError."""
+        with pytest.raises(ValueError) as exc_info:
+            mock_runtime._run_cmd_with_retry('', 'Error')
+        assert 'empty' in str(exc_info.value).lower()
+
+    def test_invalid_max_retries_raises_value_error(self, mock_runtime):
+        """Test that max_retries < 1 raises ValueError."""
+        with pytest.raises(ValueError) as exc_info:
+            mock_runtime._run_cmd_with_retry('cmd', 'Error', max_retries=0)
+        assert 'max_retries' in str(exc_info.value).lower()
+
+    @patch('openhands.runtime.base.time.sleep')
+    def test_exponential_backoff_delays(self, mock_sleep, mock_runtime):
+        """Test that delays follow exponential backoff pattern."""
+        timeout_obs = CmdOutputObservation(content='', command='cmd', exit_code=-1)
+        mock_runtime.run = MagicMock(return_value=timeout_obs)
+
+        with pytest.raises(RuntimeError):
+            mock_runtime._run_cmd_with_retry('cmd', 'Error', max_retries=3)
+
+        # Verify exponential delays: 1s, 2s (not called after 3rd attempt)
+        calls = mock_sleep.call_args_list
+        assert len(calls) == 2
+        assert calls[0][0][0] == CMD_RETRY_BASE_DELAY_SECONDS  # 1s
+        assert calls[1][0][0] == CMD_RETRY_BASE_DELAY_SECONDS * 2  # 2s
+
+
+class TestConstants:
+    """Tests for retry configuration constants."""
+
+    def test_constants_have_sensible_values(self):
+        """Test that constants are configured reasonably."""
+        assert CMD_RETRY_MAX_ATTEMPTS >= 1
+        assert CMD_RETRY_BASE_DELAY_SECONDS > 0
+        assert (
+            CMD_RETRY_TIMEOUT_EXIT_CODE == -1
+        )  # Must match NO_CHANGE_TIMEOUT behavior