fix: add exponential backoff retry for env var export when bash session is busy (#12748)

Co-authored-by: mkdev11 <MkDev11@users.noreply.github.com>
This commit is contained in:
MkDev11
2026-02-06 04:07:17 -08:00
committed by GitHub
parent dd8be12809
commit 4fa2ad0f47
2 changed files with 291 additions and 16 deletions

View File

@@ -8,6 +8,7 @@ import shlex
import shutil
import string
import tempfile
import time
from abc import abstractmethod
from pathlib import Path
from types import MappingProxyType
@@ -78,6 +79,11 @@ from openhands.utils.async_utils import (
DISABLE_VSCODE_PLUGIN = os.getenv('DISABLE_VSCODE_PLUGIN', 'false').lower() == 'true'
# Command retry config for bash session busy race condition (issue #12265)
CMD_RETRY_MAX_ATTEMPTS = 3
CMD_RETRY_BASE_DELAY_SECONDS = 1.0
CMD_RETRY_TIMEOUT_EXIT_CODE = -1
def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
ret = {}
@@ -242,11 +248,68 @@ class Runtime(FileEditRuntimeMixin):
self, runtime_status: RuntimeStatus, msg: str = '', level: str = 'info'
):
"""Sends a status message if the callback function was provided."""
self.runtime_status = runtime_status
if self.status_callback:
self.status_callback(level, runtime_status, msg)
# ====================================================================
def _is_bash_session_timeout(self, obs: Observation) -> bool:
return (
isinstance(obs, CmdOutputObservation)
and obs.exit_code == CMD_RETRY_TIMEOUT_EXIT_CODE
)
def _calculate_retry_delay(self, attempt: int) -> float:
return CMD_RETRY_BASE_DELAY_SECONDS * (2**attempt)
def _run_cmd_with_retry(
self,
cmd: str,
error_context: str,
max_retries: int = CMD_RETRY_MAX_ATTEMPTS,
) -> CmdOutputObservation:
"""Run command with exponential backoff retry on bash session timeout."""
if not cmd or not cmd.strip():
raise ValueError('Command cannot be empty')
if max_retries < 1:
raise ValueError('max_retries must be at least 1')
last_obs: Observation | None = None
for attempt in range(max_retries):
obs = self.run(CmdRunAction(cmd))
if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
if attempt > 0:
logger.info(f'Command succeeded after {attempt + 1} attempts')
return obs
last_obs = obs
is_retryable = self._is_bash_session_timeout(obs)
if is_retryable and attempt < max_retries - 1:
delay = self._calculate_retry_delay(attempt)
logger.warning(
f'Bash session busy, retrying in {delay:.1f}s '
f'(attempt {attempt + 1}/{max_retries})'
)
time.sleep(delay)
continue
break
error_content = self._extract_error_content(last_obs)
raise RuntimeError(f'{error_context}: {error_content}')
def _extract_error_content(self, obs: Observation | None) -> str:
if obs is None:
return 'No observation received'
if isinstance(obs, CmdOutputObservation):
return f'{obs.content} (exit_code={obs.exit_code})'
if isinstance(obs, ErrorObservation):
return obs.content
return str(obs)
def add_env_vars(self, env_vars: dict[str, str]) -> None:
env_vars = {key.upper(): value for key, value in env_vars.items()}
@@ -282,11 +345,9 @@ class Runtime(FileEditRuntimeMixin):
cmd = cmd.strip()
logger.debug('Adding env vars to PowerShell') # don't log the values
obs = self.run(CmdRunAction(cmd))
if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
raise RuntimeError(
f'Failed to add env vars [{env_vars.keys()}] to environment: {obs.content}'
)
self._run_cmd_with_retry(
cmd, f'Failed to add env vars [{env_vars.keys()}] to environment'
)
# We don't add to profile persistence on Windows as it's more complex
# and varies between PowerShell versions
@@ -308,20 +369,16 @@ class Runtime(FileEditRuntimeMixin):
cmd = cmd.strip()
logger.debug('Adding env vars to bash') # don't log the values
obs = self.run(CmdRunAction(cmd))
if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
raise RuntimeError(
f'Failed to add env vars [{env_vars.keys()}] to environment: {obs.content}'
)
self._run_cmd_with_retry(
cmd, f'Failed to add env vars [{env_vars.keys()}] to environment'
)
# Add to .bashrc for persistence
bashrc_cmd = bashrc_cmd.strip()
logger.debug(f'Adding env var to .bashrc: {env_vars.keys()}')
obs = self.run(CmdRunAction(bashrc_cmd))
if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
raise RuntimeError(
f'Failed to add env vars [{env_vars.keys()}] to .bashrc: {obs.content}'
)
self._run_cmd_with_retry(
bashrc_cmd, f'Failed to add env vars [{env_vars.keys()}] to .bashrc'
)
def on_event(self, event: Event) -> None:
if isinstance(event, Action):

View File

@@ -0,0 +1,218 @@
"""Tests for command retry logic with exponential backoff.
Tests the fix for GitHub issue #12265 where GitHub token export fails
when bash session is busy (race condition).
"""
from unittest.mock import MagicMock, patch
import pytest
from openhands.events.observation import CmdOutputObservation, ErrorObservation
from openhands.runtime.base import (
CMD_RETRY_BASE_DELAY_SECONDS,
CMD_RETRY_MAX_ATTEMPTS,
CMD_RETRY_TIMEOUT_EXIT_CODE,
)
class TestCmdRetryHelpers:
"""Tests for the helper methods used in command retry logic."""
@pytest.fixture
def mock_runtime(self):
"""Create a mock runtime with the retry methods."""
from openhands.runtime.base import Runtime
# Create a minimal mock that has the methods we need to test
runtime = MagicMock(spec=Runtime)
# Bind the actual methods to our mock
runtime._is_bash_session_timeout = Runtime._is_bash_session_timeout.__get__(
runtime, Runtime
)
runtime._calculate_retry_delay = Runtime._calculate_retry_delay.__get__(
runtime, Runtime
)
runtime._extract_error_content = Runtime._extract_error_content.__get__(
runtime, Runtime
)
runtime._run_cmd_with_retry = Runtime._run_cmd_with_retry.__get__(
runtime, Runtime
)
return runtime
def test_is_bash_session_timeout_with_timeout_exit_code(self, mock_runtime):
"""Test that timeout exit code (-1) is correctly identified."""
obs = CmdOutputObservation(content='', command='test', exit_code=-1)
assert mock_runtime._is_bash_session_timeout(obs) is True
def test_is_bash_session_timeout_with_success(self, mock_runtime):
"""Test that successful commands are not identified as timeout."""
obs = CmdOutputObservation(content='output', command='test', exit_code=0)
assert mock_runtime._is_bash_session_timeout(obs) is False
def test_is_bash_session_timeout_with_other_error(self, mock_runtime):
"""Test that other error codes are not identified as timeout."""
obs = CmdOutputObservation(content='error', command='test', exit_code=1)
assert mock_runtime._is_bash_session_timeout(obs) is False
def test_is_bash_session_timeout_with_error_observation(self, mock_runtime):
"""Test that ErrorObservation is not identified as timeout."""
obs = ErrorObservation(content='some error')
assert mock_runtime._is_bash_session_timeout(obs) is False
def test_calculate_retry_delay_exponential(self, mock_runtime):
"""Test exponential backoff delay calculation."""
assert mock_runtime._calculate_retry_delay(0) == CMD_RETRY_BASE_DELAY_SECONDS
assert (
mock_runtime._calculate_retry_delay(1) == CMD_RETRY_BASE_DELAY_SECONDS * 2
)
assert (
mock_runtime._calculate_retry_delay(2) == CMD_RETRY_BASE_DELAY_SECONDS * 4
)
def test_extract_error_content_from_none(self, mock_runtime):
"""Test error extraction from None observation."""
assert mock_runtime._extract_error_content(None) == 'No observation received'
def test_extract_error_content_from_cmd_output(self, mock_runtime):
"""Test error extraction from CmdOutputObservation."""
obs = CmdOutputObservation(
content='command failed', command='test', exit_code=1
)
result = mock_runtime._extract_error_content(obs)
assert 'command failed' in result
assert 'exit_code=1' in result
def test_extract_error_content_from_error_observation(self, mock_runtime):
"""Test error extraction from ErrorObservation."""
obs = ErrorObservation(content='something went wrong')
assert mock_runtime._extract_error_content(obs) == 'something went wrong'
class TestRunCmdWithRetry:
"""Tests for the main _run_cmd_with_retry method."""
@pytest.fixture
def mock_runtime(self):
"""Create a mock runtime for retry testing."""
from openhands.runtime.base import Runtime
runtime = MagicMock(spec=Runtime)
# Bind actual methods
runtime._is_bash_session_timeout = Runtime._is_bash_session_timeout.__get__(
runtime, Runtime
)
runtime._calculate_retry_delay = Runtime._calculate_retry_delay.__get__(
runtime, Runtime
)
runtime._extract_error_content = Runtime._extract_error_content.__get__(
runtime, Runtime
)
runtime._run_cmd_with_retry = Runtime._run_cmd_with_retry.__get__(
runtime, Runtime
)
return runtime
def test_success_on_first_attempt(self, mock_runtime):
"""Test successful command execution on first try."""
success_obs = CmdOutputObservation(
content='success', command='echo test', exit_code=0
)
mock_runtime.run = MagicMock(return_value=success_obs)
result = mock_runtime._run_cmd_with_retry('echo test', 'Test error')
assert result == success_obs
assert mock_runtime.run.call_count == 1
@patch('openhands.runtime.base.time.sleep')
def test_retry_on_timeout_then_success(self, mock_sleep, mock_runtime):
"""Test retry behavior when first attempt times out."""
timeout_obs = CmdOutputObservation(
content='', command='export VAR=value', exit_code=-1
)
success_obs = CmdOutputObservation(
content='', command='export VAR=value', exit_code=0
)
mock_runtime.run = MagicMock(side_effect=[timeout_obs, success_obs])
result = mock_runtime._run_cmd_with_retry(
'export VAR=value', 'Failed to export'
)
assert result == success_obs
assert mock_runtime.run.call_count == 2
mock_sleep.assert_called_once()
@patch('openhands.runtime.base.time.sleep')
def test_retry_exhaustion_raises_error(self, mock_sleep, mock_runtime):
"""Test that RuntimeError is raised after all retries fail."""
timeout_obs = CmdOutputObservation(
content='timeout', command='cmd', exit_code=-1
)
mock_runtime.run = MagicMock(return_value=timeout_obs)
with pytest.raises(RuntimeError) as exc_info:
mock_runtime._run_cmd_with_retry('cmd', 'Command failed', max_retries=3)
assert 'Command failed' in str(exc_info.value)
assert mock_runtime.run.call_count == 3
assert mock_sleep.call_count == 2 # Called between retries, not after last
def test_non_timeout_error_fails_immediately(self, mock_runtime):
"""Test that non-timeout errors don't trigger retry."""
error_obs = CmdOutputObservation(
content='permission denied', command='cmd', exit_code=1
)
mock_runtime.run = MagicMock(return_value=error_obs)
with pytest.raises(RuntimeError) as exc_info:
mock_runtime._run_cmd_with_retry('cmd', 'Command failed')
assert 'Command failed' in str(exc_info.value)
assert mock_runtime.run.call_count == 1 # No retries for non-timeout
def test_empty_command_raises_value_error(self, mock_runtime):
"""Test that empty command raises ValueError."""
with pytest.raises(ValueError) as exc_info:
mock_runtime._run_cmd_with_retry('', 'Error')
assert 'empty' in str(exc_info.value).lower()
def test_invalid_max_retries_raises_value_error(self, mock_runtime):
"""Test that max_retries < 1 raises ValueError."""
with pytest.raises(ValueError) as exc_info:
mock_runtime._run_cmd_with_retry('cmd', 'Error', max_retries=0)
assert 'max_retries' in str(exc_info.value).lower()
@patch('openhands.runtime.base.time.sleep')
def test_exponential_backoff_delays(self, mock_sleep, mock_runtime):
"""Test that delays follow exponential backoff pattern."""
timeout_obs = CmdOutputObservation(content='', command='cmd', exit_code=-1)
mock_runtime.run = MagicMock(return_value=timeout_obs)
with pytest.raises(RuntimeError):
mock_runtime._run_cmd_with_retry('cmd', 'Error', max_retries=3)
# Verify exponential delays: 1s, 2s (not called after 3rd attempt)
calls = mock_sleep.call_args_list
assert len(calls) == 2
assert calls[0][0][0] == CMD_RETRY_BASE_DELAY_SECONDS # 1s
assert calls[1][0][0] == CMD_RETRY_BASE_DELAY_SECONDS * 2 # 2s
class TestConstants:
"""Tests for retry configuration constants."""
def test_constants_have_sensible_values(self):
"""Test that constants are configured reasonably."""
assert CMD_RETRY_MAX_ATTEMPTS >= 1
assert CMD_RETRY_BASE_DELAY_SECONDS > 0
assert (
CMD_RETRY_TIMEOUT_EXIT_CODE == -1
) # Must match NO_CHANGE_TIMEOUT behavior