Fix runtime status error on conversation resume (#12718)

Co-authored-by: openhands <openhands@all-hands.dev>
2026-03-22 05:37:20 +08:00 · 2026-02-02 14:04:45 -07:00
parent d6c11fe517
commit 1bb4c844d4
2 changed files with 147 additions and 0 deletions
--- a/openhands/server/routes/manage_conversations.py
+++ b/openhands/server/routes/manage_conversations.py
@@ -15,6 +15,7 @@ import re
 import uuid
 from datetime import datetime, timedelta, timezone
 from typing import Annotated
+from urllib.parse import urlparse

 import base62
 import httpx
@@ -40,6 +41,7 @@ from openhands.app_server.config import (
    depends_httpx_client,
    depends_sandbox_service,
 )
+from openhands.app_server.sandbox.sandbox_models import SandboxStatus
 from openhands.app_server.sandbox.sandbox_service import SandboxService
 from openhands.app_server.services.db_session_injector import set_db_session_keep_open
 from openhands.app_server.services.httpx_client_injector import (
@@ -119,6 +121,7 @@ app_conversation_info_service_dependency = depends_app_conversation_info_service
 sandbox_service_dependency = depends_sandbox_service()
 db_session_dependency = depends_db_session()
 httpx_client_dependency = depends_httpx_client()
+_RESUME_GRACE_PERIOD = 60


 def _filter_conversations_by_age(
@@ -470,6 +473,7 @@ async def get_conversation(
    conversation_id: str = Depends(validate_conversation_id),
    conversation_store: ConversationStore = Depends(get_conversation_store),
    app_conversation_service: AppConversationService = app_conversation_service_dependency,
+    httpx_client: httpx.AsyncClient = httpx_client_dependency,
 ) -> ConversationInfo | None:
    """Get a single conversation by ID.

@@ -484,6 +488,44 @@ async def get_conversation(
                conversation_uuid
            )
            if app_conversation:
+                if (
+                    app_conversation.sandbox_status == SandboxStatus.RUNNING
+                    and app_conversation.execution_status is None
+                ):
+                    # The sandbox is running, but we were unable to determine a status for
+                    # the conversation. It may be that it is still starting, or that the
+                    # conversation has been stopped / deleted.
+                    try:
+                        # Check the server info is available
+                        conversation_url = urlparse(app_conversation.conversation_url)
+                        sandbox_info_url = f'{str(conversation_url.scheme)}://{str(conversation_url.netloc)}/server_info'
+                        response = await httpx_client.get(sandbox_info_url)
+                        response.raise_for_status()
+                        server_info = response.json()
+
+                        # If the server has not been running long, we consider it still starting
+                        uptime = int(server_info.get('uptime'))
+                        if uptime < _RESUME_GRACE_PERIOD:
+                            app_conversation.sandbox_status = SandboxStatus.STARTING
+
+                    except Exception:
+                        # The sandbox is marked as RUNNING, but the server is not responding.
+                        # There is a bug in runtime API which means that the server is marked
+                        # as RUNNING before it is actually started. (Primarily affecting resumed
+                        # runtimes) As a temporary work around for this, we mark the server as
+                        # STARTING. If the sandbox is actually in an error state, the API will
+                        # discover this quite quickly and mark the sandbox as ERROR
+                        logger.warning(
+                            'get_sandbox_info_failed',
+                            extra={
+                                'conversation_id': app_conversation.id,
+                                'sandbox_id': app_conversation.sandbox_id,
+                            },
+                            exc_info=True,
+                            stack_info=True,
+                        )
+                        app_conversation.sandbox_status = SandboxStatus.STARTING
+
                return _to_conversation_info(app_conversation)
        except (ValueError, TypeError, Exception):
            # Not a V1 conversation or service error
--- a/tests/unit/server/routes/test_conversation_routes.py
+++ b/tests/unit/server/routes/test_conversation_routes.py
@@ -3,6 +3,7 @@ from datetime import datetime, timezone
 from unittest.mock import AsyncMock, MagicMock, patch
 from uuid import uuid4

+import httpx
 import pytest
 from fastapi import status
 from fastapi.responses import JSONResponse
@@ -12,6 +13,7 @@ from openhands.app_server.app_conversation.app_conversation_info_service import
 )
 from openhands.app_server.app_conversation.app_conversation_models import (
    AgentType,
+    AppConversation,
    AppConversationInfo,
    AppConversationPage,
    AppConversationStartRequest,
@@ -21,8 +23,10 @@ from openhands.app_server.app_conversation.app_conversation_models import (
 from openhands.app_server.app_conversation.app_conversation_service import (
    AppConversationService,
 )
+from openhands.app_server.sandbox.sandbox_models import SandboxStatus
 from openhands.microagent.microagent import KnowledgeMicroagent, RepoMicroagent
 from openhands.microagent.types import MicroagentMetadata, MicroagentType
+from openhands.server.data_models.conversation_info import ConversationStatus
 from openhands.server.data_models.conversation_info_result_set import (
    ConversationInfoResultSet,
 )
@@ -32,7 +36,9 @@ from openhands.server.routes.conversation import (
    get_microagents,
 )
 from openhands.server.routes.manage_conversations import (
+    _RESUME_GRACE_PERIOD,
    UpdateConversationRequest,
+    get_conversation,
    search_conversations,
    update_conversation,
 )
@@ -1459,3 +1465,102 @@ async def test_search_conversations_include_sub_conversations_with_other_filters
                assert call_kwargs.get('include_sub_conversations') is True
                assert call_kwargs.get('page_id') == 'test_v1_page_id'
                assert call_kwargs.get('limit') == 50
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    'sandbox_status,execution_status,server_uptime,server_error,expected_status,should_call_server',
+    [
+        # RUNNING sandbox, no execution_status, server responds within grace period -> STARTING
+        (SandboxStatus.RUNNING, None, 30, None, ConversationStatus.STARTING, True),
+        # RUNNING sandbox, no execution_status, server responds past grace period -> RUNNING
+        (
+            SandboxStatus.RUNNING,
+            None,
+            _RESUME_GRACE_PERIOD + 10,
+            None,
+            ConversationStatus.RUNNING,
+            True,
+        ),
+        # RUNNING sandbox, no execution_status, server unresponsive -> STARTING
+        (
+            SandboxStatus.RUNNING,
+            None,
+            None,
+            httpx.ConnectError('Connection refused'),
+            ConversationStatus.STARTING,
+            True,
+        ),
+        # RUNNING sandbox, WITH execution_status -> RUNNING, skip server check
+        (SandboxStatus.RUNNING, 'IDLE', None, None, ConversationStatus.RUNNING, False),
+        # Non-RUNNING sandbox -> STARTING, skip server check
+        (SandboxStatus.STARTING, None, None, None, ConversationStatus.STARTING, False),
+    ],
+    ids=[
+        'running_no_exec_status_within_grace_period',
+        'running_no_exec_status_past_grace_period',
+        'running_no_exec_status_server_unresponsive',
+        'running_with_exec_status_skips_check',
+        'non_running_skips_check',
+    ],
+)
+async def test_get_conversation_resume_status_handling(
+    sandbox_status,
+    execution_status,
+    server_uptime,
+    server_error,
+    expected_status,
+    should_call_server,
+):
+    """Test get_conversation handles resume status correctly for various scenarios."""
+    from openhands.sdk.conversation.state import ConversationExecutionStatus
+
+    conversation_id = uuid4()
+
+    # Convert string execution_status to enum if provided
+    exec_status = None
+    if execution_status == 'IDLE':
+        exec_status = ConversationExecutionStatus.IDLE
+
+    mock_app_conversation = AppConversation(
+        id=conversation_id,
+        created_by_user_id='test_user',
+        sandbox_id='test_sandbox',
+        sandbox_status=sandbox_status,
+        execution_status=exec_status,
+        conversation_url='https://sandbox.example.com/conversation',
+        created_at=datetime.now(timezone.utc),
+        updated_at=datetime.now(timezone.utc),
+    )
+
+    mock_app_conversation_service = AsyncMock(spec=AppConversationService)
+    mock_app_conversation_service.get_app_conversation.return_value = (
+        mock_app_conversation
+    )
+    mock_conversation_store = AsyncMock(spec=ConversationStore)
+    mock_httpx_client = AsyncMock(spec=httpx.AsyncClient)
+
+    if server_error:
+        mock_httpx_client.get.side_effect = server_error
+    elif server_uptime is not None:
+        mock_response = MagicMock()
+        mock_response.json.return_value = {'uptime': server_uptime}
+        mock_response.raise_for_status = MagicMock()
+        mock_httpx_client.get.return_value = mock_response
+
+    result = await get_conversation(
+        conversation_id=str(conversation_id),
+        conversation_store=mock_conversation_store,
+        app_conversation_service=mock_app_conversation_service,
+        httpx_client=mock_httpx_client,
+    )
+
+    assert result is not None
+    assert result.status == expected_status
+
+    if should_call_server:
+        mock_httpx_client.get.assert_called_once_with(
+            'https://sandbox.example.com/server_info'
+        )
+    else:
+        mock_httpx_client.get.assert_not_called()