Fix runtime status error on conversation resume (#12718)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Tim O'Farrell
2026-02-02 14:04:45 -07:00
committed by GitHub
parent d6c11fe517
commit 1bb4c844d4
2 changed files with 147 additions and 0 deletions

View File

@@ -15,6 +15,7 @@ import re
import uuid
from datetime import datetime, timedelta, timezone
from typing import Annotated
from urllib.parse import urlparse
import base62
import httpx
@@ -40,6 +41,7 @@ from openhands.app_server.config import (
depends_httpx_client,
depends_sandbox_service,
)
from openhands.app_server.sandbox.sandbox_models import SandboxStatus
from openhands.app_server.sandbox.sandbox_service import SandboxService
from openhands.app_server.services.db_session_injector import set_db_session_keep_open
from openhands.app_server.services.httpx_client_injector import (
@@ -119,6 +121,7 @@ app_conversation_info_service_dependency = depends_app_conversation_info_service
sandbox_service_dependency = depends_sandbox_service()
db_session_dependency = depends_db_session()
httpx_client_dependency = depends_httpx_client()
_RESUME_GRACE_PERIOD = 60
def _filter_conversations_by_age(
@@ -470,6 +473,7 @@ async def get_conversation(
conversation_id: str = Depends(validate_conversation_id),
conversation_store: ConversationStore = Depends(get_conversation_store),
app_conversation_service: AppConversationService = app_conversation_service_dependency,
httpx_client: httpx.AsyncClient = httpx_client_dependency,
) -> ConversationInfo | None:
"""Get a single conversation by ID.
@@ -484,6 +488,44 @@ async def get_conversation(
conversation_uuid
)
if app_conversation:
if (
app_conversation.sandbox_status == SandboxStatus.RUNNING
and app_conversation.execution_status is None
):
# The sandbox is running, but we were unable to determine a status for
# the conversation. It may be that it is still starting, or that the
# conversation has been stopped / deleted.
try:
# Check the server info is available
conversation_url = urlparse(app_conversation.conversation_url)
sandbox_info_url = f'{str(conversation_url.scheme)}://{str(conversation_url.netloc)}/server_info'
response = await httpx_client.get(sandbox_info_url)
response.raise_for_status()
server_info = response.json()
# If the server has not been running long, we consider it still starting
uptime = int(server_info.get('uptime'))
if uptime < _RESUME_GRACE_PERIOD:
app_conversation.sandbox_status = SandboxStatus.STARTING
except Exception:
# The sandbox is marked as RUNNING, but the server is not responding.
# There is a bug in runtime API which means that the server is marked
# as RUNNING before it is actually started. (Primarily affecting resumed
# runtimes) As a temporary work around for this, we mark the server as
# STARTING. If the sandbox is actually in an error state, the API will
# discover this quite quickly and mark the sandbox as ERROR
logger.warning(
'get_sandbox_info_failed',
extra={
'conversation_id': app_conversation.id,
'sandbox_id': app_conversation.sandbox_id,
},
exc_info=True,
stack_info=True,
)
app_conversation.sandbox_status = SandboxStatus.STARTING
return _to_conversation_info(app_conversation)
except (ValueError, TypeError, Exception):
# Not a V1 conversation or service error