From 19fcf427baa4d7b88afdfabab8e7f77e4a9afff9 Mon Sep 17 00:00:00 2001 From: tofarr Date: Thu, 5 Jun 2025 16:42:18 -0600 Subject: [PATCH] Improved WebSocket Error Handling (#8924) Co-authored-by: openhands --- frontend/src/api/open-hands.ts | 20 +++++++++++++++++ .../features/controls/agent-status-bar.tsx | 2 +- frontend/src/context/ws-client-provider.tsx | 22 +++++++++++++------ .../hooks/query/use-active-conversation.ts | 2 +- frontend/src/i18n/declaration.ts | 1 + frontend/src/i18n/translation.json | 16 ++++++++++++++ frontend/src/routes/conversation.tsx | 9 ++++++-- .../docker_nested_conversation_manager.py | 18 ++++++++++++++- .../standalone_conversation_manager.py | 2 ++ .../test_standalone_conversation_manager.py | 7 ++++++ 10 files changed, 87 insertions(+), 12 deletions(-) diff --git a/frontend/src/api/open-hands.ts b/frontend/src/api/open-hands.ts index 705c56a679..1c20f0c302 100644 --- a/frontend/src/api/open-hands.ts +++ b/frontend/src/api/open-hands.ts @@ -236,6 +236,26 @@ class OpenHands { return data; } + static async startConversation( + conversationId: string, + ): Promise { + const { data } = await openHands.post( + `/api/conversations/${conversationId}/start`, + ); + + return data; + } + + static async stopConversation( + conversationId: string, + ): Promise { + const { data } = await openHands.post( + `/api/conversations/${conversationId}/stop`, + ); + + return data; + } + /** * Get the settings from the server or use the default settings if not found */ diff --git a/frontend/src/components/features/controls/agent-status-bar.tsx b/frontend/src/components/features/controls/agent-status-bar.tsx index efa9be15b7..4900d9661d 100644 --- a/frontend/src/components/features/controls/agent-status-bar.tsx +++ b/frontend/src/components/features/controls/agent-status-bar.tsx @@ -84,7 +84,7 @@ export function AgentStatusBar() { setStatusMessage(t(I18nKey.STATUS$STARTING_RUNTIME)); setIndicatorColor(IndicatorColor.RED); } else if (status === WsClientProviderStatus.DISCONNECTED) { - setStatusMessage(t(I18nKey.STATUS$CONNECTED)); // Using STATUS$CONNECTED instead of STATUS$CONNECTING + setStatusMessage(t(I18nKey.STATUS$WEBSOCKET_CLOSED)); setIndicatorColor(IndicatorColor.RED); } else { setStatusMessage(AGENT_STATUS_MAP[curAgentState].message); diff --git a/frontend/src/context/ws-client-provider.tsx b/frontend/src/context/ws-client-provider.tsx index fd3e0434ae..6edd3cded4 100644 --- a/frontend/src/context/ws-client-provider.tsx +++ b/frontend/src/context/ws-client-provider.tsx @@ -150,7 +150,8 @@ export function WsClientProvider({ const { providers } = useUserProviders(); const messageRateHandler = useRate({ threshold: 250 }); - const { data: conversation } = useActiveConversation(); + const { data: conversation, refetch: refetchConversation } = + useActiveConversation(); function send(event: Record) { if (!sioRef.current) { @@ -269,14 +270,11 @@ export function WsClientProvider({ sio.io.opts.query.latest_event_id = lastEventRef.current?.id; updateStatusWhenErrorMessagePresent(data); - setErrorMessage( - hasValidMessageProperty(data) - ? data.message - : "The WebSocket connection was closed.", - ); + setErrorMessage(hasValidMessageProperty(data) ? data.message : ""); } function handleError(data: unknown) { + // set status setStatus(WsClientProviderStatus.DISCONNECTED); updateStatusWhenErrorMessagePresent(data); @@ -285,6 +283,9 @@ export function WsClientProvider({ ? data.message : "An unknown error occurred on the WebSocket connection.", ); + + // check if something went wrong with the conversation. + refetchConversation(); } React.useEffect(() => { @@ -300,12 +301,19 @@ export function WsClientProvider({ if (!conversationId) { throw new Error("No conversation ID provided"); } - if (!conversation || conversation.status === "STARTING") { + if ( + !conversation || + ["STOPPED", "STARTING"].includes(conversation.status) + ) { return () => undefined; // conversation not yet loaded } let sio = sioRef.current; + if (sio?.connected) { + sio.disconnect(); + } + const lastEvent = lastEventRef.current; const query = { latest_event_id: lastEvent?.id ?? -1, diff --git a/frontend/src/hooks/query/use-active-conversation.ts b/frontend/src/hooks/query/use-active-conversation.ts index 5e3b208d0f..141a4c0744 100644 --- a/frontend/src/hooks/query/use-active-conversation.ts +++ b/frontend/src/hooks/query/use-active-conversation.ts @@ -9,7 +9,7 @@ export const useActiveConversation = () => { const { conversationId } = useConversationId(); const userConversation = useUserConversation(conversationId, (query) => { if (query.state.data?.status === "STARTING") { - return 2000; // 2 seconds + return 3000; // 3 seconds } return FIVE_MINUTES; }); diff --git a/frontend/src/i18n/declaration.ts b/frontend/src/i18n/declaration.ts index 4227a7386e..f051bd0817 100644 --- a/frontend/src/i18n/declaration.ts +++ b/frontend/src/i18n/declaration.ts @@ -1,5 +1,6 @@ // this file generate by script, don't modify it manually!!! export enum I18nKey { + STATUS$WEBSOCKET_CLOSED = "STATUS$WEBSOCKET_CLOSED", HOME$LAUNCH_FROM_SCRATCH = "HOME$LAUNCH_FROM_SCRATCH", HOME$READ_THIS = "HOME$READ_THIS", AUTH$LOGGING_BACK_IN = "AUTH$LOGGING_BACK_IN", diff --git a/frontend/src/i18n/translation.json b/frontend/src/i18n/translation.json index f7b79332f6..c22d4a3581 100644 --- a/frontend/src/i18n/translation.json +++ b/frontend/src/i18n/translation.json @@ -1,4 +1,20 @@ { + "STATUS$WEBSOCKET_CLOSED": { + "en": "The WebSocket connection was closed.", + "ja": "WebSocket接続が閉じられました。", + "zh-CN": "WebSocket连接已关闭。", + "zh-TW": "WebSocket連接已關閉。", + "ko-KR": "WebSocket 연결이 닫혔습니다.", + "no": "WebSocket-tilkoblingen ble lukket.", + "it": "La connessione WebSocket è stata chiusa.", + "pt": "A conexão WebSocket foi fechada.", + "es": "La conexión WebSocket se ha cerrado.", + "ar": "تم إغلاق اتصال WebSocket.", + "fr": "La connexion WebSocket a été fermée.", + "tr": "WebSocket bağlantısı kapatıldı.", + "de": "Die WebSocket-Verbindung wurde geschlossen.", + "uk": "З'єднання WebSocket було закрито." + }, "HOME$LAUNCH_FROM_SCRATCH": { "en": "Launch from Scratch", "ja": "ゼロから始める", diff --git a/frontend/src/routes/conversation.tsx b/frontend/src/routes/conversation.tsx index 140c3e3062..7361d34d43 100644 --- a/frontend/src/routes/conversation.tsx +++ b/frontend/src/routes/conversation.tsx @@ -43,7 +43,7 @@ function AppContent() { const { t } = useTranslation(); const { data: settings } = useSettings(); const { conversationId } = useConversationId(); - const { data: conversation, isFetched } = useActiveConversation(); + const { data: conversation, isFetched, refetch } = useActiveConversation(); const { data: isAuthed } = useIsAuthed(); const { curAgentState } = useSelector((state: RootState) => state.agent); @@ -61,8 +61,13 @@ function AppContent() { "This conversation does not exist, or you do not have permission to access it.", ); navigate("/"); + } else if (conversation?.status === "STOPPED") { + // start the conversation if the state is stopped on initial load + OpenHands.startConversation(conversation.conversation_id).then(() => + refetch(), + ); } - }, [conversation, isFetched, isAuthed]); + }, [conversation?.conversation_id, isFetched, isAuthed]); React.useEffect(() => { dispatch(clearTerminal()); diff --git a/openhands/server/conversation_manager/docker_nested_conversation_manager.py b/openhands/server/conversation_manager/docker_nested_conversation_manager.py index 671bebe2d7..73a0648100 100644 --- a/openhands/server/conversation_manager/docker_nested_conversation_manager.py +++ b/openhands/server/conversation_manager/docker_nested_conversation_manager.py @@ -281,7 +281,23 @@ class DockerNestedConversationManager(ConversationManager): raise ValueError('unsupported_operation') async def close_session(self, sid: str): - stop_all_containers(f'openhands-runtime-{sid}') + # First try to graceful stop server. + try: + container = self.docker_client.containers.get(f'openhands-runtime-{sid}') + except docker.errors.NotFound as e: + return + try: + nested_url = self.get_nested_url_for_container(container) + async with httpx.AsyncClient( + headers={ + 'X-Session-API-Key': self._get_session_api_key_for_conversation(sid) + } + ) as client: + response = await client.post(f'{nested_url}/api/conversations/{sid}/stop') + response.raise_for_status() + except Exception: + logger.exception("error_stopping_container") + container.stop() async def get_agent_loop_info(self, user_id: str | None = None, filter_to_sids: set[str] | None = None) -> list[AgentLoopInfo]: results = [] diff --git a/openhands/server/conversation_manager/standalone_conversation_manager.py b/openhands/server/conversation_manager/standalone_conversation_manager.py index a0b8f9e950..c27aa03966 100644 --- a/openhands/server/conversation_manager/standalone_conversation_manager.py +++ b/openhands/server/conversation_manager/standalone_conversation_manager.py @@ -369,7 +369,9 @@ class StandaloneConversationManager(ConversationManager): f'removing connections: {connection_ids_to_remove}', extra={'session_id': sid}, ) + # Perform a graceful shutdown of each connection for connection_id in connection_ids_to_remove: + await self.sio.disconnect(connection_id) self._local_connection_id_to_session_id.pop(connection_id, None) session = self._local_agent_loops_by_sid.pop(sid, None) diff --git a/tests/unit/test_standalone_conversation_manager.py b/tests/unit/test_standalone_conversation_manager.py index 5e71cda96b..e584691852 100644 --- a/tests/unit/test_standalone_conversation_manager.py +++ b/tests/unit/test_standalone_conversation_manager.py @@ -167,6 +167,7 @@ async def test_add_to_local_event_stream(): @pytest.mark.asyncio async def test_cleanup_session_connections(): sio = get_mock_sio() + sio.disconnect = AsyncMock() # Mock the disconnect method async with StandaloneConversationManager( sio, OpenHandsConfig(), InMemoryFileStore(), MonitoringListener() ) as conversation_manager: @@ -181,6 +182,7 @@ async def test_cleanup_session_connections(): await conversation_manager._close_session('session1') + # Check that connections were removed from the dictionary remaining_connections = conversation_manager._local_connection_id_to_session_id assert 'conn1' not in remaining_connections assert 'conn2' not in remaining_connections @@ -188,3 +190,8 @@ async def test_cleanup_session_connections(): assert 'conn4' in remaining_connections assert remaining_connections['conn3'] == 'session2' assert remaining_connections['conn4'] == 'session2' + + # Check that disconnect was called for each connection + assert sio.disconnect.await_count == 2 + sio.disconnect.assert_any_call('conn1') + sio.disconnect.assert_any_call('conn2')