chore: Clean up runtime exception handling (#5696)

2025-12-26 05:48:36 +08:00 · 2024-12-19 12:28:29 -05:00 · 2024-12-19 12:28:29 -05:00 · e9cafb0372
commit e9cafb0372
parent 13097f9d1d
16 changed files with 219 additions and 95 deletions
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@ -15,6 +15,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    assert_and_raise,
    codeact_user_response,
+    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@ -400,11 +401,7 @@ def process_instance(
        )

        # if fatal error, throw EvalError to trigger re-run
-        if (
-            state.last_error
-            and 'fatal error during agent execution' in state.last_error
-            and 'stuck in a loop' not in state.last_error
-        ):
+        if is_fatal_evaluation_error(state.last_error):
            raise EvalException('Fatal error detected: ' + state.last_error)

        # ======= THIS IS SWE-Bench specific =======
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@ -16,6 +16,16 @@ from tqdm import tqdm

 from openhands.controller.state.state import State
 from openhands.core.config import LLMConfig
+from openhands.core.exceptions import (
+    AgentRuntimeBuildError,
+    AgentRuntimeDisconnectedError,
+    AgentRuntimeError,
+    AgentRuntimeNotFoundError,
+    AgentRuntimeNotReadyError,
+    AgentRuntimeTimeoutError,
+    AgentRuntimeUnavailableError,
+    AgentStuckInLoopError,
+)
 from openhands.core.logger import get_console_handler
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import Action
@ -503,3 +513,25 @@ def compatibility_for_eval_history_pairs(
        history_pairs.append((event_to_dict(action), event_to_dict(observation)))

    return history_pairs
+
+
+def is_fatal_evaluation_error(error: str | None) -> bool:
+    if not error:
+        return False
+
+    FATAL_EXCEPTIONS = [
+        AgentRuntimeError,
+        AgentRuntimeBuildError,
+        AgentRuntimeTimeoutError,
+        AgentRuntimeUnavailableError,
+        AgentRuntimeNotReadyError,
+        AgentRuntimeDisconnectedError,
+        AgentRuntimeNotFoundError,
+        AgentStuckInLoopError,
+    ]
+
+    if any(exception.__name__ in error for exception in FATAL_EXCEPTIONS):
+        logger.error(f'Fatal evaluation error detected: {error}')
+        return True
+
+    return False
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@ -12,6 +12,7 @@ from openhands.controller.state.state import State, TrafficControlState
 from openhands.controller.stuck import StuckDetector
 from openhands.core.config import AgentConfig, LLMConfig
 from openhands.core.exceptions import (
+    AgentStuckInLoopError,
    FunctionCallNotExistsError,
    FunctionCallValidationError,
    LLMMalformedActionError,
@ -196,7 +197,7 @@ class AgentController:
            err_id = ''
            if isinstance(e, litellm.AuthenticationError):
                err_id = 'STATUS$ERROR_LLM_AUTHENTICATION'
-            self.status_callback('error', err_id, str(e))
+            self.status_callback('error', err_id, type(e).__name__ + ': ' + str(e))

    async def start_step_loop(self):
        """The main loop for the agent's step-by-step execution."""
@ -502,7 +503,9 @@ class AgentController:
            return

        if self._is_stuck():
-            await self._react_to_exception(RuntimeError('Agent got stuck in a loop'))
+            await self._react_to_exception(
+                AgentStuckInLoopError('Agent got stuck in a loop')
+            )
            return

        self.update_state_before_step()
--- a/openhands/core/exceptions.py
+++ b/openhands/core/exceptions.py
@ -1,14 +1,25 @@
-class AgentNoInstructionError(Exception):
+# ============================================
+# Agent Exceptions
+# ============================================
+
+
+class AgentError(Exception):
+    """Base class for all agent exceptions."""
+
+    pass
+
+
+class AgentNoInstructionError(AgentError):
    def __init__(self, message='Instruction must be provided'):
        super().__init__(message)


-class AgentEventTypeError(Exception):
+class AgentEventTypeError(AgentError):
    def __init__(self, message='Event must be a dictionary'):
        super().__init__(message)


-class AgentAlreadyRegisteredError(Exception):
+class AgentAlreadyRegisteredError(AgentError):
    def __init__(self, name=None):
        if name is not None:
            message = f"Agent class already registered under '{name}'"
@ -17,7 +28,7 @@ class AgentAlreadyRegisteredError(Exception):
        super().__init__(message)


-class AgentNotRegisteredError(Exception):
+class AgentNotRegisteredError(AgentError):
    def __init__(self, name=None):
        if name is not None:
            message = f"No agent class registered under '{name}'"
@ -26,6 +37,16 @@ class AgentNotRegisteredError(Exception):
        super().__init__(message)


+class AgentStuckInLoopError(AgentError):
+    def __init__(self, message='Agent got stuck in a loop'):
+        super().__init__(message)
+
+
+# ============================================
+# Agent Controller Exceptions
+# ============================================
+
+
 class TaskInvalidStateError(Exception):
    def __init__(self, state=None):
        if state is not None:
@ -35,17 +56,9 @@ class TaskInvalidStateError(Exception):
        super().__init__(message)


-class BrowserInitException(Exception):
-    def __init__(self, message='Failed to initialize browser environment'):
-        super().__init__(message)
-
-
-class BrowserUnavailableException(Exception):
-    def __init__(
-        self,
-        message='Browser environment is not available, please check if has been initialized',
-    ):
-        super().__init__(message)
+# ============================================
+# LLM Exceptions
+# ============================================


 # This exception gets sent back to the LLM
@ -96,6 +109,11 @@ class CloudFlareBlockageError(Exception):
    pass


+# ============================================
+# LLM function calling Exceptions
+# ============================================
+
+
 class FunctionCallConversionError(Exception):
    """Exception raised when FunctionCallingConverter failed to convert a non-function call message to a function call message.

@ -121,3 +139,68 @@ class FunctionCallNotExistsError(Exception):

    def __init__(self, message):
        super().__init__(message)
+
+
+# ============================================
+# Agent Runtime Exceptions
+# ============================================
+
+
+class AgentRuntimeError(Exception):
+    """Base class for all agent runtime exceptions."""
+
+    pass
+
+
+class AgentRuntimeBuildError(AgentRuntimeError):
+    """Exception raised when an agent runtime build operation fails."""
+
+    pass
+
+
+class AgentRuntimeTimeoutError(AgentRuntimeError):
+    """Exception raised when an agent runtime operation times out."""
+
+    pass
+
+
+class AgentRuntimeUnavailableError(AgentRuntimeError):
+    """Exception raised when an agent runtime is unavailable."""
+
+    pass
+
+
+class AgentRuntimeNotReadyError(AgentRuntimeUnavailableError):
+    """Exception raised when an agent runtime is not ready."""
+
+    pass
+
+
+class AgentRuntimeDisconnectedError(AgentRuntimeUnavailableError):
+    """Exception raised when an agent runtime is disconnected."""
+
+    pass
+
+
+class AgentRuntimeNotFoundError(AgentRuntimeUnavailableError):
+    """Exception raised when an agent runtime is not found."""
+
+    pass
+
+
+# ============================================
+# Browser Exceptions
+# ============================================
+
+
+class BrowserInitException(Exception):
+    def __init__(self, message='Failed to initialize browser environment'):
+        super().__init__(message)
+
+
+class BrowserUnavailableException(Exception):
+    def __init__(
+        self,
+        message='Browser environment is not available, please check if has been initialized',
+    ):
+        super().__init__(message)
--- a/openhands/runtime/base.py
+++ b/openhands/runtime/base.py
@ -9,6 +9,7 @@ from typing import Callable
 from requests.exceptions import ConnectionError

 from openhands.core.config import AppConfig, SandboxConfig
+from openhands.core.exceptions import AgentRuntimeDisconnectedError
 from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventSource, EventStream, EventStreamSubscriber
 from openhands.events.action import (
@ -47,22 +48,6 @@ STATUS_MESSAGES = {
 }


-class RuntimeUnavailableError(Exception):
-    pass
-
-
-class RuntimeNotReadyError(RuntimeUnavailableError):
-    pass
-
-
-class RuntimeDisconnectedError(RuntimeUnavailableError):
-    pass
-
-
-class RuntimeNotFoundError(RuntimeUnavailableError):
-    pass
-
-
 def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
    ret = {}
    for key in os.environ:
@ -193,7 +178,7 @@ class Runtime(FileEditRuntimeMixin):
            except Exception as e:
                err_id = ''
                if isinstance(e, ConnectionError) or isinstance(
-                    e, RuntimeDisconnectedError
+                    e, AgentRuntimeDisconnectedError
                ):
                    err_id = 'STATUS$ERROR_RUNTIME_DISCONNECTED'
                logger.error(
--- a/openhands/runtime/builder/base.py
+++ b/openhands/runtime/builder/base.py
@ -24,7 +24,7 @@ class RuntimeBuilder(abc.ABC):
                registry prefix). This should be used for subsequent use (e.g., `docker run`).

        Raises:
-            RuntimeError: If the build failed.
+            AgentRuntimeBuildError: If the build failed.
        """
        pass

--- a/openhands/runtime/builder/docker.py
+++ b/openhands/runtime/builder/docker.py
@ -6,6 +6,7 @@ import time
 import docker

 from openhands import __version__ as oh_version
+from openhands.core.exceptions import AgentRuntimeBuildError
 from openhands.core.logger import RollingLogger
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder.base import RuntimeBuilder
@ -19,7 +20,9 @@ class DockerRuntimeBuilder(RuntimeBuilder):
        version_info = self.docker_client.version()
        server_version = version_info.get('Version', '').replace('-', '.')
        if tuple(map(int, server_version.split('.')[:2])) < (18, 9):
-            raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
+            raise AgentRuntimeBuildError(
+                'Docker server version must be >= 18.09 to use BuildKit'
+            )

        self.rolling_logger = RollingLogger(max_lines=10)

@ -44,7 +47,7 @@ class DockerRuntimeBuilder(RuntimeBuilder):
            str: The name of the built Docker image.

        Raises:
-            RuntimeError: If the Docker server version is incompatible or if the build process fails.
+            AgentRuntimeBuildError: If the Docker server version is incompatible or if the build process fails.

        Note:
            This method uses Docker BuildKit for improved build performance and caching capabilities.
@ -55,7 +58,9 @@ class DockerRuntimeBuilder(RuntimeBuilder):
        version_info = self.docker_client.version()
        server_version = version_info.get('Version', '').replace('-', '.')
        if tuple(map(int, server_version.split('.'))) < (18, 9):
-            raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
+            raise AgentRuntimeBuildError(
+                'Docker server version must be >= 18.09 to use BuildKit'
+            )

        target_image_hash_name = tags[0]
        target_image_repo, target_image_source_tag = target_image_hash_name.split(':')
@ -154,7 +159,7 @@ class DockerRuntimeBuilder(RuntimeBuilder):
        # Check if the image is built successfully
        image = self.docker_client.images.get(target_image_hash_name)
        if image is None:
-            raise RuntimeError(
+            raise AgentRuntimeBuildError(
                f'Build failed: Image {target_image_hash_name} not found'
            )

--- a/openhands/runtime/builder/remote.py
+++ b/openhands/runtime/builder/remote.py
@ -5,6 +5,7 @@ import time

 import requests

+from openhands.core.exceptions import AgentRuntimeBuildError
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder import RuntimeBuilder
 from openhands.runtime.utils.request import send_request
@ -77,7 +78,7 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
        while should_continue():
            if time.time() - start_time > timeout:
                logger.error('Build timed out after 30 minutes')
-                raise RuntimeError('Build timed out after 30 minutes')
+                raise AgentRuntimeBuildError('Build timed out after 30 minutes')

            status_response = send_request(
                self.session,
@ -88,7 +89,7 @@ class RemoteRuntimeBuilder(RuntimeBuilder):

            if status_response.status_code != 200:
                logger.error(f'Failed to get build status: {status_response.text}')
-                raise RuntimeError(
+                raise AgentRuntimeBuildError(
                    f'Failed to get build status: {status_response.text}'
                )

@ -110,12 +111,14 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
                    'error', f'Build failed with status: {status}. Build ID: {build_id}'
                )
                logger.error(error_message)
-                raise RuntimeError(error_message)
+                raise AgentRuntimeBuildError(error_message)

            # Wait before polling again
            sleep_if_should_continue(30)

-        raise RuntimeError('Build interrupted (likely received SIGTERM or SIGINT).')
+        raise AgentRuntimeBuildError(
+            'Build interrupted (likely received SIGTERM or SIGINT).'
+        )

    def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
        """Checks if an image exists in the remote registry using the /image_exists endpoint."""
@ -129,7 +132,9 @@ class RemoteRuntimeBuilder(RuntimeBuilder):

        if response.status_code != 200:
            logger.error(f'Failed to check image existence: {response.text}')
-            raise RuntimeError(f'Failed to check image existence: {response.text}')
+            raise AgentRuntimeBuildError(
+                f'Failed to check image existence: {response.text}'
+            )

        result = response.json()

--- a/openhands/runtime/impl/eventstream/eventstream_runtime.py
+++ b/openhands/runtime/impl/eventstream/eventstream_runtime.py
@ -12,6 +12,13 @@ import requests
 import tenacity

 from openhands.core.config import AppConfig
+from openhands.core.exceptions import (
+    AgentRuntimeDisconnectedError,
+    AgentRuntimeError,
+    AgentRuntimeNotFoundError,
+    AgentRuntimeNotReadyError,
+    AgentRuntimeTimeoutError,
+)
 from openhands.core.logger import DEBUG
 from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventStream
@ -34,11 +41,7 @@ from openhands.events.observation import (
 )
 from openhands.events.serialization import event_to_dict, observation_from_dict
 from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS
-from openhands.runtime.base import (
-    Runtime,
-    RuntimeDisconnectedError,
-    RuntimeNotFoundError,
-)
+from openhands.runtime.base import Runtime
 from openhands.runtime.builder import DockerRuntimeBuilder
 from openhands.runtime.impl.eventstream.containers import remove_all_containers
 from openhands.runtime.plugins import PluginRequirement
@ -358,14 +361,16 @@ class EventStreamRuntime(Runtime):
        try:
            container = self.docker_client.containers.get(self.container_name)
            if container.status == 'exited':
-                raise RuntimeDisconnectedError(
+                raise AgentRuntimeDisconnectedError(
                    f'Container {self.container_name} has exited.'
                )
        except docker.errors.NotFound:
-            raise RuntimeNotFoundError(f'Container {self.container_name} not found.')
+            raise AgentRuntimeNotFoundError(
+                f'Container {self.container_name} not found.'
+            )

        if not self.log_streamer:
-            raise RuntimeError('Runtime client is not ready.')
+            raise AgentRuntimeNotReadyError('Runtime client is not ready.')

        with send_request(
            self.session,
@ -445,7 +450,7 @@ class EventStreamRuntime(Runtime):
                    obs = observation_from_dict(output)
                    obs._cause = action.id  # type: ignore[attr-defined]
            except requests.Timeout:
-                raise RuntimeError(
+                raise AgentRuntimeTimeoutError(
                    f'Runtime failed to return execute_action before the requested timeout of {action.timeout}s'
                )

@ -514,9 +519,9 @@ class EventStreamRuntime(Runtime):
                pass

        except requests.Timeout:
-            raise TimeoutError('Copy operation timed out')
+            raise AgentRuntimeTimeoutError('Copy operation timed out')
        except Exception as e:
-            raise RuntimeError(f'Copy operation failed: {str(e)}')
+            raise AgentRuntimeError(f'Copy operation failed: {str(e)}')
        finally:
            if recursive:
                os.unlink(temp_zip_path)
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@ -10,6 +10,14 @@ import requests
 import tenacity

 from openhands.core.config import AppConfig
+from openhands.core.exceptions import (
+    AgentRuntimeDisconnectedError,
+    AgentRuntimeError,
+    AgentRuntimeNotFoundError,
+    AgentRuntimeNotReadyError,
+    AgentRuntimeTimeoutError,
+    AgentRuntimeUnavailableError,
+)
 from openhands.events import EventStream
 from openhands.events.action import (
    BrowseInteractiveAction,
@ -28,13 +36,7 @@ from openhands.events.observation import (
 )
 from openhands.events.serialization import event_to_dict, observation_from_dict
 from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS
-from openhands.runtime.base import (
-    Runtime,
-    RuntimeDisconnectedError,
-    RuntimeNotFoundError,
-    RuntimeNotReadyError,
-    RuntimeUnavailableError,
-)
+from openhands.runtime.base import Runtime
 from openhands.runtime.builder.remote import RemoteRuntimeBuilder
 from openhands.runtime.plugins import PluginRequirement
 from openhands.runtime.utils.command import get_remote_startup_command
@ -100,7 +102,7 @@ class RemoteRuntime(Runtime):
    async def connect(self):
        try:
            await call_sync_from_async(self._start_or_attach_to_runtime)
-        except RuntimeNotReadyError:
+        except AgentRuntimeNotReadyError:
            self.log('error', 'Runtime failed to start, timed out before ready')
            raise
        await call_sync_from_async(self.setup_initial_env)
@ -111,7 +113,7 @@ class RemoteRuntime(Runtime):
        if existing_runtime:
            self.log('debug', f'Using existing runtime with ID: {self.runtime_id}')
        elif self.attach_to_existing:
-            raise RuntimeNotFoundError(
+            raise AgentRuntimeNotFoundError(
                f'Could not find existing runtime for SID: {self.sid}'
            )
        else:
@ -215,7 +217,7 @@ class RemoteRuntime(Runtime):
            timeout=60,
        ) as response:
            if not response.json()['exists']:
-                raise RuntimeError(
+                raise AgentRuntimeError(
                    f'Container image {self.container_image} does not exist'
                )

@ -262,7 +264,7 @@ class RemoteRuntime(Runtime):
            )
        except requests.HTTPError as e:
            self.log('error', f'Unable to start runtime: {e}')
-            raise RuntimeUnavailableError() from e
+            raise AgentRuntimeUnavailableError() from e

    def _resume_runtime(self):
        with self._send_request(
@ -322,7 +324,7 @@ class RemoteRuntime(Runtime):
            )
            | stop_if_should_exit(),
            reraise=True,
-            retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
+            retry=tenacity.retry_if_exception_type(AgentRuntimeNotReadyError),
            wait=tenacity.wait_fixed(2),
        )
        return retry_decorator(self._wait_until_alive_impl)()
@ -356,7 +358,7 @@ class RemoteRuntime(Runtime):
                self.log(
                    'warning', f"Runtime /alive failed, but pod says it's ready: {e}"
                )
-                raise RuntimeNotReadyError(
+                raise AgentRuntimeNotReadyError(
                    f'Runtime /alive failed to respond with 200: {e}'
                )
            return
@ -365,14 +367,14 @@ class RemoteRuntime(Runtime):
            or pod_status == 'pending'
            or pod_status == 'running'
        ):  # nb: Running is not yet Ready
-            raise RuntimeNotReadyError(
+            raise AgentRuntimeNotReadyError(
                f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
            )
        elif pod_status in ('failed', 'unknown', 'crashloopbackoff'):
            # clean up the runtime
            self.close()
-            raise RuntimeError(
-                f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}'
+            raise AgentRuntimeUnavailableError(
+                f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}. Pod Logs:\n{runtime_data.get("pod_logs", "N/A")}'
            )
        else:
            # Maybe this should be a hard failure, but passing through in case the API changes
@ -382,7 +384,7 @@ class RemoteRuntime(Runtime):
            'debug',
            f'Waiting for runtime pod to be active. Current status: {pod_status}',
        )
-        raise RuntimeNotReadyError()
+        raise AgentRuntimeNotReadyError()

    def close(self, timeout: int = 10):
        if self.config.sandbox.keep_runtime_alive or self.attach_to_existing:
@ -437,7 +439,7 @@ class RemoteRuntime(Runtime):
                obs = observation_from_dict(output)
                obs._cause = action.id  # type: ignore[attr-defined]
            except requests.Timeout:
-                raise RuntimeError(
+                raise AgentRuntimeTimeoutError(
                    f'Runtime failed to return execute_action before the requested timeout of {action.timeout}s'
                )
            return obs
@ -451,7 +453,7 @@ class RemoteRuntime(Runtime):
            raise
        except requests.HTTPError as e:
            if is_runtime_request and e.response.status_code == 404:
-                raise RuntimeDisconnectedError(
+                raise AgentRuntimeDisconnectedError(
                    f'404 error while connecting to {self.runtime_url}'
                )
            elif is_runtime_request and e.response.status_code == 503:
--- a/openhands/runtime/impl/runloop/runloop_runtime.py
+++ b/openhands/runtime/impl/runloop/runloop_runtime.py
@ -10,6 +10,10 @@ from runloop_api_client.types import DevboxView
 from runloop_api_client.types.shared_params import LaunchParameters

 from openhands.core.config import AppConfig
+from openhands.core.exceptions import (
+    AgentRuntimeNotReadyError,
+    AgentRuntimeUnavailableError,
+)
 from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventStream
 from openhands.runtime.impl.eventstream.eventstream_runtime import EventStreamRuntime
@ -227,7 +231,7 @@ class RunloopRuntime(EventStreamRuntime):
    )
    def _wait_until_alive(self):
        if not self.log_streamer:
-            raise RuntimeError('Runtime client is not ready.')
+            raise AgentRuntimeNotReadyError('Runtime client is not ready.')
        response = send_request(
            self.session,
            'GET',
@ -239,7 +243,7 @@ class RunloopRuntime(EventStreamRuntime):
        else:
            msg = f'Action execution API is not alive. Response: {response}'
            logger.error(msg)
-            raise RuntimeError(msg)
+            raise AgentRuntimeUnavailableError(msg)

    def close(self, rm_all_containers: bool | None = True):
        if self.log_streamer:
--- a/openhands/runtime/utils/runtime_build.py
+++ b/openhands/runtime/utils/runtime_build.py
@ -14,6 +14,7 @@ from jinja2 import Environment, FileSystemLoader

 import openhands
 from openhands import __version__ as oh_version
+from openhands.core.exceptions import AgentRuntimeBuildError
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder import DockerRuntimeBuilder, RuntimeBuilder

@ -364,7 +365,7 @@ def _build_sandbox_image(
        extra_build_args=extra_build_args,
    )
    if not image_name:
-        raise RuntimeError(f'Build failed for image {names}')
+        raise AgentRuntimeBuildError(f'Build failed for image {names}')

    return image_name

--- a/openhands/server/routes/files.py
+++ b/openhands/server/routes/files.py
@ -13,6 +13,7 @@ from fastapi.responses import FileResponse, JSONResponse
 from pathspec import PathSpec
 from pathspec.patterns import GitWildMatchPattern

+from openhands.core.exceptions import AgentRuntimeUnavailableError
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
    FileReadAction,
@ -23,7 +24,7 @@ from openhands.events.observation import (
    FileReadObservation,
    FileWriteObservation,
 )
-from openhands.runtime.base import Runtime, RuntimeUnavailableError
+from openhands.runtime.base import Runtime
 from openhands.server.file_config import (
    FILES_TO_IGNORE,
    MAX_FILE_SIZE_MB,
@ -66,7 +67,7 @@ async def list_files(request: Request, path: str | None = None):
    runtime: Runtime = request.state.conversation.runtime
    try:
        file_list = await call_sync_from_async(runtime.list_files, path)
-    except RuntimeUnavailableError as e:
+    except AgentRuntimeUnavailableError as e:
        logger.error(f'Error listing files: {e}', exc_info=True)
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@ -93,7 +94,7 @@ async def list_files(request: Request, path: str | None = None):

    try:
        file_list = await filter_for_gitignore(file_list, '')
-    except RuntimeUnavailableError as e:
+    except AgentRuntimeUnavailableError as e:
        logger.error(f'Error filtering files: {e}', exc_info=True)
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@ -129,7 +130,7 @@ async def select_file(file: str, request: Request):
    read_action = FileReadAction(file)
    try:
        observation = await call_sync_from_async(runtime.run_action, read_action)
-    except RuntimeUnavailableError as e:
+    except AgentRuntimeUnavailableError as e:
        logger.error(f'Error opening file {file}: {e}', exc_info=True)
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@ -205,7 +206,7 @@ async def upload_file(request: Request, files: list[UploadFile]):
                        tmp_file_path,
                        runtime.config.workspace_mount_path_in_sandbox,
                    )
-                except RuntimeUnavailableError as e:
+                except AgentRuntimeUnavailableError as e:
                    logger.error(
                        f'Error saving file {safe_filename}: {e}', exc_info=True
                    )
@ -282,7 +283,7 @@ async def save_file(request: Request):
        write_action = FileWriteAction(file_path, content)
        try:
            observation = await call_sync_from_async(runtime.run_action, write_action)
-        except RuntimeUnavailableError as e:
+        except AgentRuntimeUnavailableError as e:
            logger.error(f'Error saving file: {e}', exc_info=True)
            return JSONResponse(
                status_code=500,
@ -317,7 +318,7 @@ async def zip_current_workspace(request: Request, background_tasks: BackgroundTa
        path = runtime.config.workspace_mount_path_in_sandbox
        try:
            zip_file = await call_sync_from_async(runtime.copy_from, path)
-        except RuntimeUnavailableError as e:
+        except AgentRuntimeUnavailableError as e:
            logger.error(f'Error zipping workspace: {e}', exc_info=True)
            return JSONResponse(
                status_code=500,
--- a/openhands/server/session/agent_session.py
+++ b/openhands/server/session/agent_session.py
@ -5,13 +5,14 @@ from openhands.controller import AgentController
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig, AppConfig, LLMConfig
+from openhands.core.exceptions import AgentRuntimeUnavailableError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.schema.agent import AgentState
 from openhands.events.action import ChangeAgentStateAction
 from openhands.events.event import EventSource
 from openhands.events.stream import EventStream
 from openhands.runtime import get_runtime_cls
-from openhands.runtime.base import Runtime, RuntimeUnavailableError
+from openhands.runtime.base import Runtime
 from openhands.security import SecurityAnalyzer, options
 from openhands.storage.files import FileStore
 from openhands.utils.async_utils import call_async_from_sync
@ -222,7 +223,7 @@ class AgentSession:

        try:
            await self.runtime.connect()
-        except RuntimeUnavailableError as e:
+        except AgentRuntimeUnavailableError as e:
            logger.error(f'Runtime initialization failed: {e}', exc_info=True)
            if self._status_callback:
                self._status_callback(
--- a/openhands/server/session/manager.py
+++ b/openhands/server/session/manager.py
@ -6,9 +6,9 @@ from dataclasses import dataclass, field
 import socketio

 from openhands.core.config import AppConfig
+from openhands.core.exceptions import AgentRuntimeUnavailableError
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.stream import EventStream, session_exists
-from openhands.runtime.base import RuntimeUnavailableError
 from openhands.server.session.conversation import Conversation
 from openhands.server.session.session import ROOM_KEY, Session
 from openhands.server.session.session_init_data import SessionInitData
@ -160,7 +160,7 @@ class SessionManager:
            c = Conversation(sid, file_store=self.file_store, config=self.config)
            try:
                await c.connect()
-            except RuntimeUnavailableError as e:
+            except AgentRuntimeUnavailableError as e:
                logger.error(f'Error connecting to conversation {c.sid}: {e}')
                return None
            end_time = time.time()
--- a/tests/unit/test_agent_controller.py
+++ b/tests/unit/test_agent_controller.py
@ -161,7 +161,7 @@ async def test_run_controller_with_fatal_error(mock_agent, mock_event_stream):
    print(f'event_stream: {list(event_stream.get_events())}')
    assert state.iteration == 4
    assert state.agent_state == AgentState.ERROR
-    assert state.last_error == 'Agent got stuck in a loop'
+    assert state.last_error == 'AgentStuckInLoopError: Agent got stuck in a loop'
    assert len(list(event_stream.get_events())) == 11


@ -227,7 +227,7 @@ async def test_run_controller_stop_with_stuck():
    assert last_event['observation'] == 'agent_state_changed'

    assert state.agent_state == AgentState.ERROR
-    assert state.last_error == 'Agent got stuck in a loop'
+    assert state.last_error == 'AgentStuckInLoopError: Agent got stuck in a loop'


@pytest.mark.asyncio