From e9cafb0372d2b0926cbb6e35567fefb0a35cadd7 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 19 Dec 2024 12:28:29 -0500
Subject: [PATCH] chore: Cleanup runtime exception handling (#5696)

---
 evaluation/benchmarks/swe_bench/run_infer.py  |   7 +-
 evaluation/utils/shared.py                    |  32 +++++
 openhands/controller/agent_controller.py      |   7 +-
 openhands/core/exceptions.py                  | 113 +++++++++++++++---
 openhands/runtime/base.py                     |  19 +--
 openhands/runtime/builder/base.py             |   2 +-
 openhands/runtime/builder/docker.py           |  13 +-
 openhands/runtime/builder/remote.py           |  15 ++-
 .../impl/eventstream/eventstream_runtime.py   |  27 +++--
 .../runtime/impl/remote/remote_runtime.py     |  40 ++++---
 .../runtime/impl/runloop/runloop_runtime.py   |   8 +-
 openhands/runtime/utils/runtime_build.py      |   3 +-
 openhands/server/routes/files.py              |  15 +--
 openhands/server/session/agent_session.py     |   5 +-
 openhands/server/session/manager.py           |   4 +-
 tests/unit/test_agent_controller.py           |   4 +-
 16 files changed, 219 insertions(+), 95 deletions(-)

diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index b97b5d9361..8da11d517f 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     assert_and_raise,
     codeact_user_response,
+    is_fatal_evaluation_error,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -400,11 +401,7 @@ def process_instance(
         )
 
         # if fatal error, throw EvalError to trigger re-run
-        if (
-            state.last_error
-            and 'fatal error during agent execution' in state.last_error
-            and 'stuck in a loop' not in state.last_error
-        ):
+        if is_fatal_evaluation_error(state.last_error):
             raise EvalException('Fatal error detected: ' + state.last_error)
 
         # ======= THIS IS SWE-Bench specific =======
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index 517ecc5235..5a4fa2a2ad 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -16,6 +16,16 @@ from tqdm import tqdm
 
 from openhands.controller.state.state import State
 from openhands.core.config import LLMConfig
+from openhands.core.exceptions import (
+    AgentRuntimeBuildError,
+    AgentRuntimeDisconnectedError,
+    AgentRuntimeError,
+    AgentRuntimeNotFoundError,
+    AgentRuntimeNotReadyError,
+    AgentRuntimeTimeoutError,
+    AgentRuntimeUnavailableError,
+    AgentStuckInLoopError,
+)
 from openhands.core.logger import get_console_handler
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import Action
@@ -503,3 +513,25 @@ def compatibility_for_eval_history_pairs(
         history_pairs.append((event_to_dict(action), event_to_dict(observation)))
 
     return history_pairs
+
+
+def is_fatal_evaluation_error(error: str | None) -> bool:
+    """Return True when `error` names a runtime failure worth a re-run.
+
+    Agent-stuck-in-loop errors are agent behaviour, not runtime faults.
+    """
+    if not error or AgentStuckInLoopError.__name__ in error:
+        return False
+
+    fatal_exceptions = (
+        AgentRuntimeError,
+        AgentRuntimeBuildError,
+        AgentRuntimeTimeoutError,
+        AgentRuntimeUnavailableError,
+        AgentRuntimeNotReadyError,
+        AgentRuntimeDisconnectedError,
+        AgentRuntimeNotFoundError,
+    )
+    if is_fatal := any(exc.__name__ in error for exc in fatal_exceptions):
+        logger.error(f'Fatal evaluation error detected: {error}')
+    return is_fatal
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index e749334187..a6b666f136 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -12,6 +12,7 @@ from openhands.controller.state.state import State, TrafficControlState
 from openhands.controller.stuck import StuckDetector
 from openhands.core.config import AgentConfig, LLMConfig
 from openhands.core.exceptions import (
+    AgentStuckInLoopError,
     FunctionCallNotExistsError,
     FunctionCallValidationError,
     LLMMalformedActionError,
@@ -196,7 +197,7 @@ class AgentController:
             err_id = ''
             if isinstance(e, litellm.AuthenticationError):
                 err_id = 'STATUS$ERROR_LLM_AUTHENTICATION'
-            self.status_callback('error', err_id, str(e))
+            self.status_callback('error', err_id, type(e).__name__ + ': ' + str(e))
 
     async def start_step_loop(self):
         """The main loop for the agent's step-by-step execution."""
@@ -502,7 +503,9 @@ class AgentController:
             return
 
         if self._is_stuck():
-            await self._react_to_exception(RuntimeError('Agent got stuck in a loop'))
+            await self._react_to_exception(
+                AgentStuckInLoopError('Agent got stuck in a loop')
+            )
             return
 
         self.update_state_before_step()
diff --git a/openhands/core/exceptions.py b/openhands/core/exceptions.py
index bf5a29f607..a33c821b7e 100644
--- a/openhands/core/exceptions.py
+++ b/openhands/core/exceptions.py
@@ -1,14 +1,25 @@
-class AgentNoInstructionError(Exception):
+# ============================================
+# Agent Exceptions
+# ============================================
+
+
+class AgentError(Exception):
+    """Base class for agent exceptions (runtime failures use AgentRuntimeError)."""
+
+    pass
+
+
+class AgentNoInstructionError(AgentError):
     def __init__(self, message='Instruction must be provided'):
         super().__init__(message)
 
 
-class AgentEventTypeError(Exception):
+class AgentEventTypeError(AgentError):
     def __init__(self, message='Event must be a dictionary'):
         super().__init__(message)
 
 
-class AgentAlreadyRegisteredError(Exception):
+class AgentAlreadyRegisteredError(AgentError):
     def __init__(self, name=None):
         if name is not None:
             message = f"Agent class already registered under '{name}'"
@@ -17,7 +28,7 @@ class AgentAlreadyRegisteredError(Exception):
         super().__init__(message)
 
 
-class AgentNotRegisteredError(Exception):
+class AgentNotRegisteredError(AgentError):
     def __init__(self, name=None):
         if name is not None:
             message = f"No agent class registered under '{name}'"
@@ -26,6 +37,16 @@ class AgentNotRegisteredError(Exception):
         super().__init__(message)
 
 
+class AgentStuckInLoopError(AgentError):  # reported when AgentController._is_stuck()
+    def __init__(self, message='Agent got stuck in a loop'):
+        super().__init__(message)
+
+
+# ============================================
+# Agent Controller Exceptions
+# ============================================
+
+
 class TaskInvalidStateError(Exception):
     def __init__(self, state=None):
         if state is not None:
@@ -35,17 +56,9 @@ class TaskInvalidStateError(Exception):
         super().__init__(message)
 
 
-class BrowserInitException(Exception):
-    def __init__(self, message='Failed to initialize browser environment'):
-        super().__init__(message)
-
-
-class BrowserUnavailableException(Exception):
-    def __init__(
-        self,
-        message='Browser environment is not available, please check if has been initialized',
-    ):
-        super().__init__(message)
+# ============================================
+# LLM Exceptions
+# ============================================
 
 
 # This exception gets sent back to the LLM
@@ -96,6 +109,11 @@ class CloudFlareBlockageError(Exception):
     pass
 
 
+# ============================================
+# LLM function calling Exceptions
+# ============================================
+
+
 class FunctionCallConversionError(Exception):
     """Exception raised when FunctionCallingConverter failed to convert a non-function call message to a function call message.
 
@@ -121,3 +139,68 @@ class FunctionCallNotExistsError(Exception):
 
     def __init__(self, message):
         super().__init__(message)
+
+
+# ============================================
+# Agent Runtime Exceptions
+# ============================================
+
+
+class AgentRuntimeError(Exception):
+    """Base for all agent runtime exceptions; raised directly for generic failures."""
+
+    pass
+
+
+class AgentRuntimeBuildError(AgentRuntimeError):
+    """Raised when building a runtime image fails, times out, or is interrupted."""
+
+    pass
+
+
+class AgentRuntimeTimeoutError(AgentRuntimeError):
+    """Raised when a runtime action or copy request exceeds its timeout."""
+
+    pass
+
+
+class AgentRuntimeUnavailableError(AgentRuntimeError):
+    """Raised when a runtime is unavailable (failed to start or not alive)."""
+
+    pass
+
+
+class AgentRuntimeNotReadyError(AgentRuntimeUnavailableError):
+    """Raised while a runtime is not ready yet; RemoteRuntime retries on it."""
+
+    pass
+
+
+class AgentRuntimeDisconnectedError(AgentRuntimeUnavailableError):
+    """Raised when a runtime is lost (container exited, or 404 from the runtime)."""
+
+    pass
+
+
+class AgentRuntimeNotFoundError(AgentRuntimeUnavailableError):
+    """Raised when the container or existing runtime to attach to is not found."""
+
+    pass
+
+
+# ============================================
+# Browser Exceptions
+# ============================================
+
+
+class BrowserInitException(Exception):  # browser environment failed to initialize
+    def __init__(self, message='Failed to initialize browser environment'):
+        super().__init__(message)
+
+
+class BrowserUnavailableException(Exception):  # browser environment not available
+    def __init__(
+        self,
+        message='Browser environment is not available, please check if has been initialized',
+    ):
+        super().__init__(message)
diff --git a/openhands/runtime/base.py b/openhands/runtime/base.py
index e2d8044ba7..1dcafdcb75 100644
--- a/openhands/runtime/base.py
+++ b/openhands/runtime/base.py
@@ -9,6 +9,7 @@ from typing import Callable
 from requests.exceptions import ConnectionError
 
 from openhands.core.config import AppConfig, SandboxConfig
+from openhands.core.exceptions import AgentRuntimeDisconnectedError
 from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventSource, EventStream, EventStreamSubscriber
 from openhands.events.action import (
@@ -47,22 +48,6 @@ STATUS_MESSAGES = {
 }
 
 
-class RuntimeUnavailableError(Exception):
-    pass
-
-
-class RuntimeNotReadyError(RuntimeUnavailableError):
-    pass
-
-
-class RuntimeDisconnectedError(RuntimeUnavailableError):
-    pass
-
-
-class RuntimeNotFoundError(RuntimeUnavailableError):
-    pass
-
-
 def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
     ret = {}
     for key in os.environ:
@@ -193,7 +178,7 @@ class Runtime(FileEditRuntimeMixin):
             except Exception as e:
                 err_id = ''
                 if isinstance(e, ConnectionError) or isinstance(
-                    e, RuntimeDisconnectedError
+                    e, AgentRuntimeDisconnectedError
                 ):
                     err_id = 'STATUS$ERROR_RUNTIME_DISCONNECTED'
                 logger.error(
diff --git a/openhands/runtime/builder/base.py b/openhands/runtime/builder/base.py
index acfe3c60fb..6bc1155d7f 100644
--- a/openhands/runtime/builder/base.py
+++ b/openhands/runtime/builder/base.py
@@ -24,7 +24,7 @@ class RuntimeBuilder(abc.ABC):
                 registry prefix). This should be used for subsequent use (e.g., `docker run`).
 
         Raises:
-            RuntimeError: If the build failed.
+            AgentRuntimeBuildError: If the build failed.
         """
         pass
 
diff --git a/openhands/runtime/builder/docker.py b/openhands/runtime/builder/docker.py
index 880b1c73c5..d15aa4fa4e 100644
--- a/openhands/runtime/builder/docker.py
+++ b/openhands/runtime/builder/docker.py
@@ -6,6 +6,7 @@ import time
 import docker
 
 from openhands import __version__ as oh_version
+from openhands.core.exceptions import AgentRuntimeBuildError
 from openhands.core.logger import RollingLogger
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder.base import RuntimeBuilder
@@ -19,7 +20,9 @@ class DockerRuntimeBuilder(RuntimeBuilder):
         version_info = self.docker_client.version()
         server_version = version_info.get('Version', '').replace('-', '.')
         if tuple(map(int, server_version.split('.')[:2])) < (18, 9):
-            raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
+            raise AgentRuntimeBuildError(
+                'Docker server version must be >= 18.09 to use BuildKit'
+            )
 
         self.rolling_logger = RollingLogger(max_lines=10)
 
@@ -44,7 +47,7 @@ class DockerRuntimeBuilder(RuntimeBuilder):
             str: The name of the built Docker image.
 
         Raises:
-            RuntimeError: If the Docker server version is incompatible or if the build process fails.
+            AgentRuntimeBuildError: If the Docker server version is incompatible or if the build process fails.
 
         Note:
             This method uses Docker BuildKit for improved build performance and caching capabilities.
@@ -55,7 +58,9 @@ class DockerRuntimeBuilder(RuntimeBuilder):
         version_info = self.docker_client.version()
         server_version = version_info.get('Version', '').replace('-', '.')
         if tuple(map(int, server_version.split('.'))) < (18, 9):
-            raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
+            raise AgentRuntimeBuildError(
+                'Docker server version must be >= 18.09 to use BuildKit'
+            )
 
         target_image_hash_name = tags[0]
         target_image_repo, target_image_source_tag = target_image_hash_name.split(':')
@@ -154,7 +159,7 @@ class DockerRuntimeBuilder(RuntimeBuilder):
         # Check if the image is built successfully
         image = self.docker_client.images.get(target_image_hash_name)
         if image is None:
-            raise RuntimeError(
+            raise AgentRuntimeBuildError(
                 f'Build failed: Image {target_image_hash_name} not found'
             )
 
diff --git a/openhands/runtime/builder/remote.py b/openhands/runtime/builder/remote.py
index 5cfe1a4943..2e2c67c5a8 100644
--- a/openhands/runtime/builder/remote.py
+++ b/openhands/runtime/builder/remote.py
@@ -5,6 +5,7 @@ import time
 
 import requests
 
+from openhands.core.exceptions import AgentRuntimeBuildError
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder import RuntimeBuilder
 from openhands.runtime.utils.request import send_request
@@ -77,7 +78,7 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
         while should_continue():
             if time.time() - start_time > timeout:
                 logger.error('Build timed out after 30 minutes')
-                raise RuntimeError('Build timed out after 30 minutes')
+                raise AgentRuntimeBuildError('Build timed out after 30 minutes')
 
             status_response = send_request(
                 self.session,
@@ -88,7 +89,7 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
 
             if status_response.status_code != 200:
                 logger.error(f'Failed to get build status: {status_response.text}')
-                raise RuntimeError(
+                raise AgentRuntimeBuildError(
                     f'Failed to get build status: {status_response.text}'
                 )
 
@@ -110,12 +111,14 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
                     'error', f'Build failed with status: {status}. Build ID: {build_id}'
                 )
                 logger.error(error_message)
-                raise RuntimeError(error_message)
+                raise AgentRuntimeBuildError(error_message)
 
             # Wait before polling again
             sleep_if_should_continue(30)
 
-        raise RuntimeError('Build interrupted (likely received SIGTERM or SIGINT).')
+        raise AgentRuntimeBuildError(
+            'Build interrupted (likely received SIGTERM or SIGINT).'
+        )
 
     def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
         """Checks if an image exists in the remote registry using the /image_exists endpoint."""
@@ -129,7 +132,9 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
 
         if response.status_code != 200:
             logger.error(f'Failed to check image existence: {response.text}')
-            raise RuntimeError(f'Failed to check image existence: {response.text}')
+            raise AgentRuntimeBuildError(
+                f'Failed to check image existence: {response.text}'
+            )
 
         result = response.json()
 
diff --git a/openhands/runtime/impl/eventstream/eventstream_runtime.py b/openhands/runtime/impl/eventstream/eventstream_runtime.py
index becff94fb1..384cfd5e48 100644
--- a/openhands/runtime/impl/eventstream/eventstream_runtime.py
+++ b/openhands/runtime/impl/eventstream/eventstream_runtime.py
@@ -12,6 +12,13 @@ import requests
 import tenacity
 
 from openhands.core.config import AppConfig
+from openhands.core.exceptions import (
+    AgentRuntimeDisconnectedError,
+    AgentRuntimeError,
+    AgentRuntimeNotFoundError,
+    AgentRuntimeNotReadyError,
+    AgentRuntimeTimeoutError,
+)
 from openhands.core.logger import DEBUG
 from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventStream
@@ -34,11 +41,7 @@ from openhands.events.observation import (
 )
 from openhands.events.serialization import event_to_dict, observation_from_dict
 from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS
-from openhands.runtime.base import (
-    Runtime,
-    RuntimeDisconnectedError,
-    RuntimeNotFoundError,
-)
+from openhands.runtime.base import Runtime
 from openhands.runtime.builder import DockerRuntimeBuilder
 from openhands.runtime.impl.eventstream.containers import remove_all_containers
 from openhands.runtime.plugins import PluginRequirement
@@ -358,14 +361,16 @@ class EventStreamRuntime(Runtime):
         try:
             container = self.docker_client.containers.get(self.container_name)
             if container.status == 'exited':
-                raise RuntimeDisconnectedError(
+                raise AgentRuntimeDisconnectedError(
                     f'Container {self.container_name} has exited.'
                 )
         except docker.errors.NotFound:
-            raise RuntimeNotFoundError(f'Container {self.container_name} not found.')
+            raise AgentRuntimeNotFoundError(
+                f'Container {self.container_name} not found.'
+            )
 
         if not self.log_streamer:
-            raise RuntimeError('Runtime client is not ready.')
+            raise AgentRuntimeNotReadyError('Runtime client is not ready.')
 
         with send_request(
             self.session,
@@ -445,7 +450,7 @@ class EventStreamRuntime(Runtime):
                     obs = observation_from_dict(output)
                     obs._cause = action.id  # type: ignore[attr-defined]
             except requests.Timeout:
-                raise RuntimeError(
+                raise AgentRuntimeTimeoutError(
                     f'Runtime failed to return execute_action before the requested timeout of {action.timeout}s'
                 )
 
@@ -514,9 +519,9 @@ class EventStreamRuntime(Runtime):
                 pass
 
         except requests.Timeout:
-            raise TimeoutError('Copy operation timed out')
+            raise AgentRuntimeTimeoutError('Copy operation timed out')
         except Exception as e:
-            raise RuntimeError(f'Copy operation failed: {str(e)}')
+            raise AgentRuntimeError(f'Copy operation failed: {str(e)}')
         finally:
             if recursive:
                 os.unlink(temp_zip_path)
diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py
index ab3aac4ec9..9dc3827268 100644
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -10,6 +10,14 @@ import requests
 import tenacity
 
 from openhands.core.config import AppConfig
+from openhands.core.exceptions import (
+    AgentRuntimeDisconnectedError,
+    AgentRuntimeError,
+    AgentRuntimeNotFoundError,
+    AgentRuntimeNotReadyError,
+    AgentRuntimeTimeoutError,
+    AgentRuntimeUnavailableError,
+)
 from openhands.events import EventStream
 from openhands.events.action import (
     BrowseInteractiveAction,
@@ -28,13 +36,7 @@ from openhands.events.observation import (
 )
 from openhands.events.serialization import event_to_dict, observation_from_dict
 from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS
-from openhands.runtime.base import (
-    Runtime,
-    RuntimeDisconnectedError,
-    RuntimeNotFoundError,
-    RuntimeNotReadyError,
-    RuntimeUnavailableError,
-)
+from openhands.runtime.base import Runtime
 from openhands.runtime.builder.remote import RemoteRuntimeBuilder
 from openhands.runtime.plugins import PluginRequirement
 from openhands.runtime.utils.command import get_remote_startup_command
@@ -100,7 +102,7 @@ class RemoteRuntime(Runtime):
     async def connect(self):
         try:
             await call_sync_from_async(self._start_or_attach_to_runtime)
-        except RuntimeNotReadyError:
+        except AgentRuntimeNotReadyError:
             self.log('error', 'Runtime failed to start, timed out before ready')
             raise
         await call_sync_from_async(self.setup_initial_env)
@@ -111,7 +113,7 @@ class RemoteRuntime(Runtime):
         if existing_runtime:
             self.log('debug', f'Using existing runtime with ID: {self.runtime_id}')
         elif self.attach_to_existing:
-            raise RuntimeNotFoundError(
+            raise AgentRuntimeNotFoundError(
                 f'Could not find existing runtime for SID: {self.sid}'
             )
         else:
@@ -215,7 +217,7 @@ class RemoteRuntime(Runtime):
             timeout=60,
         ) as response:
             if not response.json()['exists']:
-                raise RuntimeError(
+                raise AgentRuntimeError(
                     f'Container image {self.container_image} does not exist'
                 )
 
@@ -262,7 +264,7 @@ class RemoteRuntime(Runtime):
             )
         except requests.HTTPError as e:
             self.log('error', f'Unable to start runtime: {e}')
-            raise RuntimeUnavailableError() from e
+            raise AgentRuntimeUnavailableError() from e
 
     def _resume_runtime(self):
         with self._send_request(
@@ -322,7 +324,7 @@ class RemoteRuntime(Runtime):
             )
             | stop_if_should_exit(),
             reraise=True,
-            retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
+            retry=tenacity.retry_if_exception_type(AgentRuntimeNotReadyError),
             wait=tenacity.wait_fixed(2),
         )
         return retry_decorator(self._wait_until_alive_impl)()
@@ -356,7 +358,7 @@ class RemoteRuntime(Runtime):
                 self.log(
                     'warning', f"Runtime /alive failed, but pod says it's ready: {e}"
                 )
-                raise RuntimeNotReadyError(
+                raise AgentRuntimeNotReadyError(
                     f'Runtime /alive failed to respond with 200: {e}'
                 )
             return
@@ -365,14 +367,14 @@ class RemoteRuntime(Runtime):
             or pod_status == 'pending'
             or pod_status == 'running'
         ):  # nb: Running is not yet Ready
-            raise RuntimeNotReadyError(
+            raise AgentRuntimeNotReadyError(
                 f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
             )
         elif pod_status in ('failed', 'unknown', 'crashloopbackoff'):
             # clean up the runtime
             self.close()
-            raise RuntimeError(
-                f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}'
+            raise AgentRuntimeUnavailableError(
+                f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}. Pod Logs:\n{runtime_data.get("pod_logs", "N/A")}'
             )
         else:
             # Maybe this should be a hard failure, but passing through in case the API changes
@@ -382,7 +384,7 @@ class RemoteRuntime(Runtime):
             'debug',
             f'Waiting for runtime pod to be active. Current status: {pod_status}',
         )
-        raise RuntimeNotReadyError()
+        raise AgentRuntimeNotReadyError()
 
     def close(self, timeout: int = 10):
         if self.config.sandbox.keep_runtime_alive or self.attach_to_existing:
@@ -437,7 +439,7 @@ class RemoteRuntime(Runtime):
                 obs = observation_from_dict(output)
                 obs._cause = action.id  # type: ignore[attr-defined]
             except requests.Timeout:
-                raise RuntimeError(
+                raise AgentRuntimeTimeoutError(
                     f'Runtime failed to return execute_action before the requested timeout of {action.timeout}s'
                 )
             return obs
@@ -451,7 +453,7 @@ class RemoteRuntime(Runtime):
             raise
         except requests.HTTPError as e:
             if is_runtime_request and e.response.status_code == 404:
-                raise RuntimeDisconnectedError(
+                raise AgentRuntimeDisconnectedError(
                     f'404 error while connecting to {self.runtime_url}'
                 )
             elif is_runtime_request and e.response.status_code == 503:
diff --git a/openhands/runtime/impl/runloop/runloop_runtime.py b/openhands/runtime/impl/runloop/runloop_runtime.py
index 064aa104c3..368244a03c 100644
--- a/openhands/runtime/impl/runloop/runloop_runtime.py
+++ b/openhands/runtime/impl/runloop/runloop_runtime.py
@@ -10,6 +10,10 @@ from runloop_api_client.types import DevboxView
 from runloop_api_client.types.shared_params import LaunchParameters
 
 from openhands.core.config import AppConfig
+from openhands.core.exceptions import (
+    AgentRuntimeNotReadyError,
+    AgentRuntimeUnavailableError,
+)
 from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventStream
 from openhands.runtime.impl.eventstream.eventstream_runtime import EventStreamRuntime
@@ -227,7 +231,7 @@ class RunloopRuntime(EventStreamRuntime):
     )
     def _wait_until_alive(self):
         if not self.log_streamer:
-            raise RuntimeError('Runtime client is not ready.')
+            raise AgentRuntimeNotReadyError('Runtime client is not ready.')
         response = send_request(
             self.session,
             'GET',
@@ -239,7 +243,7 @@ class RunloopRuntime(EventStreamRuntime):
         else:
             msg = f'Action execution API is not alive. Response: {response}'
             logger.error(msg)
-            raise RuntimeError(msg)
+            raise AgentRuntimeUnavailableError(msg)
 
     def close(self, rm_all_containers: bool | None = True):
         if self.log_streamer:
diff --git a/openhands/runtime/utils/runtime_build.py b/openhands/runtime/utils/runtime_build.py
index de939efd9a..bbb83ac7f9 100644
--- a/openhands/runtime/utils/runtime_build.py
+++ b/openhands/runtime/utils/runtime_build.py
@@ -14,6 +14,7 @@ from jinja2 import Environment, FileSystemLoader
 
 import openhands
 from openhands import __version__ as oh_version
+from openhands.core.exceptions import AgentRuntimeBuildError
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder import DockerRuntimeBuilder, RuntimeBuilder
 
@@ -364,7 +365,7 @@ def _build_sandbox_image(
         extra_build_args=extra_build_args,
     )
     if not image_name:
-        raise RuntimeError(f'Build failed for image {names}')
+        raise AgentRuntimeBuildError(f'Build failed for image {names}')
 
     return image_name
 
diff --git a/openhands/server/routes/files.py b/openhands/server/routes/files.py
index c2d37350c8..3193376286 100644
--- a/openhands/server/routes/files.py
+++ b/openhands/server/routes/files.py
@@ -13,6 +13,7 @@ from fastapi.responses import FileResponse, JSONResponse
 from pathspec import PathSpec
 from pathspec.patterns import GitWildMatchPattern
 
+from openhands.core.exceptions import AgentRuntimeUnavailableError
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
     FileReadAction,
@@ -23,7 +24,7 @@ from openhands.events.observation import (
     FileReadObservation,
     FileWriteObservation,
 )
-from openhands.runtime.base import Runtime, RuntimeUnavailableError
+from openhands.runtime.base import Runtime
 from openhands.server.file_config import (
     FILES_TO_IGNORE,
     MAX_FILE_SIZE_MB,
@@ -66,7 +67,7 @@ async def list_files(request: Request, path: str | None = None):
     runtime: Runtime = request.state.conversation.runtime
     try:
         file_list = await call_sync_from_async(runtime.list_files, path)
-    except RuntimeUnavailableError as e:
+    except AgentRuntimeUnavailableError as e:
         logger.error(f'Error listing files: {e}', exc_info=True)
         return JSONResponse(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -93,7 +94,7 @@ async def list_files(request: Request, path: str | None = None):
 
     try:
         file_list = await filter_for_gitignore(file_list, '')
-    except RuntimeUnavailableError as e:
+    except AgentRuntimeUnavailableError as e:
         logger.error(f'Error filtering files: {e}', exc_info=True)
         return JSONResponse(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -129,7 +130,7 @@ async def select_file(file: str, request: Request):
     read_action = FileReadAction(file)
     try:
         observation = await call_sync_from_async(runtime.run_action, read_action)
-    except RuntimeUnavailableError as e:
+    except AgentRuntimeUnavailableError as e:
         logger.error(f'Error opening file {file}: {e}', exc_info=True)
         return JSONResponse(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -205,7 +206,7 @@ async def upload_file(request: Request, files: list[UploadFile]):
                         tmp_file_path,
                         runtime.config.workspace_mount_path_in_sandbox,
                     )
-                except RuntimeUnavailableError as e:
+                except AgentRuntimeUnavailableError as e:
                     logger.error(
                         f'Error saving file {safe_filename}: {e}', exc_info=True
                     )
@@ -282,7 +283,7 @@ async def save_file(request: Request):
         write_action = FileWriteAction(file_path, content)
         try:
             observation = await call_sync_from_async(runtime.run_action, write_action)
-        except RuntimeUnavailableError as e:
+        except AgentRuntimeUnavailableError as e:
             logger.error(f'Error saving file: {e}', exc_info=True)
             return JSONResponse(
                 status_code=500,
@@ -317,7 +318,7 @@ async def zip_current_workspace(request: Request, background_tasks: BackgroundTa
         path = runtime.config.workspace_mount_path_in_sandbox
         try:
             zip_file = await call_sync_from_async(runtime.copy_from, path)
-        except RuntimeUnavailableError as e:
+        except AgentRuntimeUnavailableError as e:
             logger.error(f'Error zipping workspace: {e}', exc_info=True)
             return JSONResponse(
                 status_code=500,
diff --git a/openhands/server/session/agent_session.py b/openhands/server/session/agent_session.py
index 17bd3c6d1c..7a7108c1f1 100644
--- a/openhands/server/session/agent_session.py
+++ b/openhands/server/session/agent_session.py
@@ -5,13 +5,14 @@ from openhands.controller import AgentController
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig, AppConfig, LLMConfig
+from openhands.core.exceptions import AgentRuntimeUnavailableError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.schema.agent import AgentState
 from openhands.events.action import ChangeAgentStateAction
 from openhands.events.event import EventSource
 from openhands.events.stream import EventStream
 from openhands.runtime import get_runtime_cls
-from openhands.runtime.base import Runtime, RuntimeUnavailableError
+from openhands.runtime.base import Runtime
 from openhands.security import SecurityAnalyzer, options
 from openhands.storage.files import FileStore
 from openhands.utils.async_utils import call_async_from_sync
@@ -222,7 +223,7 @@ class AgentSession:
 
         try:
             await self.runtime.connect()
-        except RuntimeUnavailableError as e:
+        except AgentRuntimeUnavailableError as e:
             logger.error(f'Runtime initialization failed: {e}', exc_info=True)
             if self._status_callback:
                 self._status_callback(
diff --git a/openhands/server/session/manager.py b/openhands/server/session/manager.py
index 7ab8d2a817..1a90cc48fd 100644
--- a/openhands/server/session/manager.py
+++ b/openhands/server/session/manager.py
@@ -6,9 +6,9 @@ from dataclasses import dataclass, field
 import socketio
 
 from openhands.core.config import AppConfig
+from openhands.core.exceptions import AgentRuntimeUnavailableError
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.stream import EventStream, session_exists
-from openhands.runtime.base import RuntimeUnavailableError
 from openhands.server.session.conversation import Conversation
 from openhands.server.session.session import ROOM_KEY, Session
 from openhands.server.session.session_init_data import SessionInitData
@@ -160,7 +160,7 @@ class SessionManager:
             c = Conversation(sid, file_store=self.file_store, config=self.config)
             try:
                 await c.connect()
-            except RuntimeUnavailableError as e:
+            except AgentRuntimeUnavailableError as e:
                 logger.error(f'Error connecting to conversation {c.sid}: {e}')
                 return None
             end_time = time.time()
diff --git a/tests/unit/test_agent_controller.py b/tests/unit/test_agent_controller.py
index 08fe0e0f55..48c9d633c0 100644
--- a/tests/unit/test_agent_controller.py
+++ b/tests/unit/test_agent_controller.py
@@ -161,7 +161,7 @@ async def test_run_controller_with_fatal_error(mock_agent, mock_event_stream):
     print(f'event_stream: {list(event_stream.get_events())}')
     assert state.iteration == 4
     assert state.agent_state == AgentState.ERROR
-    assert state.last_error == 'Agent got stuck in a loop'
+    assert state.last_error == 'AgentStuckInLoopError: Agent got stuck in a loop'
     assert len(list(event_stream.get_events())) == 11
 
 
@@ -227,7 +227,7 @@ async def test_run_controller_stop_with_stuck():
     assert last_event['observation'] == 'agent_state_changed'
 
     assert state.agent_state == AgentState.ERROR
-    assert state.last_error == 'Agent got stuck in a loop'
+    assert state.last_error == 'AgentStuckInLoopError: Agent got stuck in a loop'
 
 
 @pytest.mark.asyncio