Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/loggers

2026-03-22 13:47:19 +08:00 · 2024-09-24 23:19:12 +02:00
parent b3a9adcb7b 1b1d8f0b02
commit 7170642fed
61 changed files with 2675 additions and 1268 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -228,3 +228,4 @@ runtime_*.tar
 # docker build
 containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
+containers/runtime/code
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -5,6 +5,7 @@ from agenthub.codeact_agent.action_parser import CodeActResponseParser
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
+from openhands.core.exceptions import OperationCancelled
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
@@ -153,7 +154,10 @@ class CodeActAgent(Agent):
            text = truncate_content(text, max_message_chars)
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, AgentDelegateObservation):
-            text = obs_prefix + truncate_content(str(obs.outputs), max_message_chars)
+            text = obs_prefix + truncate_content(
+                obs.outputs['content'] if 'content' in obs.outputs else '',
+                max_message_chars,
+            )
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, ErrorObservation):
            text = obs_prefix + truncate_content(obs.content, max_message_chars)
@@ -200,7 +204,6 @@ class CodeActAgent(Agent):
                '</execute_bash>',
                '</execute_browse>',
            ],
-            'temperature': 0.0,
        }

        if self.llm.is_caching_prompt_active():
@@ -208,8 +211,11 @@ class CodeActAgent(Agent):
                'anthropic-beta': 'prompt-caching-2024-07-31',
            }

+        # TODO: move exception handling to agent_controller
        try:
            response = self.llm.completion(**params)
+        except OperationCancelled as e:
+            raise e
        except Exception as e:
            logger.error(f'{e}')
            error_message = '{}: {}'.format(type(e).__name__, str(e).split('\n')[0])
--- a/config.template.toml
+++ b/config.template.toml
@@ -159,7 +159,7 @@ model = "gpt-4o"
 #timeout = 0

 # Top p for the API
-#top_p = 0.5
+#top_p = 1.0

 # If model is vision capable, this option allows to disable image processing (useful for cost reduction).
 #disable_vision = true
--- a/containers/runtime/README.md
+++ b/containers/runtime/README.md
@@ -1,11 +1,12 @@
-# Dynamic constructed Dockerfile
+# Dynamically constructed Dockerfile

-This folder builds runtime image (sandbox), which will use a `Dockerfile` that is dynamically generated depends on the `base_image` AND a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that's based on the current commit of `openhands`.
+This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
+that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.

-The following command will generate Dockerfile for `ubuntu:22.04` and the source distribution `.tar` into `containers/runtime`.
+The following command will generate a `Dockerfile` for `nikolaik/python-nodejs:python3.11-nodejs22` (the default base image), an updated `config.sh`, and the runtime source distribution files/folders into `containers/runtime`:

 ```bash
 poetry run python3 openhands/runtime/utils/runtime_build.py \
-    --base_image ubuntu:22.04 \
+    --base_image nikolaik/python-nodejs:python3.11-nodejs22 \
    --build_folder containers/runtime
 ```
--- a/docs/modules/usage/llms/azure-llms.md
+++ b/docs/modules/usage/llms/azure-llms.md
@@ -1,6 +1,6 @@
 # Azure

-OpenHands uses LiteLLM for completion calls. You can find their documentation on Azure [here](https://docs.litellm.ai/docs/providers/azure).
+OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their documentation on using Azure as a provider [here](https://docs.litellm.ai/docs/providers/azure).

 ## Azure OpenAI Configuration

@@ -27,7 +27,7 @@ You will need your ChatGPT deployment name which can be found on the deployments

 * Enable `Advanced Options`
 * `Custom Model` to azure/&lt;deployment-name&gt;
-* `Base URL` to your Azure API Base URL (Example: `https://example-endpoint.openai.azure.com`)
+* `Base URL` to your Azure API Base URL (e.g. `https://example-endpoint.openai.azure.com`)
 * `API Key` to your Azure API key

 ## Embeddings
--- a/docs/modules/usage/llms/google-llms.md
+++ b/docs/modules/usage/llms/google-llms.md
@@ -1,6 +1,6 @@
 # Google Gemini/Vertex

-OpenHands uses LiteLLM for completion calls. The following resources are relevant for using OpenHands with Google's LLMs:
+OpenHands uses LiteLLM to make calls to Google's chat models. You can find their documentation on using Google as a provider:

 - [Gemini - Google AI Studio](https://docs.litellm.ai/docs/providers/gemini)
 - [VertexAI - Google Cloud Platform](https://docs.litellm.ai/docs/providers/vertex)
@@ -10,7 +10,7 @@ OpenHands uses LiteLLM for completion calls. The following resources are relevan
 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `Gemini`
 * `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. gemini/&lt;model-name&gt;).
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. gemini/&lt;model-name&gt; like `gemini/gemini-1.5-pro`).
 * `API Key` to your Gemini API key

 ## VertexAI - Google Cloud Platform Configs
@@ -27,4 +27,4 @@ VERTEXAI_LOCATION="<your-gcp-location>"
 Then set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `VertexAI`
 * `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. vertex_ai/&lt;model-name&gt;).
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. vertex_ai/&lt;model-name&gt;).
--- a/docs/modules/usage/llms/groq.md
+++ b/docs/modules/usage/llms/groq.md
@@ -1,15 +1,15 @@
 # Groq

-OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their full documentation on using Groq as provider [here](https://docs.litellm.ai/docs/providers/groq).
+OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their documentation on using Groq as a provider [here](https://docs.litellm.ai/docs/providers/groq).

 ## Configuration

 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `Groq`
-* `LLM Model` to the model you will be using. [Visit **here** to see the list of
+* `LLM Model` to the model you will be using. [Visit here to see the list of
 models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
-`Advanced Options`, and enter it in `Custom Model` (i.e. groq/&lt;model-name&gt;)
-* `API key` to your Groq API key. To find or create your Groq API Key, [see **here**](https://console.groq.com/keys)
+`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`)
+* `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys)



@@ -18,6 +18,6 @@ models that Groq hosts](https://console.groq.com/docs/models). If the model is n
 The Groq endpoint for chat completion is [mostly OpenAI-compatible](https://console.groq.com/docs/openai). Therefore, you can access Groq models as you
 would access any OpenAI-compatible endpoint. You can set the following in the OpenHands UI through the Settings:
 * Enable `Advanced Options`
-* `Custom Model` to the prefix `openai/` + the model you will be using (Example: `openai/llama3-8b-8192`)
+* `Custom Model` to the prefix `openai/` + the model you will be using (e.g. `openai/llama3-70b-8192`)
 * `Base URL` to `https://api.groq.com/openai/v1`
 * `API Key` to your Groq API key
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -54,8 +54,8 @@ We have a few guides for running OpenHands with specific model providers:
 * [Azure](llms/azure-llms)
 * [Google](llms/google-llms)
 * [Groq](llms/groq)
-* [ollama](llms/local-llms)
 * [OpenAI](llms/openai-llms)
+* [OpenRouter](llms/openrouter)

 ### API retries and rate limits

--- a/docs/modules/usage/llms/local-llms.md
+++ b/docs/modules/usage/llms/local-llms.md
@@ -28,17 +28,14 @@ mistral:7b-instruct-v0.2-q4_K_M eb14864c7427    4.4 GB  2 weeks ago
 starcoder2:latest               f67ae0f64584    1.7 GB  19 hours ago
 ```

-## Start OpenHands
-
-### Docker
+## Run OpenHands with Docker

+### Start OpenHands
 Use the instructions [here](../getting-started) to start OpenHands using Docker.
 But when running `docker run`, you'll need to add a few more arguments:

 ```bash
 --add-host host.docker.internal:host-gateway \
-e LLM_API_KEY="ollama" \
-e LLM_BASE_URL="http://host.docker.internal:11434" \
 -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
 ```

@@ -55,8 +52,6 @@ docker run \
    --pull=always \
    --add-host host.docker.internal:host-gateway \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e LLM_API_KEY="ollama" \
-    -e LLM_BASE_URL="http://host.docker.internal:11434" \
    -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -67,6 +62,16 @@ docker run \

 You should now be able to connect to `http://localhost:3000/`

+### Configure the Web Application
+
+When running `openhands`, you'll need to set the following in the OpenHands UI through the Settings:
+- the model to "ollama/&lt;model-name&gt;"
+- the base URL to `http://host.docker.internal:11434`
+- the API key is optional; you can use any string, such as `ollama`.
+
+
+## Run OpenHands in Development Mode
+
 ### Build from Source

 Use the instructions in [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to build OpenHands.
@@ -77,23 +82,22 @@ Make sure `config.toml` is there by running `make setup-config` which will creat
 workspace_base="./workspace"

 [llm]
-model="ollama/codellama:7b"
-api_key="ollama"
 embedding_model="local"
-base_url="http://localhost:11434"
 ollama_base_url="http://localhost:11434"

 ```

-Replace `LLM_MODEL` of your choice if you need to.
+Done! Now you can start OpenHands by running `make run`. You should now be able to connect to `http://localhost:3000/`

-Done! Now you can start OpenHands by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`
-
-## Select your Model
+### Configure the Web Application

 In the OpenHands UI, click on the Settings wheel in the bottom-left corner.
 Then in the `Model` input, enter `ollama/codellama:7b`, or the name of the model you pulled earlier.
-If it doesn’t show up in a dropdown, that’s fine, just type it in. Click Save when you’re done.
+If it doesn’t show up in the dropdown, enable `Advanced Options` and type it in. Please note: you need the model name as listed by `ollama list`, with the prefix `ollama/`.
+
+In the API Key field, enter `ollama` or any value, since you don't need a particular key.
+
+In the Base URL field, enter `http://localhost:11434`.

 And now you're ready to go!

--- a/docs/modules/usage/llms/openai-llms.md
+++ b/docs/modules/usage/llms/openai-llms.md
@@ -1,15 +1,15 @@
 # OpenAI

-OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their full documentation on OpenAI chat calls [here](https://docs.litellm.ai/docs/providers/openai).
+OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their documentation on using OpenAI as a provider [here](https://docs.litellm.ai/docs/providers/openai).

 ## Configuration

 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `OpenAI`
 * `LLM Model` to the model you will be using.
-[Visit **here** to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. openai/&lt;model-name&gt;).
-* `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see **here**](https://platform.openai.com/api-keys).
+[Visit here to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
+* `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see here](https://platform.openai.com/api-keys).

 ## Using OpenAI-Compatible Endpoints

@@ -19,6 +19,6 @@ Just as for OpenAI Chat completions, we use LiteLLM for OpenAI-compatible endpoi

 If you're using an OpenAI proxy, you'll need to set the following in the OpenHands UI through the Settings:
 * Enable `Advanced Options`
-* `Custom Model` to openai/&lt;model-name&gt; (i.e.: `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
+* `Custom Model` to openai/&lt;model-name&gt; (e.g. `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
 * `Base URL` to the URL of your OpenAI proxy
 * `API Key` to your OpenAI API key
--- a/docs/modules/usage/llms/openrouter.md
+++ b/docs/modules/usage/llms/openrouter.md
@@ -0,0 +1,12 @@
+# OpenRouter
+
+OpenHands uses LiteLLM to make calls to chat models on OpenRouter. You can find their documentation on using OpenRouter as a provider [here](https://docs.litellm.ai/docs/providers/openrouter).
+
+## Configuration
+
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+* `LLM Provider` to `OpenRouter`
+* `LLM Model` to the model you will be using.
+[Visit here to see a full list of OpenRouter models](https://openrouter.ai/models).
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
+* `API Key` to your OpenRouter API key.
--- a/docs/modules/usage/troubleshooting/troubleshooting.md
+++ b/docs/modules/usage/troubleshooting/troubleshooting.md
@@ -17,7 +17,6 @@ Check out [Notes for WSL on Windows Users](troubleshooting/windows) for some tro
 ## Common Issues

 * [Unable to connect to Docker](#unable-to-connect-to-docker)
-* [Unable to connect to LLM](#unable-to-connect-to-llm)
 * [404 Resource not found](#404-resource-not-found)
 * [`make build` getting stuck on package installations](#make-build-getting-stuck-on-package-installations)
 * [Sessions are not restored](#sessions-are-not-restored)
@@ -47,33 +46,6 @@ OpenHands uses a Docker container to do its work safely, without potentially bre
 * If you are on a Mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the `Allow the default Docker socket to be used` under `Settings > Advanced` in Docker Desktop.
 * In addition, upgrade your Docker to the latest version under `Check for Updates`

---
-### Unable to connect to LLM
-
-[GitHub Issue](https://github.com/All-Hands-AI/OpenHands/issues/1208)
-
-**Symptoms**
-
-```python
-  File "/app/.venv/lib/python3.12/site-packages/openai/_exceptions.py", line 81, in __init__
-    super().__init__(message, response.request, body=body)
-                              ^^^^^^^^^^^^^^^^
-AttributeError: 'NoneType' object has no attribute 'request'
-```
-
-**Details**
-
-[GitHub Issues](https://github.com/All-Hands-AI/OpenHands/issues?q=is%3Aissue+is%3Aopen+404)
-
-This usually happens with *local* LLM setups, when OpenHands can't connect to the LLM server.
-See our guide for [local LLMs](llms/local-llms) for more information.
-
-**Workarounds**
-
-* Check your `base_url` in your config.toml (if it exists) under the "llm" section
-* Check that ollama (or whatever LLM you're using) is running OK
-* Make sure you're using `--add-host host.docker.internal:host-gateway` when running in Docker
-
 ---
 ### `404 Resource not found`

@@ -115,7 +87,6 @@ the API endpoint you're trying to connect to. Most often this happens for Azure
  * If you're running inside the UI, be sure to set the `model` in the settings modal
  * If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
 * Make sure you've followed any special instructions for your LLM provider
-  * [ollama](/modules/usage/llms/local-llms)
  * [Azure](/modules/usage/llms/azure-llms)
  * [Google](/modules/usage/llms/google-llms)
 * Make sure your API key is correct
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -21,11 +21,6 @@ const sidebars: SidebarsConfig = {
          type: 'category',
          label: 'Providers',
          items: [
-            {
-              type: 'doc',
-              label: 'OpenAI',
-              id: 'usage/llms/openai-llms',
-            },
            {
              type: 'doc',
              label: 'Azure',
@@ -43,9 +38,14 @@ const sidebars: SidebarsConfig = {
            },
            {
              type: 'doc',
-              label: 'Local/ollama',
-              id: 'usage/llms/local-llms',
-            }
+              label: 'OpenAI',
+              id: 'usage/llms/openai-llms',
+            },
+            {
+              type: 'doc',
+              label: 'OpenRouter',
+              id: 'usage/llms/openrouter',
+            },
          ],
        },
      ],
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -24,7 +24,7 @@ This is now the default behavior.

 Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).

-When the `run_infer.sh` script is started, it will automatically pull the relavant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on.
+When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-built docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used to create an OpenHands runtime image in which the agent will operate.

 ```bash
 ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -3,7 +3,6 @@ import tempfile
 import time

 import pandas as pd
-from pydantic import BaseModel
 from swebench.harness.grading import get_eval_report
 from swebench.harness.run_evaluation import (
    APPLY_PATCH_FAIL,
@@ -35,6 +34,36 @@ DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xing
 logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')


+def process_git_patch(patch):
+    if not isinstance(patch, str):
+        return ''
+
+    if not patch.strip():
+        # skip empty patches
+        return ''
+
+    patch = patch.replace('\r\n', '\n')
+    # There might be some weird characters at the beginning of the patch
+    # due to some OpenHands inference command outputs
+
+    # FOR EXAMPLE:
+    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
+    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
+    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
+    # new file mode 100644
+    # index 0000000000..fc13db5948
+
+    # We "find" the first line that starts with "diff" and then we remove lines before it
+    lines = patch.split('\n')
+    for i, line in enumerate(lines):
+        if line.startswith('diff --git'):
+            patch = '\n'.join(lines[i:])
+            break
+
+    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
+    return patch
+
+
 def get_config(instance: pd.Series) -> AppConfig:
    # We use a different instance image for the each instance of swe-bench eval
    base_container_image = get_instance_docker_image(instance['instance_id'])
@@ -60,13 +89,6 @@ def get_config(instance: pd.Series) -> AppConfig:
    return config


-class SWEBenchEvalResult(BaseModel):
-    instance_id: str
-    apply_patch_output: str
-    test_output: str
-    resolved: bool
-
-
 def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata | None = None,
@@ -94,6 +116,7 @@ def process_instance(
        'resolved': False,
        'failed_apply_patch': False,
        'error_eval': False,
+        'test_timeout': False,
    }

    if model_patch == '':
@@ -170,13 +193,14 @@ def process_instance(

                # Poll for completion
                start_time = time.time()
-                timeout = 900  # 15 minutes
+                timeout = 1800  # 30 minutes
                while True:
                    seconds_elapsed = time.time() - start_time
                    if seconds_elapsed > timeout:
                        logger.info(
                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
                        )
+                        instance['test_result']['report']['test_timeout'] = True
                        break
                    check_action = CmdRunAction(
                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
@@ -315,6 +339,9 @@ if __name__ == '__main__':
        set(predictions.columns)
    ), 'Input file must contain instance_id and model_patch columns.'

+    # Process model_patch
+    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
+
    # Merge predictions with dataset
    predictions['instance'] = predictions['instance_id'].apply(
        lambda x: instance_id_to_instance[x]
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -30,6 +30,7 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction
 from openhands.events.observation import CmdOutputObservation, ErrorObservation
+from openhands.events.serialization.event import event_to_dict
 from openhands.runtime.runtime import Runtime
 from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue

@@ -383,10 +384,7 @@ def process_instance(
    if state is None:
        raise ValueError('State should not be None.')

-    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
-    # for compatibility with the existing output format, we can remake the pairs here
-    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = [event_to_dict(event) for event in state.history.get_events()]
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
@@ -398,6 +396,7 @@ def process_instance(
        metadata=metadata,
        history=histories,
        metrics=metrics,
+        llm_completions=state.extra_data.get('llm_completions', []),
        error=state.last_error if state and state.last_error else None,
    )
    return output
--- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
+++ b/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
@@ -3,6 +3,8 @@ import os

 import pandas as pd

+from evaluation.swe_bench.eval_infer import process_git_patch
+
 parser = argparse.ArgumentParser()
 parser.add_argument('oh_output_file', type=str)
 args = parser.parse_args()
@@ -14,36 +16,6 @@ oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
 model_name = os.path.basename(os.path.dirname(args.oh_output_file))


-def process_git_patch(patch):
-    if not isinstance(patch, str):
-        return ''
-
-    if not patch.strip():
-        # skip empty patches
-        return ''
-
-    patch = patch.replace('\r\n', '\n')
-    # There might be some weird characters at the beginning of the patch
-    # due to some OpenHands inference command outputs
-
-    # FOR EXAMPLE:
-    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
-    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
-    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
-    # new file mode 100644
-    # index 0000000000..fc13db5948
-
-    # We "find" the first line that starts with "diff" and then we remove lines before it
-    lines = patch.split('\n')
-    for i, line in enumerate(lines):
-        if line.startswith('diff --git'):
-            patch = '\n'.join(lines[i:])
-            break
-
-    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
-    return patch
-
-
 def convert_row_to_swebench_format(row):
    if 'git_patch' in row:
        model_patch = row['git_patch']
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -6,7 +6,6 @@ import pathlib
 import subprocess
 import time
 import traceback
-from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Any, Awaitable, Callable, TextIO

 import pandas as pd
@@ -57,7 +56,11 @@ class EvalOutput(BaseModel):

    # Interaction info
    metadata: EvalMetadata | None = None
-    history: list[tuple[dict[str, Any], dict[str, Any]]] | None = None
+    # list[tuple[dict[str, Any], dict[str, Any]]] - for compatibility with the old format
+    history: (
+        list[dict[str, Any]] | list[tuple[dict[str, Any], dict[str, Any]]] | None
+    ) = None
+    llm_completions: list[dict[str, Any]]
    metrics: dict[str, Any] | None = None
    error: str | None = None

@@ -277,6 +280,7 @@ def _process_instance_wrapper(
                    + '-' * 10
                )
                # Raise an error after all retries & stop the evaluation
+                logger.exception(e)
                raise RuntimeError(
                    f'Maximum error retries reached for instance {instance.instance_id}'
                ) from e
@@ -296,6 +300,11 @@ def _process_instance_wrapper(
            time.sleep(5)


+def _process_instance_wrapper_mp(args):
+    """Wrapper for multiprocessing, especially for imap_unordered."""
+    return _process_instance_wrapper(*args)
+
+
 def run_evaluation(
    dataset: pd.DataFrame,
    metadata: EvalMetadata | None,
@@ -322,20 +331,13 @@ def run_evaluation(

    try:
        if use_multiprocessing:
-            with ProcessPoolExecutor(num_workers) as executor:
-                futures = [
-                    executor.submit(
-                        _process_instance_wrapper,
-                        process_instance_func=process_instance_func,
-                        instance=instance,
-                        metadata=metadata,
-                        use_mp=True,
-                        max_retries=max_retries,
-                    )
+            with mp.Pool(num_workers) as pool:
+                args_iter = (
+                    (process_instance_func, instance, metadata, True, max_retries)
                    for _, instance in dataset.iterrows()
-                ]
-                for future in as_completed(futures):
-                    result = future.result()
+                )
+                results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
+                for result in results:
                    update_progress(result, pbar, output_fp)
        else:
            for _, instance in dataset.iterrows():
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.3",
+  "version": "0.9.4",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "openhands-frontend",
-      "version": "0.9.3",
+      "version": "0.9.4",
      "dependencies": {
        "@monaco-editor/react": "^4.6.0",
        "@nextui-org/react": "^2.4.8",
@@ -20,7 +20,7 @@
        "i18next": "^23.15.1",
        "i18next-browser-languagedetector": "^8.0.0",
        "i18next-http-backend": "^2.6.1",
-        "jose": "^5.9.2",
+        "jose": "^5.9.3",
        "monaco-editor": "^0.52.0",
        "react": "^18.3.1",
        "react-dom": "^18.3.1",
@@ -33,7 +33,7 @@
        "react-syntax-highlighter": "^15.5.0",
        "remark-gfm": "^4.0.0",
        "tailwind-merge": "^2.5.2",
-        "vite": "^5.4.6",
+        "vite": "^5.4.7",
        "web-vitals": "^3.5.2"
      },
      "devDependencies": {
@@ -41,8 +41,8 @@
        "@testing-library/jest-dom": "^6.5.0",
        "@testing-library/react": "^16.0.1",
        "@testing-library/user-event": "^14.5.2",
-        "@types/node": "^22.5.5",
-        "@types/react": "^18.3.7",
+        "@types/node": "^22.6.1",
+        "@types/react": "^18.3.8",
        "@types/react-dom": "^18.3.0",
        "@types/react-highlight": "^0.12.8",
        "@types/react-syntax-highlighter": "^15.5.13",
@@ -4860,9 +4860,9 @@
      "integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g=="
    },
    "node_modules/@types/node": {
-      "version": "22.5.5",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.5.tgz",
-      "integrity": "sha512-Xjs4y5UPO/CLdzpgR6GirZJx36yScjh73+2NlLlkFRSoQN8B0DpfXPdZGnvVmLRLOsqDpOfTNv7D9trgGhmOIA==",
+      "version": "22.6.1",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.6.1.tgz",
+      "integrity": "sha512-V48tCfcKb/e6cVUigLAaJDAILdMP0fUW6BidkPK4GpGjXcfbnoHasCZDwz3N3yVt5we2RHm4XTQCpv0KJz9zqw==",
      "devOptional": true,
      "dependencies": {
        "undici-types": "~6.19.2"
@@ -4874,9 +4874,9 @@
      "integrity": "sha512-5zvhXYtRNRluoE/jAp4GVsSduVUzNWKkOZrCDBWYtE7biZywwdC2AcEzg+cSMLFRfVgeAFqpfNabiPjxFddV1Q=="
    },
    "node_modules/@types/react": {
-      "version": "18.3.7",
-      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.7.tgz",
-      "integrity": "sha512-KUnDCJF5+AiZd8owLIeVHqmW9yM4sqmDVf2JRJiBMFkGvkoZ4/WyV2lL4zVsoinmRS/W3FeEdZLEWFRofnT2FQ==",
+      "version": "18.3.8",
+      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.8.tgz",
+      "integrity": "sha512-syBUrW3/XpnW4WJ41Pft+I+aPoDVbrBVQGEnbD7NijDGlVC+8gV/XKRY+7vMDlfPpbwYt0l1vd/Sj8bJGMbs9Q==",
      "dependencies": {
        "@types/prop-types": "*",
        "csstype": "^3.0.2"
@@ -8847,9 +8847,9 @@
      }
    },
    "node_modules/jose": {
-      "version": "5.9.2",
-      "resolved": "https://registry.npmjs.org/jose/-/jose-5.9.2.tgz",
-      "integrity": "sha512-ILI2xx/I57b20sd7rHZvgiiQrmp2mcotwsAH+5ajbpFQbrYVQdNHYlQhoA5cFb78CgtBOxtC05TeA+mcgkuCqQ==",
+      "version": "5.9.3",
+      "resolved": "https://registry.npmjs.org/jose/-/jose-5.9.3.tgz",
+      "integrity": "sha512-egLIoYSpcd+QUF+UHgobt5YzI2Pkw/H39ou9suW687MY6PmCwPmkNV/4TNjn1p2tX5xO3j0d0sq5hiYE24bSlg==",
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
@@ -13112,9 +13112,9 @@
      }
    },
    "node_modules/vite": {
-      "version": "5.4.6",
-      "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.6.tgz",
-      "integrity": "sha512-IeL5f8OO5nylsgzd9tq4qD2QqI0k2CQLGrWD0rCN0EQJZpBK5vJAx0I+GDkMOXxQX/OfFHMuLIx6ddAxGX/k+Q==",
+      "version": "5.4.7",
+      "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.7.tgz",
+      "integrity": "sha512-5l2zxqMEPVENgvzTuBpHer2awaetimj2BGkhBPdnwKbPNOlHsODU+oiazEZzLK7KhAnOrO+XGYJYn4ZlUhDtDQ==",
      "dependencies": {
        "esbuild": "^0.21.3",
        "postcss": "^8.4.43",
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.3",
+  "version": "0.9.4",
  "private": true,
  "type": "module",
  "engines": {
@@ -19,7 +19,7 @@
    "i18next": "^23.15.1",
    "i18next-browser-languagedetector": "^8.0.0",
    "i18next-http-backend": "^2.6.1",
-    "jose": "^5.9.2",
+    "jose": "^5.9.3",
    "monaco-editor": "^0.52.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
@@ -32,7 +32,7 @@
    "react-syntax-highlighter": "^15.5.0",
    "remark-gfm": "^4.0.0",
    "tailwind-merge": "^2.5.2",
-    "vite": "^5.4.6",
+    "vite": "^5.4.7",
    "web-vitals": "^3.5.2"
  },
  "scripts": {
@@ -64,8 +64,8 @@
    "@testing-library/jest-dom": "^6.5.0",
    "@testing-library/react": "^16.0.1",
    "@testing-library/user-event": "^14.5.2",
-    "@types/node": "^22.5.5",
-    "@types/react": "^18.3.7",
+    "@types/node": "^22.6.1",
+    "@types/react": "^18.3.8",
    "@types/react-dom": "^18.3.0",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
--- a/frontend/src/components/AgentStatusBar.tsx
+++ b/frontend/src/components/AgentStatusBar.tsx
@@ -18,6 +18,7 @@ enum IndicatorColor {
 function AgentStatusBar() {
  const { t } = useTranslation();
  const { curAgentState } = useSelector((state: RootState) => state.agent);
+  const { curStatusMessage } = useSelector((state: RootState) => state.status);

  const AgentStatusMap: {
    [k: string]: { message: string; indicator: IndicatorColor };
@@ -90,14 +91,25 @@ function AgentStatusBar() {
    }
  }, [curAgentState]);

+  const [statusMessage, setStatusMessage] = React.useState<string>("");
+
+  React.useEffect(() => {
+    const trimmedCustomMessage = curStatusMessage.message.trim();
+    if (trimmedCustomMessage) {
+      setStatusMessage(t(trimmedCustomMessage));
+    } else {
+      setStatusMessage(AgentStatusMap[curAgentState].message);
+    }
+  }, [curAgentState, curStatusMessage.message]);
+
  return (
-    <div className="flex items-center">
-      <div
-        className={`w-3 h-3 mr-2 rounded-full animate-pulse ${AgentStatusMap[curAgentState].indicator}`}
-      />
-      <span className="text-sm text-stone-400">
-        {AgentStatusMap[curAgentState].message}
-      </span>
+    <div className="flex flex-col items-center">
+      <div className="flex items-center">
+        <div
+          className={`w-3 h-3 mr-2 rounded-full animate-pulse ${AgentStatusMap[curAgentState].indicator}`}
+        />
+        <span className="text-sm text-stone-400">{statusMessage}</span>
+      </div>
    </div>
  );
 }
--- a/frontend/src/components/modals/settings/ModelSelector.tsx
+++ b/frontend/src/components/modals/settings/ModelSelector.tsx
@@ -112,7 +112,7 @@ export function ModelSelector({
            {models[selectedProvider || ""]?.models
              .filter((model) => VERIFIED_MODELS.includes(model))
              .map((model) => (
-                <AutocompleteItem key={model} value={model}>
+                <AutocompleteItem key={model} value={model} title={model}>
                  {model}
                </AutocompleteItem>
              ))}
@@ -121,7 +121,7 @@ export function ModelSelector({
            {models[selectedProvider || ""]?.models
              .filter((model) => !VERIFIED_MODELS.includes(model))
              .map((model) => (
-                <AutocompleteItem key={model} value={model}>
+                <AutocompleteItem key={model} value={model} title={model}>
                  {model}
                </AutocompleteItem>
              ))}
--- a/frontend/src/components/modals/settings/SettingsForm.tsx
+++ b/frontend/src/components/modals/settings/SettingsForm.tsx
@@ -52,13 +52,17 @@ function SettingsForm({
  const [enableAdvanced, setEnableAdvanced] =
    React.useState(advancedAlreadyInUse);

+  const handleAdvancedChange = (value: boolean) => {
+    setEnableAdvanced(value);
+  };
+
  return (
    <>
      <Switch
        data-testid="advanced-options-toggle"
        aria-checked={enableAdvanced}
        isSelected={enableAdvanced}
-        onValueChange={(value) => setEnableAdvanced(value)}
+        onValueChange={handleAdvancedChange}
      >
        Advanced Options
      </Switch>
--- a/frontend/src/i18n/translation.json
+++ b/frontend/src/i18n/translation.json
--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -6,10 +6,11 @@ import {
  ActionSecurityRisk,
  appendSecurityAnalyzerInput,
 } from "#/state/securityAnalyzerSlice";
+import { setCurStatusMessage } from "#/state/statusSlice";
 import { setRootTask } from "#/state/taskSlice";
 import store from "#/store";
 import ActionType from "#/types/ActionType";
-import { ActionMessage } from "#/types/Message";
+import { ActionMessage, StatusMessage } from "#/types/Message";
 import { SocketMessage } from "#/types/ResponseType";
 import { handleObservationMessage } from "./observations";
 import { getRootTask } from "./taskService";
@@ -138,6 +139,16 @@ export function handleActionMessage(message: ActionMessage) {
  }
 }

+export function handleStatusMessage(message: StatusMessage) {
+  const msg = message.message == null ? "" : message.message.trim();
+  store.dispatch(
+    setCurStatusMessage({
+      ...message,
+      message: msg,
+    }),
+  );
+}
+
 export function handleAssistantMessage(data: string | SocketMessage) {
  let socketMessage: SocketMessage;

@@ -149,7 +160,9 @@ export function handleAssistantMessage(data: string | SocketMessage) {

  if ("action" in socketMessage) {
    handleActionMessage(socketMessage);
-  } else {
+  } else if ("observation" in socketMessage) {
    handleObservationMessage(socketMessage);
+  } else if ("message" in socketMessage) {
+    handleStatusMessage(socketMessage);
  }
 }
--- a/frontend/src/services/session.ts
+++ b/frontend/src/services/session.ts
@@ -8,11 +8,19 @@ import { I18nKey } from "#/i18n/declaration";

 const translate = (key: I18nKey) => i18next.t(key);

+// Define a type for the messages
+type Message = {
+  action: ActionType;
+  args: Record<string, unknown>;
+};
+
 class Session {
  private static _socket: WebSocket | null = null;

  private static _latest_event_id: number = -1;

+  private static _messageQueue: Message[] = [];
+
  public static _history: Record<string, unknown>[] = [];

  // callbacks contain a list of callable functions
@@ -83,6 +91,7 @@ class Session {
      toast.success("ws", translate(I18nKey.SESSION$SERVER_CONNECTED_MESSAGE));
      Session._connecting = false;
      Session._initializeAgent();
+      Session._flushQueue();
      Session.callbacks.open?.forEach((callback) => {
        callback(e);
      });
@@ -94,7 +103,6 @@ class Session {
        data = JSON.parse(e.data);
        Session._history.push(data);
      } catch (err) {
-        // TODO: report the error
        toast.error(
          "ws",
          translate(I18nKey.SESSION$SESSION_HANDLING_ERROR_MESSAGE),
@@ -115,6 +123,7 @@ class Session {
    };

    Session._socket.onerror = () => {
+      // TODO report error
      toast.error(
        "ws",
        translate(I18nKey.SESSION$SESSION_CONNECTION_ERROR_MESSAGE),
@@ -145,9 +154,20 @@ class Session {
    Session._socket = null;
  }

+  private static _flushQueue(): void {
+    while (Session._messageQueue.length > 0) {
+      const message = Session._messageQueue.shift();
+      if (message) {
+        setTimeout(() => Session.send(JSON.stringify(message)), 1000);
+      }
+    }
+  }
+
  static send(message: string): void {
+    const messageObject: Message = JSON.parse(message);
+
    if (Session._connecting) {
-      setTimeout(() => Session.send(message), 1000);
+      Session._messageQueue.push(messageObject);
      return;
    }
    if (!Session.isConnected()) {
--- a/frontend/src/services/settings.ts
+++ b/frontend/src/services/settings.ts
@@ -87,10 +87,10 @@ export const getSettings = (): Settings => {
 export const saveSettings = (settings: Partial<Settings>) => {
  Object.keys(settings).forEach((key) => {
    const isValid = validKeys.includes(key as keyof Settings);
-    const value = settings[key as keyof Settings];
-
-    if (isValid && typeof value !== "undefined")
-      localStorage.setItem(key, value.toString());
+    if (!isValid) return;
+    let value = settings[key as keyof Settings];
+    if (value === undefined || value === null) value = "";
+    localStorage.setItem(key, value.toString());
  });
  localStorage.setItem("SETTINGS_VERSION", LATEST_SETTINGS_VERSION.toString());
 };
--- a/frontend/src/state/statusSlice.ts
+++ b/frontend/src/state/statusSlice.ts
@@ -0,0 +1,23 @@
+import { createSlice, PayloadAction } from "@reduxjs/toolkit";
+import { StatusMessage } from "#/types/Message";
+
+const initialStatusMessage: StatusMessage = {
+  message: "",
+  is_error: false,
+};
+
+export const statusSlice = createSlice({
+  name: "status",
+  initialState: {
+    curStatusMessage: initialStatusMessage,
+  },
+  reducers: {
+    setCurStatusMessage: (state, action: PayloadAction<StatusMessage>) => {
+      state.curStatusMessage = action.payload;
+    },
+  },
+});
+
+export const { setCurStatusMessage } = statusSlice.actions;
+
+export default statusSlice.reducer;
--- a/frontend/src/store.ts
+++ b/frontend/src/store.ts
@@ -8,6 +8,7 @@ import errorsReducer from "./state/errorsSlice";
 import taskReducer from "./state/taskSlice";
 import jupyterReducer from "./state/jupyterSlice";
 import securityAnalyzerReducer from "./state/securityAnalyzerSlice";
+import statusReducer from "./state/statusSlice";

 export const rootReducer = combineReducers({
  browser: browserReducer,
@@ -19,6 +20,7 @@ export const rootReducer = combineReducers({
  agent: agentReducer,
  jupyter: jupyterReducer,
  securityAnalyzer: securityAnalyzerReducer,
+  status: statusReducer,
 });

 const store = configureStore({
--- a/frontend/src/types/Message.tsx
+++ b/frontend/src/types/Message.tsx
@@ -31,3 +31,12 @@ export interface ObservationMessage {
  // The timestamp of the message
  timestamp: string;
 }
+
+export interface StatusMessage {
+  // TODO not implemented yet
+  // Whether the status is an error, default is false
+  is_error: boolean;
+
+  // A status message to display to the user
+  message: string;
+}
--- a/frontend/src/types/ResponseType.tsx
+++ b/frontend/src/types/ResponseType.tsx
@@ -1,5 +1,5 @@
-import { ActionMessage, ObservationMessage } from "./Message";
+import { ActionMessage, ObservationMessage, StatusMessage } from "./Message";

-type SocketMessage = ActionMessage | ObservationMessage;
+type SocketMessage = ActionMessage | ObservationMessage | StatusMessage;

 export { type SocketMessage };
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -132,6 +132,10 @@ class AgentController:
    async def update_state_after_step(self):
        # update metrics especially for cost
        self.state.local_metrics = self.agent.llm.metrics
+        if 'llm_completions' not in self.state.extra_data:
+            self.state.extra_data['llm_completions'] = []
+        self.state.extra_data['llm_completions'].extend(self.agent.llm.llm_completions)
+        self.agent.llm.llm_completions.clear()

    async def report_error(self, message: str, exception: Exception | None = None):
        """Reports an error to the user and sends the exception to the LLM next step, in the hope it can self-correct.
--- a/openhands/core/config.py
+++ b/openhands/core/config.py
@@ -1,781 +0,0 @@
-import argparse
-import os
-import pathlib
-import platform
-import uuid
-from dataclasses import dataclass, field, fields, is_dataclass
-from enum import Enum
-from types import UnionType
-from typing import Any, ClassVar, MutableMapping, get_args, get_origin
-
-import toml
-from dotenv import load_dotenv
-
-from openhands.core import logger
-
-load_dotenv()
-
-
-LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key']
-_DEFAULT_AGENT = 'CodeActAgent'
-_MAX_ITERATIONS = 100
-
-
-@dataclass
-class LLMConfig:
-    """Configuration for the LLM model.
-
-    Attributes:
-        model: The model to use.
-        api_key: The API key to use.
-        base_url: The base URL for the API. This is necessary for local LLMs. It is also used for Azure embeddings.
-        api_version: The version of the API.
-        embedding_model: The embedding model to use.
-        embedding_base_url: The base URL for the embedding API.
-        embedding_deployment_name: The name of the deployment for the embedding API. This is used for Azure OpenAI.
-        aws_access_key_id: The AWS access key ID.
-        aws_secret_access_key: The AWS secret access key.
-        aws_region_name: The AWS region name.
-        num_retries: The number of retries to attempt.
-        retry_multiplier: The multiplier for the exponential backoff.
-        retry_min_wait: The minimum time to wait between retries, in seconds. This is exponential backoff minimum. For models with very low limits, this can be set to 15-20.
-        retry_max_wait: The maximum time to wait between retries, in seconds. This is exponential backoff maximum.
-        timeout: The timeout for the API.
-        max_message_chars: The approximate max number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated.
-        temperature: The temperature for the API.
-        top_p: The top p for the API.
-        custom_llm_provider: The custom LLM provider to use. This is undocumented in openhands, and normally not used. It is documented on the litellm side.
-        max_input_tokens: The maximum number of input tokens. Note that this is currently unused, and the value at runtime is actually the total tokens in OpenAI (e.g. 128,000 tokens for GPT-4).
-        max_output_tokens: The maximum number of output tokens. This is sent to the LLM.
-        input_cost_per_token: The cost per input token. This will available in logs for the user to check.
-        output_cost_per_token: The cost per output token. This will available in logs for the user to check.
-        ollama_base_url: The base URL for the OLLAMA API.
-        drop_params: Drop any unmapped (unsupported) params without causing an exception.
-        disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
-        caching_prompt: Using the prompt caching feature provided by the LLM.
-    """
-
-    model: str = 'gpt-4o'
-    api_key: str | None = None
-    base_url: str | None = None
-    api_version: str | None = None
-    embedding_model: str = 'local'
-    embedding_base_url: str | None = None
-    embedding_deployment_name: str | None = None
-    aws_access_key_id: str | None = None
-    aws_secret_access_key: str | None = None
-    aws_region_name: str | None = None
-    num_retries: int = 8
-    retry_multiplier: float = 2
-    retry_min_wait: int = 15
-    retry_max_wait: int = 120
-    timeout: int | None = None
-    max_message_chars: int = 10_000  # maximum number of characters in an observation's content when sent to the llm
-    temperature: float = 0
-    top_p: float = 0.5
-    custom_llm_provider: str | None = None
-    max_input_tokens: int | None = None
-    max_output_tokens: int | None = None
-    input_cost_per_token: float | None = None
-    output_cost_per_token: float | None = None
-    ollama_base_url: str | None = None
-    drop_params: bool | None = None
-    disable_vision: bool | None = None
-    caching_prompt: bool = False
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            result[f.name] = get_field_info(f)
-        return result
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            if attr_name in LLM_SENSITIVE_FIELDS:
-                attr_value = '******' if attr_value else None
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"LLMConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
-
-    def to_safe_dict(self):
-        """Return a dict with the sensitive fields replaced with ******."""
-        ret = self.__dict__.copy()
-        for k, v in ret.items():
-            if k in LLM_SENSITIVE_FIELDS:
-                ret[k] = '******' if v else None
-        return ret
-
-    def set_missing_attributes(self):
-        """Set any missing attributes to their default values."""
-        for field_name, field_obj in self.__dataclass_fields__.items():
-            if not hasattr(self, field_name):
-                setattr(self, field_name, field_obj.default)
-
-
-@dataclass
-class AgentConfig:
-    """Configuration for the agent.
-
-    Attributes:
-        micro_agent_name: The name of the micro agent to use for this agent.
-        memory_enabled: Whether long-term memory (embeddings) is enabled.
-        memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
-        llm_config: The name of the llm config to use. If specified, this will override global llm config.
-    """
-
-    micro_agent_name: str | None = None
-    memory_enabled: bool = False
-    memory_max_threads: int = 2
-    llm_config: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            result[f.name] = get_field_info(f)
-        return result
-
-
-@dataclass
-class SecurityConfig:
-    """Configuration for security related functionalities.
-
-    Attributes:
-        confirmation_mode: Whether to enable confirmation mode.
-        security_analyzer: The security analyzer to use.
-    """
-
-    confirmation_mode: bool = False
-    security_analyzer: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        dict = {}
-        for f in fields(self):
-            dict[f.name] = get_field_info(f)
-        return dict
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"SecurityConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
-
-
-@dataclass
-class SandboxConfig:
-    """Configuration for the sandbox.
-
-    Attributes:
-        api_hostname: The hostname for the EventStream Runtime API.
-        base_container_image: The base container image from which to build the runtime image.
-        runtime_container_image: The runtime container image to use.
-        user_id: The user ID for the sandbox.
-        timeout: The timeout for the sandbox.
-        enable_auto_lint: Whether to enable auto-lint.
-        use_host_network: Whether to use the host network.
-        initialize_plugins: Whether to initialize plugins.
-        runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
-            This will be rendered into the end of the Dockerfile that builds the runtime image.
-            It can contain any valid shell commands (e.g., pip install numpy).
-            The path to the interpreter is available as $OH_INTERPRETER_PATH,
-            which can be used to install dependencies for the OH-specific Python interpreter.
-        runtime_startup_env_vars: The environment variables to set at the launch of the runtime.
-            This is a dictionary of key-value pairs.
-            This is useful for setting environment variables that are needed by the runtime.
-            For example, for specifying the base url of website for browsergym evaluation.
-        browsergym_eval_env: The BrowserGym environment to use for evaluation.
-            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
-    """
-
-    api_hostname: str = 'localhost'
-    api_key: str | None = None
-    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
-    runtime_container_image: str | None = None
-    user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
-    timeout: int = 120
-    enable_auto_lint: bool = (
-        False  # once enabled, OpenHands would lint files after editing
-    )
-    use_host_network: bool = False
-    initialize_plugins: bool = True
-    runtime_extra_deps: str | None = None
-    runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
-    browsergym_eval_env: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        dict = {}
-        for f in fields(self):
-            dict[f.name] = get_field_info(f)
-        return dict
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"SandboxConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
-
-
-class UndefinedString(str, Enum):
-    UNDEFINED = 'UNDEFINED'
-
-
-@dataclass
-class AppConfig:
-    """Configuration for the app.
-
-    Attributes:
-        llms: A dictionary of name -> LLM configuration. Default config is under 'llm' key.
-        agents: A dictionary of name -> Agent configuration. Default config is under 'agent' key.
-        default_agent: The name of the default agent to use.
-        sandbox: The sandbox configuration.
-        runtime: The runtime environment.
-        file_store: The file store to use.
-        file_store_path: The path to the file store.
-        workspace_base: The base path for the workspace. Defaults to ./workspace as an absolute path.
-        workspace_mount_path: The path to mount the workspace. This is set to the workspace base by default.
-        workspace_mount_path_in_sandbox: The path to mount the workspace in the sandbox. Defaults to /workspace.
-        workspace_mount_rewrite: The path to rewrite the workspace mount path to.
-        cache_dir: The path to the cache directory. Defaults to /tmp/cache.
-        run_as_openhands: Whether to run as openhands.
-        max_iterations: The maximum number of iterations.
-        max_budget_per_task: The maximum budget allowed per task, beyond which the agent will stop.
-        e2b_api_key: The E2B API key.
-        disable_color: Whether to disable color. For terminals that don't support color.
-        debug: Whether to enable debugging.
-        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
-        file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
-        file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
-        file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
-    """
-
-    llms: dict[str, LLMConfig] = field(default_factory=dict)
-    agents: dict = field(default_factory=dict)
-    default_agent: str = _DEFAULT_AGENT
-    sandbox: SandboxConfig = field(default_factory=SandboxConfig)
-    security: SecurityConfig = field(default_factory=SecurityConfig)
-    runtime: str = 'eventstream'
-    file_store: str = 'memory'
-    file_store_path: str = '/tmp/file_store'
-    # TODO: clean up workspace path after the removal of ServerRuntime
-    workspace_base: str = os.path.join(os.getcwd(), 'workspace')
-    workspace_mount_path: str | None = (
-        UndefinedString.UNDEFINED  # this path should always be set when config is fully loaded
-    )  # when set to None, do not mount the workspace
-    workspace_mount_path_in_sandbox: str = '/workspace'
-    workspace_mount_rewrite: str | None = None
-    cache_dir: str = '/tmp/cache'
-    run_as_openhands: bool = True
-    max_iterations: int = _MAX_ITERATIONS
-    max_budget_per_task: float | None = None
-    e2b_api_key: str = ''
-    disable_color: bool = False
-    jwt_secret: str = uuid.uuid4().hex
-    debug: bool = False
-    enable_cli_session: bool = False
-    file_uploads_max_file_size_mb: int = 0
-    file_uploads_restrict_file_types: bool = False
-    file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
-
-    defaults_dict: ClassVar[dict] = {}
-
-    def get_llm_config(self, name='llm') -> LLMConfig:
-        """Llm is the name for default config (for backward compatibility prior to 0.8)"""
-        if name in self.llms:
-            return self.llms[name]
-        if name is not None and name != 'llm':
-            logger.openhands_logger.warning(
-                f'llm config group {name} not found, using default config'
-            )
-        if 'llm' not in self.llms:
-            self.llms['llm'] = LLMConfig()
-        return self.llms['llm']
-
-    def set_llm_config(self, value: LLMConfig, name='llm'):
-        self.llms[name] = value
-
-    def get_agent_config(self, name='agent') -> AgentConfig:
-        """Agent is the name for default config (for backward compability prior to 0.8)"""
-        if name in self.agents:
-            return self.agents[name]
-        if 'agent' not in self.agents:
-            self.agents['agent'] = AgentConfig()
-        return self.agents['agent']
-
-    def set_agent_config(self, value: AgentConfig, name='agent'):
-        self.agents[name] = value
-
-    def get_agent_to_llm_config_map(self) -> dict[str, LLMConfig]:
-        """Get a map of agent names to llm configs."""
-        return {name: self.get_llm_config_from_agent(name) for name in self.agents}
-
-    def get_llm_config_from_agent(self, name='agent') -> LLMConfig:
-        agent_config: AgentConfig = self.get_agent_config(name)
-        llm_config_name = agent_config.llm_config
-        return self.get_llm_config(llm_config_name)
-
-    def get_agent_configs(self) -> dict[str, AgentConfig]:
-        return self.agents
-
-    def __post_init__(self):
-        """Post-initialization hook, called when the instance is created with only default values."""
-        AppConfig.defaults_dict = self.defaults_to_dict()
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            field_value = getattr(self, f.name)
-
-            # dataclasses compute their defaults themselves
-            if is_dataclass(type(field_value)):
-                result[f.name] = field_value.defaults_to_dict()
-            else:
-                result[f.name] = get_field_info(f)
-        return result
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            if attr_name in [
-                'e2b_api_key',
-                'github_token',
-                'jwt_secret',
-            ]:
-                attr_value = '******' if attr_value else None
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"AppConfig({', '.join(attr_str)}"
-
-    def __repr__(self):
-        return self.__str__()
-
-
-def get_field_info(f):
-    """Extract information about a dataclass field: type, optional, and default.
-
-    Args:
-        f: The field to extract information from.
-
-    Returns: A dict with the field's type, whether it's optional, and its default value.
-    """
-    field_type = f.type
-    optional = False
-
-    # for types like str | None, find the non-None type and set optional to True
-    # this is useful for the frontend to know if a field is optional
-    # and to show the correct type in the UI
-    # Note: this only works for UnionTypes with None as one of the types
-    if get_origin(field_type) is UnionType:
-        types = get_args(field_type)
-        non_none_arg = next((t for t in types if t is not type(None)), None)
-        if non_none_arg is not None:
-            field_type = non_none_arg
-            optional = True
-
-    # type name in a pretty format
-    type_name = (
-        field_type.__name__ if hasattr(field_type, '__name__') else str(field_type)
-    )
-
-    # default is always present
-    default = f.default
-
-    # return a schema with the useful info for frontend
-    return {'type': type_name.lower(), 'optional': optional, 'default': default}
-
-
-def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, str]):
-    """Reads the env-style vars and sets config attributes based on env vars or a config.toml dict.
-    Compatibility with vars like LLM_BASE_URL, AGENT_MEMORY_ENABLED, SANDBOX_TIMEOUT and others.
-
-    Args:
-        cfg: The AppConfig object to set attributes on.
-        env_or_toml_dict: The environment variables or a config.toml dict.
-    """
-
-    def get_optional_type(union_type: UnionType) -> Any:
-        """Returns the non-None type from a Union."""
-        types = get_args(union_type)
-        return next((t for t in types if t is not type(None)), None)
-
-    # helper function to set attributes based on env vars
-    def set_attr_from_env(sub_config: Any, prefix=''):
-        """Set attributes of a config dataclass based on environment variables."""
-        for field_name, field_type in sub_config.__annotations__.items():
-            # compute the expected env var name from the prefix and field name
-            # e.g. LLM_BASE_URL
-            env_var_name = (prefix + field_name).upper()
-
-            if is_dataclass(field_type):
-                # nested dataclass
-                nested_sub_config = getattr(sub_config, field_name)
-                set_attr_from_env(nested_sub_config, prefix=field_name + '_')
-            elif env_var_name in env_or_toml_dict:
-                # convert the env var to the correct type and set it
-                value = env_or_toml_dict[env_var_name]
-
-                # skip empty config values (fall back to default)
-                if not value:
-                    continue
-
-                try:
-                    # if it's an optional type, get the non-None type
-                    if get_origin(field_type) is UnionType:
-                        field_type = get_optional_type(field_type)
-
-                    # Attempt to cast the env var to type hinted in the dataclass
-                    if field_type is bool:
-                        cast_value = str(value).lower() in ['true', '1']
-                    else:
-                        cast_value = field_type(value)
-                    setattr(sub_config, field_name, cast_value)
-                except (ValueError, TypeError):
-                    logger.openhands_logger.error(
-                        f'Error setting env var {env_var_name}={value}: check that the value is of the right type'
-                    )
-
-    # Start processing from the root of the config object
-    set_attr_from_env(cfg)
-
-    # load default LLM config from env
-    default_llm_config = cfg.get_llm_config()
-    set_attr_from_env(default_llm_config, 'LLM_')
-    # load default agent config from env
-    default_agent_config = cfg.get_agent_config()
-    set_attr_from_env(default_agent_config, 'AGENT_')
-
-
-def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
-    """Load the config from the toml file. Supports both styles of config vars.
-
-    Args:
-        cfg: The AppConfig object to update attributes of.
-        toml_file: The path to the toml file. Defaults to 'config.toml'.
-    """
-    # try to read the config.toml file into the config object
-    try:
-        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
-            toml_config = toml.load(toml_contents)
-    except FileNotFoundError:
-        return
-    except toml.TomlDecodeError as e:
-        logger.openhands_logger.warning(
-            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
-            exc_info=False,
-        )
-        return
-
-    # if there was an exception or core is not in the toml, try to use the old-style toml
-    if 'core' not in toml_config:
-        # re-use the env loader to set the config from env-style vars
-        load_from_env(cfg, toml_config)
-        return
-
-    core_config = toml_config['core']
-
-    # load llm configs and agent configs
-    for key, value in toml_config.items():
-        if isinstance(value, dict):
-            try:
-                if key is not None and key.lower() == 'agent':
-                    logger.openhands_logger.debug(
-                        'Attempt to load default agent config from config toml'
-                    )
-                    non_dict_fields = {
-                        k: v for k, v in value.items() if not isinstance(v, dict)
-                    }
-                    agent_config = AgentConfig(**non_dict_fields)
-                    cfg.set_agent_config(agent_config, 'agent')
-                    for nested_key, nested_value in value.items():
-                        if isinstance(nested_value, dict):
-                            logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as agent config'
-                            )
-                            agent_config = AgentConfig(**nested_value)
-                            cfg.set_agent_config(agent_config, nested_key)
-                elif key is not None and key.lower() == 'llm':
-                    logger.openhands_logger.debug(
-                        'Attempt to load default LLM config from config toml'
-                    )
-                    non_dict_fields = {
-                        k: v for k, v in value.items() if not isinstance(v, dict)
-                    }
-                    llm_config = LLMConfig(**non_dict_fields)
-                    cfg.set_llm_config(llm_config, 'llm')
-                    for nested_key, nested_value in value.items():
-                        if isinstance(nested_value, dict):
-                            logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as llm config'
-                            )
-                            llm_config = LLMConfig(**nested_value)
-                            cfg.set_llm_config(llm_config, nested_key)
-                elif not key.startswith('sandbox') and key.lower() != 'core':
-                    logger.openhands_logger.warning(
-                        f'Unknown key in {toml_file}: "{key}"'
-                    )
-            except (TypeError, KeyError) as e:
-                logger.openhands_logger.warning(
-                    f'Cannot parse config from toml, toml values have not been applied.\n Error: {e}',
-                    exc_info=False,
-                )
-        else:
-            logger.openhands_logger.warning(f'Unknown key in {toml_file}: "{key}')
-
-    try:
-        # set sandbox config from the toml file
-        sandbox_config = cfg.sandbox
-
-        # migrate old sandbox configs from [core] section to sandbox config
-        keys_to_migrate = [key for key in core_config if key.startswith('sandbox_')]
-        for key in keys_to_migrate:
-            new_key = key.replace('sandbox_', '')
-            if new_key in sandbox_config.__annotations__:
-                # read the key in sandbox and remove it from core
-                setattr(sandbox_config, new_key, core_config.pop(key))
-            else:
-                logger.openhands_logger.warning(f'Unknown sandbox config: {key}')
-
-        # the new style values override the old style values
-        if 'sandbox' in toml_config:
-            sandbox_config = SandboxConfig(**toml_config['sandbox'])
-
-        # update the config object with the new values
-        cfg.sandbox = sandbox_config
-        for key, value in core_config.items():
-            if hasattr(cfg, key):
-                setattr(cfg, key, value)
-            else:
-                logger.openhands_logger.warning(f'Unknown core config key: {key}')
-    except (TypeError, KeyError) as e:
-        logger.openhands_logger.warning(
-            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
-            exc_info=False,
-        )
-
-
-def finalize_config(cfg: AppConfig):
-    """More tweaks to the config after it's been loaded."""
-    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
-    # Set workspace_mount_path if not set by the user
-    if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
-        cfg.workspace_mount_path = cfg.workspace_base
-
-    if cfg.workspace_mount_rewrite:  # and not config.workspace_mount_path:
-        # TODO why do we need to check if workspace_mount_path is None?
-        base = cfg.workspace_base or os.getcwd()
-        parts = cfg.workspace_mount_rewrite.split(':')
-        cfg.workspace_mount_path = base.replace(parts[0], parts[1])
-
-    for llm in cfg.llms.values():
-        if llm.embedding_base_url is None:
-            llm.embedding_base_url = llm.base_url
-
-    if cfg.sandbox.use_host_network and platform.system() == 'Darwin':
-        logger.openhands_logger.warning(
-            'Please upgrade to Docker Desktop 4.29.0 or later to use host network mode on macOS. '
-            'See https://github.com/docker/roadmap/issues/238#issuecomment-2044688144 for more information.'
-        )
-
-    # make sure cache dir exists
-    if cfg.cache_dir:
-        pathlib.Path(cfg.cache_dir).mkdir(parents=True, exist_ok=True)
-
-
-# Utility function for command line --group argument
-def get_llm_config_arg(
-    llm_config_arg: str, toml_file: str = 'config.toml'
-) -> LLMConfig | None:
-    """Get a group of llm settings from the config file.
-
-    A group in config.toml can look like this:
-
-    ```
-    [llm.gpt-3.5-for-eval]
-    model = 'gpt-3.5-turbo'
-    api_key = '...'
-    temperature = 0.5
-    num_retries = 8
-    ...
-    ```
-
-    The user-defined group name, like "gpt-3.5-for-eval", is the argument to this function. The function will load the LLMConfig object
-    with the settings of this group, from the config file, and set it as the LLMConfig object for the app.
-
-    Note that the group must be under "llm" group, or in other words, the group name must start with "llm.".
-
-    Args:
-        llm_config_arg: The group of llm settings to get from the config.toml file.
-
-    Returns:
-        LLMConfig: The LLMConfig object with the settings from the config file.
-    """
-    # keep only the name, just in case
-    llm_config_arg = llm_config_arg.strip('[]')
-
-    # truncate the prefix, just in case
-    if llm_config_arg.startswith('llm.'):
-        llm_config_arg = llm_config_arg[4:]
-
-    logger.openhands_logger.info(f'Loading llm config from {llm_config_arg}')
-
-    # load the toml file
-    try:
-        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
-            toml_config = toml.load(toml_contents)
-    except FileNotFoundError as e:
-        logger.openhands_logger.error(f'Config file not found: {e}')
-        return None
-    except toml.TomlDecodeError as e:
-        logger.openhands_logger.error(
-            f'Cannot parse llm group from {llm_config_arg}. Exception: {e}'
-        )
-        return None
-
-    # update the llm config with the specified section
-    if 'llm' in toml_config and llm_config_arg in toml_config['llm']:
-        return LLMConfig(**toml_config['llm'][llm_config_arg])
-    logger.openhands_logger.debug(f'Loading from toml failed for {llm_config_arg}')
-    return None
-
-
-# Command line arguments
-def get_parser() -> argparse.ArgumentParser:
-    """Get the parser for the command line arguments."""
-    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
-    parser.add_argument(
-        '-d',
-        '--directory',
-        type=str,
-        help='The working directory for the agent',
-    )
-    parser.add_argument(
-        '-t',
-        '--task',
-        type=str,
-        default='',
-        help='The task for the agent to perform',
-    )
-    parser.add_argument(
-        '-f',
-        '--file',
-        type=str,
-        help='Path to a file containing the task. Overrides -t if both are provided.',
-    )
-    parser.add_argument(
-        '-c',
-        '--agent-cls',
-        default=_DEFAULT_AGENT,
-        type=str,
-        help='Name of the default agent to use',
-    )
-    parser.add_argument(
-        '-i',
-        '--max-iterations',
-        default=_MAX_ITERATIONS,
-        type=int,
-        help='The maximum number of iterations to run the agent',
-    )
-    parser.add_argument(
-        '-b',
-        '--max-budget-per-task',
-        type=float,
-        help='The maximum budget allowed per task, beyond which the agent will stop.',
-    )
-    # --eval configs are for evaluations only
-    parser.add_argument(
-        '--eval-output-dir',
-        default='evaluation/evaluation_outputs/outputs',
-        type=str,
-        help='The directory to save evaluation output',
-    )
-    parser.add_argument(
-        '--eval-n-limit',
-        default=None,
-        type=int,
-        help='The number of instances to evaluate',
-    )
-    parser.add_argument(
-        '--eval-num-workers',
-        default=4,
-        type=int,
-        help='The number of workers to use for evaluation',
-    )
-    parser.add_argument(
-        '--eval-note',
-        default=None,
-        type=str,
-        help='The note to add to the evaluation directory',
-    )
-    parser.add_argument(
-        '-l',
-        '--llm-config',
-        default=None,
-        type=str,
-        help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
-    )
-    parser.add_argument(
-        '-n',
-        '--name',
-        default='default',
-        type=str,
-        help='Name for the session',
-    )
-    parser.add_argument(
-        '--eval-ids',
-        default=None,
-        type=str,
-        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
-    )
-    return parser
-
-
-def parse_arguments() -> argparse.Namespace:
-    """Parse the command line arguments."""
-    parser = get_parser()
-    parsed_args, _ = parser.parse_known_args()
-    return parsed_args
-
-
-def load_app_config(set_logging_levels: bool = True) -> AppConfig:
-    """Load the configuration from the config.toml file and environment variables.
-
-    Args:
-        set_logger_levels: Whether to set the global variables for logging levels.
-    """
-    config = AppConfig()
-    load_from_toml(config)
-    load_from_env(config, os.environ)
-    finalize_config(config)
-    if set_logging_levels:
-        logger.DEBUG = config.debug
-        logger.DISABLE_COLOR_PRINTING = config.disable_color
-    return config
--- a/openhands/core/config/README.md
+++ b/openhands/core/config/README.md
@@ -0,0 +1,104 @@
+# Configuration Management in OpenHands
+
+## Overview
+
+OpenHands uses a flexible configuration system that allows settings to be defined through environment variables, TOML files, and command-line arguments. The configuration is managed through a package structure in `openhands/core/config/`.
+
+## Configuration Classes
+
+The main configuration classes are:
+
+- `AppConfig`: The root configuration class
+- `LLMConfig`: Configuration for the Language Model
+- `AgentConfig`: Configuration for the agent
+- `SandboxConfig`: Configuration for the sandbox environment
+- `SecurityConfig`: Configuration for security settings
+
+These classes are defined as dataclasses, with class attributes holding default values for all fields.
+
+## Loading Configuration from Environment Variables
+
+The `load_from_env` function in the config package is responsible for loading configuration values from environment variables. It recursively processes the configuration classes, mapping environment variable names to class attributes.
+
+### Naming Convention for Environment Variables
+
+- Prefix: uppercase name of the configuration class followed by an underscore (e.g., `LLM_`, `AGENT_`)
+- Field Names: all uppercase
+- Full Variable Name: Prefix + Field Name (e.g., `LLM_API_KEY`, `AGENT_MEMORY_ENABLED`)
+
+### Examples
+
+```bash
+export LLM_API_KEY='your_api_key_here'
+export LLM_MODEL='gpt-4'
+export AGENT_MEMORY_ENABLED='true'
+export SANDBOX_TIMEOUT='300'
+```
+
+## Type Handling
+
+The `load_from_env` function attempts to cast environment variable values to the types specified in the dataclasses. It handles:
+
+- Basic types (str, int, bool)
+- Optional types (e.g., `str | None`)
+- Nested dataclasses
+
+If type casting fails, an error is logged, and the default value is retained.
+
+## Default Values
+
+If an environment variable is not set, the default value specified in the dataclass is used.
+
+## Nested Configurations
+
+The `AppConfig` class contains nested configurations like `LLMConfig` and `AgentConfig`. The `load_from_env` function handles these by recursively processing nested dataclasses with updated prefixes.
+
+## Security Considerations
+
+Be cautious when setting sensitive information like API keys in environment variables. Ensure your environment is secure.
+
+## Usage
+
+The `load_app_config()` function is the recommended way to initialize your configuration. It performs the following steps:
+
+1. Creates an instance of `AppConfig`
+2. Loads settings from the `config.toml` file (if present)
+3. Loads settings from environment variables, overriding TOML settings if applicable
+4. Applies final tweaks and validations to the configuration, falling back to the default values specified in the code
+5. Optionally sets global logging levels based on the configuration
+
+Command-line arguments are also supported and may override values from the other sources.
+
+Here's an example of how to use `load_app_config()`:
+
+````python
+from openhands.core.config import load_app_config
+
+# Load all configuration settings
+config = load_app_config()
+
+# Now you can access your configuration
+llm_config = config.get_llm_config()
+agent_config = config.get_agent_config()
+sandbox_config = config.sandbox
+
+# Use the configuration in your application
+print(f"Using LLM model: {llm_config.model}")
+print(f"Agent memory enabled: {agent_config.memory_enabled}")
+print(f"Sandbox timeout: {sandbox_config.timeout}")
+````
+
+By using `load_app_config()`, you ensure that all configuration sources are properly loaded and processed, providing a consistent and fully initialized configuration for your application.
+
+## Additional Configuration Methods
+
+While this document focuses on environment variable configuration, OpenHands also supports:
+
+- Loading from TOML files
+- Parsing command-line arguments
+
+These methods are handled by separate functions in the config package.
+
+## Conclusion
+
+The OpenHands configuration system provides a flexible and type-safe way to manage application settings. By following the naming conventions and utilizing the provided functions, developers can easily customize the behavior of OpenHands components through environment variables and other configuration sources.
--- a/openhands/core/config/__init__.py
+++ b/openhands/core/config/__init__.py
@@ -0,0 +1,39 @@
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.app_config import AppConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+    UndefinedString,
+    get_field_info,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+from openhands.core.config.security_config import SecurityConfig
+from openhands.core.config.utils import (
+    finalize_config,
+    get_llm_config_arg,
+    get_parser,
+    load_app_config,
+    load_from_env,
+    load_from_toml,
+    parse_arguments,
+)
+
+__all__ = [
+    'OH_DEFAULT_AGENT',
+    'OH_MAX_ITERATIONS',
+    'UndefinedString',
+    'AgentConfig',
+    'AppConfig',
+    'LLMConfig',
+    'SandboxConfig',
+    'SecurityConfig',
+    'load_app_config',
+    'load_from_env',
+    'load_from_toml',
+    'finalize_config',
+    'get_llm_config_arg',
+    'get_field_info',
+    'get_parser',
+    'parse_arguments',
+]
--- a/openhands/core/config/agent_config.py
+++ b/openhands/core/config/agent_config.py
@@ -0,0 +1,27 @@
+from dataclasses import dataclass, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class AgentConfig:
+    """Configuration for the agent.
+
+    Attributes:
+        micro_agent_name: The name of the micro agent to use for this agent.
+        memory_enabled: Whether long-term memory (embeddings) is enabled.
+        memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
+        llm_config: The name of the llm config to use. If specified, this will override global llm config.
+    """
+
+    micro_agent_name: str | None = None
+    memory_enabled: bool = False
+    memory_max_threads: int = 2
+    llm_config: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
--- a/openhands/core/config/app_config.py
+++ b/openhands/core/config/app_config.py
@@ -0,0 +1,151 @@
+import os
+import uuid
+from dataclasses import dataclass, field, fields, is_dataclass
+from typing import ClassVar
+
+from openhands.core import logger
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+    UndefinedString,
+    get_field_info,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+from openhands.core.config.security_config import SecurityConfig
+
+
+@dataclass
+class AppConfig:
+    """Configuration for the app.
+
+    Attributes:
+        llms: A dictionary of name -> LLM configuration. Default config is under 'llm' key.
+        agents: A dictionary of name -> Agent configuration. Default config is under 'agent' key.
+        default_agent: The name of the default agent to use.
+        sandbox: The sandbox configuration.
+        runtime: The runtime environment.
+        file_store: The file store to use.
+        file_store_path: The path to the file store.
+        workspace_base: The base path for the workspace. Defaults to ./workspace as an absolute path.
+        workspace_mount_path: The path to mount the workspace. This is set to the workspace base by default.
+        workspace_mount_path_in_sandbox: The path to mount the workspace in the sandbox. Defaults to /workspace.
+        workspace_mount_rewrite: The path to rewrite the workspace mount path to.
+        cache_dir: The path to the cache directory. Defaults to /tmp/cache.
+        run_as_openhands: Whether to run as openhands.
+        max_iterations: The maximum number of iterations.
+        max_budget_per_task: The maximum budget allowed per task, beyond which the agent will stop.
+        e2b_api_key: The E2B API key.
+        disable_color: Whether to disable color. For terminals that don't support color.
+        debug: Whether to enable debugging.
+        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
+        file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
+        file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
+        file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
+    """
+
+    llms: dict[str, LLMConfig] = field(default_factory=dict)
+    agents: dict = field(default_factory=dict)
+    default_agent: str = OH_DEFAULT_AGENT
+    sandbox: SandboxConfig = field(default_factory=SandboxConfig)
+    security: SecurityConfig = field(default_factory=SecurityConfig)
+    runtime: str = 'eventstream'
+    file_store: str = 'memory'
+    file_store_path: str = '/tmp/file_store'
+    # TODO: clean up workspace path after the removal of ServerRuntime
+    workspace_base: str = os.path.join(os.getcwd(), 'workspace')
+    workspace_mount_path: str | None = (
+        UndefinedString.UNDEFINED  # this path should always be set when config is fully loaded
+    )  # when set to None, do not mount the workspace
+    workspace_mount_path_in_sandbox: str = '/workspace'
+    workspace_mount_rewrite: str | None = None
+    cache_dir: str = '/tmp/cache'
+    run_as_openhands: bool = True
+    max_iterations: int = OH_MAX_ITERATIONS
+    max_budget_per_task: float | None = None
+    e2b_api_key: str = ''
+    disable_color: bool = False
+    jwt_secret: str = uuid.uuid4().hex
+    debug: bool = False
+    enable_cli_session: bool = False
+    file_uploads_max_file_size_mb: int = 0
+    file_uploads_restrict_file_types: bool = False
+    file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
+
+    defaults_dict: ClassVar[dict] = {}
+
+    def get_llm_config(self, name='llm') -> LLMConfig:
+        """Llm is the name for default config (for backward compatibility prior to 0.8)"""
+        if name in self.llms:
+            return self.llms[name]
+        if name is not None and name != 'llm':
+            logger.openhands_logger.warning(
+                f'llm config group {name} not found, using default config'
+            )
+        if 'llm' not in self.llms:
+            self.llms['llm'] = LLMConfig()
+        return self.llms['llm']
+
+    def set_llm_config(self, value: LLMConfig, name='llm'):
+        self.llms[name] = value
+
+    def get_agent_config(self, name='agent') -> AgentConfig:
+        """Agent is the name for default config (for backward compatibility prior to 0.8)"""
+        if name in self.agents:
+            return self.agents[name]
+        if 'agent' not in self.agents:
+            self.agents['agent'] = AgentConfig()
+        return self.agents['agent']
+
+    def set_agent_config(self, value: AgentConfig, name='agent'):
+        self.agents[name] = value
+
+    def get_agent_to_llm_config_map(self) -> dict[str, LLMConfig]:
+        """Get a map of agent names to llm configs."""
+        return {name: self.get_llm_config_from_agent(name) for name in self.agents}
+
+    def get_llm_config_from_agent(self, name='agent') -> LLMConfig:
+        agent_config: AgentConfig = self.get_agent_config(name)
+        llm_config_name = agent_config.llm_config
+        return self.get_llm_config(llm_config_name)
+
+    def get_agent_configs(self) -> dict[str, AgentConfig]:
+        return self.agents
+
+    def __post_init__(self):
+        """Post-initialization hook, called when the instance is created with only default values."""
+        AppConfig.defaults_dict = self.defaults_to_dict()
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            field_value = getattr(self, f.name)
+
+            # dataclasses compute their defaults themselves
+            if is_dataclass(type(field_value)):
+                result[f.name] = field_value.defaults_to_dict()
+            else:
+                result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        """Return ``AppConfig(name=value, ...)`` for every field, with secret values masked."""
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            # mask secret values in the rendered string:
+            # '******' when a value is set, None when it is empty
+            if attr_name in ['e2b_api_key', 'github_token', 'jwt_secret']:
+                attr_value = '******' if attr_value else None
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        # close the parenthesis opened right after the class name
+        return f"AppConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
--- a/openhands/core/config/config_utils.py
+++ b/openhands/core/config/config_utils.py
@@ -0,0 +1,44 @@
+from enum import Enum
+from types import UnionType
+from typing import get_args, get_origin
+
+OH_DEFAULT_AGENT = 'CodeActAgent'
+OH_MAX_ITERATIONS = 100
+
+
+class UndefinedString(str, Enum):
+    UNDEFINED = 'UNDEFINED'
+
+
+def get_field_info(f):
+    """Extract information about a dataclass field: type, optional, and default.
+
+    Args:
+        f: The field to extract information from.
+
+    Returns: A dict with the field's type, whether it's optional, and its default value.
+    """
+    field_type = f.type
+    optional = False
+
+    # for types like str | None, find the non-None type and set optional to True
+    # this is useful for the frontend to know if a field is optional
+    # and to show the correct type in the UI
+    # Note: this only works for UnionTypes with None as one of the types
+    if get_origin(field_type) is UnionType:
+        types = get_args(field_type)
+        non_none_arg = next((t for t in types if t is not type(None)), None)
+        if non_none_arg is not None:
+            field_type = non_none_arg
+            optional = True
+
+    # type name in a pretty format
+    type_name = (
+        field_type.__name__ if hasattr(field_type, '__name__') else str(field_type)
+    )
+
+    # default is always present
+    default = f.default
+
+    # return a schema with the useful info for frontend
+    return {'type': type_name.lower(), 'optional': optional, 'default': default}
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -0,0 +1,109 @@
+from dataclasses import dataclass, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key']
+
+
+@dataclass
+class LLMConfig:
+    """Configuration for the LLM model.
+
+    Attributes:
+        model: The model to use.
+        api_key: The API key to use.
+        base_url: The base URL for the API. This is necessary for local LLMs. It is also used for Azure embeddings.
+        api_version: The version of the API.
+        embedding_model: The embedding model to use.
+        embedding_base_url: The base URL for the embedding API.
+        embedding_deployment_name: The name of the deployment for the embedding API. This is used for Azure OpenAI.
+        aws_access_key_id: The AWS access key ID.
+        aws_secret_access_key: The AWS secret access key.
+        aws_region_name: The AWS region name.
+        num_retries: The number of retries to attempt.
+        retry_multiplier: The multiplier for the exponential backoff.
+        retry_min_wait: The minimum time to wait between retries, in seconds. This is exponential backoff minimum. For models with very low limits, this can be set to 15-20.
+        retry_max_wait: The maximum time to wait between retries, in seconds. This is exponential backoff maximum.
+        timeout: The timeout for the API.
+        max_message_chars: The approximate max number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated.
+        temperature: The temperature for the API.
+        top_p: The top p for the API.
+        custom_llm_provider: The custom LLM provider to use. This is undocumented in openhands, and normally not used. It is documented on the litellm side.
+        max_input_tokens: The maximum number of input tokens. Note that this is currently unused, and the value at runtime is actually the total tokens in OpenAI (e.g. 128,000 tokens for GPT-4).
+        max_output_tokens: The maximum number of output tokens. This is sent to the LLM.
+        input_cost_per_token: The cost per input token. This will be available in logs for the user to check.
+        output_cost_per_token: The cost per output token. This will be available in logs for the user to check.
+        ollama_base_url: The base URL for the OLLAMA API.
+        drop_params: Drop any unmapped (unsupported) params without causing an exception.
+        disable_vision: If model is vision capable, this option allows disabling image processing (useful for cost reduction).
+        caching_prompt: Using the prompt caching feature provided by the LLM.
+        log_completions: Whether to log LLM completions to the state.
+    """
+
+    model: str = 'gpt-4o'
+    api_key: str | None = None
+    base_url: str | None = None
+    api_version: str | None = None
+    embedding_model: str = 'local'
+    embedding_base_url: str | None = None
+    embedding_deployment_name: str | None = None
+    aws_access_key_id: str | None = None
+    aws_secret_access_key: str | None = None
+    aws_region_name: str | None = None
+    openrouter_site_url: str = 'https://docs.all-hands.dev/'
+    openrouter_app_name: str = 'OpenHands'
+    num_retries: int = 8
+    retry_multiplier: float = 2
+    retry_min_wait: int = 15
+    retry_max_wait: int = 120
+    timeout: int | None = None
+    max_message_chars: int = 10_000  # maximum number of characters in an observation's content when sent to the llm
+    temperature: float = 0.0
+    top_p: float = 1.0
+    custom_llm_provider: str | None = None
+    max_input_tokens: int | None = None
+    max_output_tokens: int | None = None
+    input_cost_per_token: float | None = None
+    output_cost_per_token: float | None = None
+    ollama_base_url: str | None = None
+    drop_params: bool = True
+    disable_vision: bool | None = None
+    caching_prompt: bool = False
+    log_completions: bool = False
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            if attr_name in LLM_SENSITIVE_FIELDS:
+                attr_value = '******' if attr_value else None
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"LLMConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
+
+    def to_safe_dict(self):
+        """Return a dict with the sensitive fields replaced with ******."""
+        ret = self.__dict__.copy()
+        for k, v in ret.items():
+            if k in LLM_SENSITIVE_FIELDS:
+                ret[k] = '******' if v else None
+        return ret
+
+    def set_missing_attributes(self):
+        """Set any missing attributes to their default values."""
+        for field_name, field_obj in self.__dataclass_fields__.items():
+            if not hasattr(self, field_name):
+                setattr(self, field_name, field_obj.default)
--- a/openhands/core/config/sandbox_config.py
+++ b/openhands/core/config/sandbox_config.py
@@ -0,0 +1,66 @@
+import os
+from dataclasses import dataclass, field, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class SandboxConfig:
+    """Configuration for the sandbox.
+
+    Attributes:
+        api_hostname: The hostname for the EventStream Runtime API.
+        base_container_image: The base container image from which to build the runtime image.
+        runtime_container_image: The runtime container image to use.
+        user_id: The user ID for the sandbox.
+        timeout: The timeout for the sandbox.
+        enable_auto_lint: Whether to enable auto-lint.
+        use_host_network: Whether to use the host network.
+        initialize_plugins: Whether to initialize plugins.
+        runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
+            This will be rendered into the end of the Dockerfile that builds the runtime image.
+            It can contain any valid shell commands (e.g., pip install numpy).
+            The path to the interpreter is available as $OH_INTERPRETER_PATH,
+            which can be used to install dependencies for the OH-specific Python interpreter.
+        runtime_startup_env_vars: The environment variables to set at the launch of the runtime.
+            This is a dictionary of key-value pairs.
+            This is useful for setting environment variables that are needed by the runtime.
+            For example, for specifying the base url of website for browsergym evaluation.
+        browsergym_eval_env: The BrowserGym environment to use for evaluation.
+            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
+    """
+
+    api_hostname: str = 'localhost'
+    api_key: str | None = None
+    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
+    runtime_container_image: str | None = None
+    user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
+    timeout: int = 120
+    enable_auto_lint: bool = (
+        False  # once enabled, OpenHands would lint files after editing
+    )
+    use_host_network: bool = False
+    initialize_plugins: bool = True
+    runtime_extra_deps: str | None = None
+    runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
+    browsergym_eval_env: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"SandboxConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
--- a/openhands/core/config/security_config.py
+++ b/openhands/core/config/security_config.py
@@ -0,0 +1,36 @@
+from dataclasses import dataclass, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class SecurityConfig:
+    """Configuration for security related functionalities.
+
+    Attributes:
+        confirmation_mode: Whether to enable confirmation mode.
+        security_analyzer: The security analyzer to use.
+    """
+
+    confirmation_mode: bool = False
+    security_analyzer: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"SecurityConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -0,0 +1,391 @@
+import argparse
+import os
+import pathlib
+import platform
+from dataclasses import is_dataclass
+from types import UnionType
+from typing import Any, MutableMapping, get_args, get_origin
+
+import toml
+from dotenv import load_dotenv
+
+from openhands.core import logger
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.app_config import AppConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+    UndefinedString,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+
+load_dotenv()
+
+
+def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, str]):
+    """Reads the env-style vars and sets config attributes based on env vars or a config.toml dict.
+    Compatible with vars like LLM_BASE_URL, AGENT_MEMORY_ENABLED, SANDBOX_TIMEOUT; values that fail to cast are logged and skipped.
+
+    Args:
+        cfg: The AppConfig object to set attributes on.
+        env_or_toml_dict: The environment variables or a config.toml dict.
+    """
+
+    def get_optional_type(union_type: UnionType) -> Any:
+        """Returns the non-None type from a Union."""
+        types = get_args(union_type)
+        return next((t for t in types if t is not type(None)), None)
+
+    # helper function to set attributes based on env vars
+    def set_attr_from_env(sub_config: Any, prefix=''):
+        """Set attributes of a config dataclass based on environment variables."""
+        for field_name, field_type in sub_config.__annotations__.items():
+            # compute the expected env var name from the prefix and field name
+            # e.g. LLM_BASE_URL
+            env_var_name = (prefix + field_name).upper()
+
+            if is_dataclass(field_type):
+                # nested dataclass
+                nested_sub_config = getattr(sub_config, field_name)
+                set_attr_from_env(nested_sub_config, prefix=field_name + '_')
+            elif env_var_name in env_or_toml_dict:
+                # convert the env var to the correct type and set it
+                value = env_or_toml_dict[env_var_name]
+
+                # skip unset/empty values (fall back to default); 0 and False from a toml dict are real values
+                if value is None or value == '':
+                    continue
+
+                try:
+                    # if it's an optional type, get the non-None type
+                    if get_origin(field_type) is UnionType:
+                        field_type = get_optional_type(field_type)
+
+                    # Attempt to cast the env var to type hinted in the dataclass
+                    if field_type is bool:
+                        cast_value = str(value).lower() in ['true', '1']
+                    else:
+                        cast_value = field_type(value)
+                    setattr(sub_config, field_name, cast_value)
+                except (ValueError, TypeError):
+                    logger.openhands_logger.error(
+                        f'Error setting env var {env_var_name}={value}: check that the value is of the right type'
+                    )
+
+    # Start processing from the root of the config object
+    set_attr_from_env(cfg)
+
+    # load default LLM config from env
+    default_llm_config = cfg.get_llm_config()
+    set_attr_from_env(default_llm_config, 'LLM_')
+    # load default agent config from env
+    default_agent_config = cfg.get_agent_config()
+    set_attr_from_env(default_agent_config, 'AGENT_')
+
+
+def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
+    """Load the config from the toml file. Supports both styles of config vars.
+
+    Args:
+        cfg: The AppConfig object to update attributes of.
+        toml_file: The path to the toml file. Defaults to 'config.toml'.
+    """
+    # try to read the config.toml file into the config object
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            toml_config = toml.load(toml_contents)
+    except FileNotFoundError:
+        return
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.warning(
+            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
+            exc_info=False,
+        )
+        return
+
+    # if core is not in the toml, treat the file as an old-style (env-style) toml
+    if 'core' not in toml_config:
+        # re-use the env loader to set the config from env-style vars
+        load_from_env(cfg, toml_config)
+        return
+
+    core_config = toml_config['core']
+
+    # load llm configs and agent configs
+    for key, value in toml_config.items():
+        if isinstance(value, dict):
+            try:
+                if key is not None and key.lower() == 'agent':
+                    logger.openhands_logger.debug(
+                        'Attempt to load default agent config from config toml'
+                    )
+                    non_dict_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    agent_config = AgentConfig(**non_dict_fields)
+                    cfg.set_agent_config(agent_config, 'agent')
+                    for nested_key, nested_value in value.items():
+                        if isinstance(nested_value, dict):
+                            logger.openhands_logger.debug(
+                                f'Attempt to load group {nested_key} from config toml as agent config'
+                            )
+                            agent_config = AgentConfig(**nested_value)
+                            cfg.set_agent_config(agent_config, nested_key)
+                elif key is not None and key.lower() == 'llm':
+                    logger.openhands_logger.debug(
+                        'Attempt to load default LLM config from config toml'
+                    )
+                    non_dict_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    llm_config = LLMConfig(**non_dict_fields)
+                    cfg.set_llm_config(llm_config, 'llm')
+                    for nested_key, nested_value in value.items():
+                        if isinstance(nested_value, dict):
+                            logger.openhands_logger.debug(
+                                f'Attempt to load group {nested_key} from config toml as llm config'
+                            )
+                            llm_config = LLMConfig(**nested_value)
+                            cfg.set_llm_config(llm_config, nested_key)
+                elif not key.startswith('sandbox') and key.lower() != 'core':
+                    logger.openhands_logger.warning(
+                        f'Unknown key in {toml_file}: "{key}"'
+                    )
+            except (TypeError, KeyError) as e:
+                logger.openhands_logger.warning(
+                    f'Cannot parse config from toml, toml values have not been applied.\n Error: {e}',
+                    exc_info=False,
+                )
+        else:
+            logger.openhands_logger.warning(f'Unknown key in {toml_file}: "{key}"')
+
+    try:
+        # set sandbox config from the toml file
+        sandbox_config = cfg.sandbox
+
+        # migrate old sandbox configs from [core] section to sandbox config
+        keys_to_migrate = [key for key in core_config if key.startswith('sandbox_')]
+        for key in keys_to_migrate:
+            new_key = key.removeprefix('sandbox_')  # strip only the leading prefix
+            if new_key in sandbox_config.__annotations__:
+                # read the key in sandbox and remove it from core
+                setattr(sandbox_config, new_key, core_config.pop(key))
+            else:
+                logger.openhands_logger.warning(f'Unknown sandbox config: {key}')
+
+        # the new style values override the old style values
+        if 'sandbox' in toml_config:
+            sandbox_config = SandboxConfig(**toml_config['sandbox'])
+
+        # update the config object with the new values
+        cfg.sandbox = sandbox_config
+        for key, value in core_config.items():
+            if hasattr(cfg, key):
+                setattr(cfg, key, value)
+            else:
+                logger.openhands_logger.warning(f'Unknown core config key: {key}')
+    except (TypeError, KeyError) as e:
+        logger.openhands_logger.warning(
+            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
+            exc_info=False,
+        )
+
+
+def finalize_config(cfg: AppConfig):
+    """Normalize derived settings on ``cfg`` in place after its values have been loaded."""
+    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
+    # an unset mount path falls back to the (now absolute) workspace base
+    if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
+        cfg.workspace_mount_path = cfg.workspace_base
+
+    if cfg.workspace_mount_rewrite:
+        # the rule is split on ':' and its first piece is replaced by the second in the base path
+        source_root = cfg.workspace_base or os.getcwd()
+        pieces = cfg.workspace_mount_rewrite.split(':')
+        cfg.workspace_mount_path = source_root.replace(pieces[0], pieces[1])
+
+    for llm_cfg in cfg.llms.values():
+        if llm_cfg.embedding_base_url is None:
+            llm_cfg.embedding_base_url = llm_cfg.base_url
+
+    if cfg.sandbox.use_host_network and platform.system() == 'Darwin':
+        logger.openhands_logger.warning(
+            'Please upgrade to Docker Desktop 4.29.0 or later to use host network mode on macOS. '
+            'See https://github.com/docker/roadmap/issues/238#issuecomment-2044688144 for more information.'
+        )
+
+    # make sure cache dir exists
+    if cfg.cache_dir:
+        pathlib.Path(cfg.cache_dir).mkdir(parents=True, exist_ok=True)
+
+
+# Utility function for command line --group argument
+def get_llm_config_arg(
+    llm_config_arg: str, toml_file: str = 'config.toml'
+) -> LLMConfig | None:
+    """Load one named group of llm settings from a toml config file.
+
+    A group in config.toml can look like this:
+
+    ```
+    [llm.gpt-3.5-for-eval]
+    model = 'gpt-3.5-turbo'
+    api_key = '...'
+    temperature = 0.5
+    num_retries = 8
+    ...
+    ```
+
+    The user-defined group name, like "gpt-3.5-for-eval", is the argument to this function. Surrounding
+    brackets and a leading "llm." are stripped, and the group is looked up under the top-level "llm" table.
+    The matching settings are returned as a new LLMConfig; this function does not modify the app config.
+
+    Args:
+        llm_config_arg: The name of the llm settings group to read, e.g. "gpt-3.5-for-eval".
+        toml_file: The path to the toml file. Defaults to 'config.toml'.
+
+    Returns:
+        The LLMConfig built from the group, or None if the file is missing, unparsable, or lacks the group.
+    """
+    # keep only the name, just in case
+    group_name = llm_config_arg.strip('[]')
+
+    # truncate the prefix, just in case
+    if group_name.startswith('llm.'):
+        group_name = group_name[len('llm.'):]
+
+    logger.openhands_logger.info(f'Loading llm config from {group_name}')
+
+    # read and parse the toml file; either failure is logged and yields None
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            parsed_toml = toml.load(toml_contents)
+    except FileNotFoundError as e:
+        logger.openhands_logger.error(f'Config file not found: {e}')
+        return None
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.error(
+            f'Cannot parse llm group from {group_name}. Exception: {e}'
+        )
+        return None
+
+    # the requested group must exist below the top-level llm table
+    if 'llm' not in parsed_toml or group_name not in parsed_toml['llm']:
+        logger.openhands_logger.debug(f'Loading from toml failed for {group_name}')
+        return None
+    return LLMConfig(**parsed_toml['llm'][group_name])
+
+
+# Command line arguments
+def get_parser() -> argparse.ArgumentParser:
+    """Build the argparse parser that declares the supported command line arguments."""
+    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
+    parser.add_argument(
+        '-d',
+        '--directory',
+        type=str,
+        help='The working directory for the agent',
+    )
+    parser.add_argument(
+        '-t',
+        '--task',
+        type=str,
+        default='',
+        help='The task for the agent to perform',
+    )
+    parser.add_argument(
+        '-f',
+        '--file',
+        type=str,
+        help='Path to a file containing the task. Overrides -t if both are provided.',
+    )
+    parser.add_argument(
+        '-c',
+        '--agent-cls',
+        default=OH_DEFAULT_AGENT,
+        type=str,
+        help='Name of the default agent to use',
+    )
+    parser.add_argument(
+        '-i',
+        '--max-iterations',
+        default=OH_MAX_ITERATIONS,
+        type=int,
+        help='The maximum number of iterations to run the agent',
+    )
+    parser.add_argument(
+        '-b',
+        '--max-budget-per-task',
+        type=float,
+        help='The maximum budget allowed per task, beyond which the agent will stop.',
+    )
+    # the --eval-* options below are for evaluations only
+    parser.add_argument(
+        '--eval-output-dir',
+        default='evaluation/evaluation_outputs/outputs',
+        type=str,
+        help='The directory to save evaluation output',
+    )
+    parser.add_argument(
+        '--eval-n-limit',
+        default=None,
+        type=int,
+        help='The number of instances to evaluate',
+    )
+    parser.add_argument(
+        '--eval-num-workers',
+        default=4,
+        type=int,
+        help='The number of workers to use for evaluation',
+    )
+    parser.add_argument(
+        '--eval-note',
+        default=None,
+        type=str,
+        help='The note to add to the evaluation directory',
+    )
+    parser.add_argument(
+        '-l',
+        '--llm-config',
+        default=None,
+        type=str,
+        help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
+    )
+    parser.add_argument(
+        '-n',
+        '--name',
+        default='default',
+        type=str,
+        help='Name for the session',
+    )
+    parser.add_argument(
+        '--eval-ids',
+        default=None,
+        type=str,
+        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
+    )
+    return parser
+
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse the command line, returning only the arguments the parser recognizes."""
+    # parse_known_args yields (namespace, leftovers); the leftovers are discarded
+    known_and_extra = get_parser().parse_known_args()
+    return known_and_extra[0]
+
+
+def load_app_config(set_logging_levels: bool = True) -> AppConfig:
+    """Build an AppConfig, apply config.toml and then environment variables, and finalize it.
+
+    Args:
+        set_logging_levels: Whether to set the global variables for logging levels.
+    """
+    app_config = AppConfig()
+    load_from_toml(app_config)
+    load_from_env(app_config, os.environ)
+    finalize_config(app_config)
+    if set_logging_levels:
+        logger.DEBUG = app_config.debug
+        logger.DISABLE_COLOR_PRINTING = app_config.disable_color
+    return app_config
--- a/openhands/core/exceptions.py
+++ b/openhands/core/exceptions.py
@@ -77,3 +77,10 @@ class UserCancelledError(Exception):
 class MicroAgentValidationError(Exception):
    def __init__(self, message='Micro agent validation failed'):
        super().__init__(message)
+
+
+class OperationCancelled(Exception):
+    """Signals that an operation was cancelled (e.g. by a keyboard interrupt)."""
+
+    def __init__(self, message='Operation was cancelled'):
+        Exception.__init__(self, message)
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -55,7 +55,6 @@ def create_runtime(

    config: The app config.
    sid: The session id.
-    runtime_tools_config: (will be deprecated) The runtime tools config.
    """
    # if sid is provided on the command line, use it as the name of the event stream
    # otherwise generate it on the basis of the configured jwt_secret
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -1,7 +1,10 @@
 import asyncio
 import copy
+import os
+import time
 import warnings
 from functools import partial
+from typing import Any

 from openhands.core.config import LLMConfig
 from openhands.runtime.utils.shutdown_listener import should_continue
@@ -24,15 +27,21 @@ from litellm.types.utils import CostPerToken
 from tenacity import (
    retry,
    retry_if_exception_type,
+    retry_if_not_exception_type,
    stop_after_attempt,
    wait_exponential,
 )

-from openhands.core.exceptions import LLMResponseError, UserCancelledError
+from openhands.core.exceptions import (
+    LLMResponseError,
+    OperationCancelled,
+    UserCancelledError,
+)
 from openhands.core.logger import get_llm_loggers
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message
 from openhands.core.metrics import Metrics
+from openhands.runtime.utils.shutdown_listener import should_exit

 __all__ = ['LLM']

@@ -69,6 +78,14 @@ class LLM:
        self.cost_metric_supported = True
        self.config = copy.deepcopy(config)

+        os.environ['OR_SITE_URL'] = self.config.openrouter_site_url
+        os.environ['OR_APP_NAME'] = self.config.openrouter_app_name
+
+        # list of LLM completions (for logging purposes). Each completion is a dict with the following keys:
+        # - 'messages': list of messages
+        # - 'response': response from the LLM
+        self.llm_completions: list[dict[str, Any]] = []
+
        # Set up config attributes with default values to prevent AttributeError
        LLMConfig.set_missing_attributes(self.config)

@@ -121,9 +138,6 @@ class LLM:
                ):
                    self.config.max_output_tokens = self.model_info['max_tokens']

-        if self.config.drop_params:
-            litellm.drop_params = self.config.drop_params
-
        # This only seems to work with Google as the provider, not with OpenRouter!
        gemini_safety_settings = (
            [
@@ -159,6 +173,7 @@ class LLM:
            timeout=self.config.timeout,
            temperature=self.config.temperature,
            top_p=self.config.top_p,
+            drop_params=self.config.drop_params,
            **(
                {'safety_settings': gemini_safety_settings}
                if gemini_safety_settings is not None
@@ -171,13 +186,18 @@ class LLM:

        completion_unwrapped = self._completion

-        def attempt_on_error(retry_state):
-            """Custom attempt function for litellm completion."""
+        def log_retry_attempt(retry_state):
+            """With before_sleep, this is called before `custom_completion_wait` and
+            ONLY if the retry is triggered by an exception."""
+            if should_exit():
+                raise OperationCancelled(
+                    'Operation cancelled.'
+                )  # exits the @retry loop
+            exception = retry_state.outcome.exception()
            logger.error(
-                f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize retry values in the configuration.',
+                f'{exception}. Attempt #{retry_state.attempt_number} | You can customize retry values in the configuration.',
                exc_info=False,
            )
-            return None

        def custom_completion_wait(retry_state):
            """Custom wait function for litellm completion."""
@@ -213,10 +233,13 @@ class LLM:
            return exponential_wait(retry_state)

        @retry(
-            after=attempt_on_error,
+            before_sleep=log_retry_attempt,
            stop=stop_after_attempt(self.config.num_retries),
            reraise=True,
-            retry=retry_if_exception_type(self.retry_exceptions),
+            retry=(
+                retry_if_exception_type(self.retry_exceptions)
+                & retry_if_not_exception_type(OperationCancelled)
+            ),
            wait=custom_completion_wait,
        )
        def wrapper(*args, **kwargs):
@@ -245,6 +268,16 @@ class LLM:
                logger.debug('No completion messages!')
                resp = {'choices': [{'message': {'content': ''}}]}

+            if self.config.log_completions:
+                self.llm_completions.append(
+                    {
+                        'messages': messages,
+                        'response': resp,
+                        'timestamp': time.time(),
+                        'cost': self.completion_cost(resp),
+                    }
+                )
+
            # log the response
            message_back = resp['choices'][0]['message']['content']
            if message_back:
@@ -269,7 +302,7 @@ class LLM:
            timeout=self.config.timeout,
            temperature=self.config.temperature,
            top_p=self.config.top_p,
-            drop_params=True,
+            drop_params=self.config.drop_params,
            **(
                {'safety_settings': gemini_safety_settings}
                if gemini_safety_settings is not None
@@ -280,10 +313,13 @@ class LLM:
        async_completion_unwrapped = self._async_completion

        @retry(
-            after=attempt_on_error,
+            before_sleep=log_retry_attempt,
            stop=stop_after_attempt(self.config.num_retries),
            reraise=True,
-            retry=retry_if_exception_type(self.retry_exceptions),
+            retry=(
+                retry_if_exception_type(self.retry_exceptions)
+                & retry_if_not_exception_type(OperationCancelled)
+            ),
            wait=custom_completion_wait,
        )
        async def async_completion_wrapper(*args, **kwargs):
@@ -353,10 +389,13 @@ class LLM:
                    pass

        @retry(
-            after=attempt_on_error,
+            before_sleep=log_retry_attempt,
            stop=stop_after_attempt(self.config.num_retries),
            reraise=True,
-            retry=retry_if_exception_type(self.retry_exceptions),
+            retry=(
+                retry_if_exception_type(self.retry_exceptions)
+                & retry_if_not_exception_type(OperationCancelled)
+            ),
            wait=custom_completion_wait,
        )
        async def async_acompletion_stream_wrapper(*args, **kwargs):
@@ -450,6 +489,9 @@ class LLM:
        return str(element)

    async def _call_acompletion(self, *args, **kwargs):
+        """This is a wrapper for the litellm acompletion function which
+        makes it mockable for testing.
+        """
        return await litellm.acompletion(*args, **kwargs)

    @property
@@ -530,10 +572,15 @@ class LLM:
            output_tokens = usage.get('completion_tokens')

            if input_tokens:
-                stats += 'Input tokens: ' + str(input_tokens) + '\n'
+                stats += 'Input tokens: ' + str(input_tokens)

            if output_tokens:
-                stats += 'Output tokens: ' + str(output_tokens) + '\n'
+                stats += (
+                    (' | ' if input_tokens else '')
+                    + 'Output tokens: '
+                    + str(output_tokens)
+                    + '\n'
+                )

            model_extra = usage.get('model_extra', {})

@@ -633,6 +680,7 @@ class LLM:

    def reset(self):
        self.metrics = Metrics()
+        self.llm_completions = []

    def format_messages_for_llm(self, messages: Message | list[Message]) -> list[dict]:
        if isinstance(messages, Message):
--- a/openhands/runtime/README.md
+++ b/openhands/runtime/README.md
@@ -74,6 +74,41 @@ Key features of the `RuntimeClient` class:
 - The system uses a plugin architecture for extensibility.
 - All interactions with the external environment are managed through the Runtime, ensuring a controlled and secure execution environment.

+## Runtime Types
+
+### EventStream Runtime
+
+The EventStream Runtime is designed for local execution using Docker containers:
+
+- Creates and manages a Docker container for each session
+- Executes actions within the container
+- Supports direct file system access and local resource management
+- Ideal for development, testing, and scenarios requiring full control over the execution environment
+
+Key features:
+- Real-time logging and debugging capabilities
+- Direct access to the local file system
+- Faster execution due to local resources
+
+This is the default runtime used within OpenHands.
+
+### Remote Runtime
+
+The Remote Runtime is designed for execution in a remote environment:
+
+- Connects to a remote server running the RuntimeClient
+- Executes actions by sending requests to the remote client
+- Supports distributed execution and cloud-based deployments
+- Ideal for production environments, scalability, and scenarios where local resource constraints are a concern
+
+Key features:
+- Scalability and resource flexibility
+- Reduced local resource usage
+- Support for cloud-based deployments
+- Potential for improved security through isolation
+
+At the time of this writing, this is mostly used in parallel evaluation, such as this example for [SWE-Bench](https://github.com/All-Hands-AI/OpenHands/tree/main/evaluation/swe_bench#run-inference-on-remoteruntime-experimental).
+
 ## Related Components

 - The runtime interacts closely with the event system defined in the `openhands.events` module.
--- a/openhands/runtime/client/client.py
+++ b/openhands/runtime/client/client.py
@@ -16,8 +16,10 @@ from pathlib import Path

 import pexpect
 from fastapi import FastAPI, HTTPException, Request, UploadFile
+from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
+from starlette.exceptions import HTTPException as StarletteHTTPException
 from uvicorn import run

 from openhands.core.logger import openhands_logger as logger
@@ -562,6 +564,35 @@ if __name__ == '__main__':

    app = FastAPI(lifespan=lifespan)

+    # TODO below 3 exception handlers were recommended by Sonnet.
+    # Are these something we should keep?
+    @app.exception_handler(Exception)
+    async def global_exception_handler(request: Request, exc: Exception):
+        """Log an otherwise-unhandled exception and answer with a generic 500 JSON body."""
+        logger.exception('Unhandled exception occurred:')
+        # the response carries only a generic message, not the exception itself
+        error_body = {
+            'message': 'An unexpected error occurred. Please try again later.'
+        }
+        return JSONResponse(status_code=500, content=error_body)
+
+    @app.exception_handler(StarletteHTTPException)
+    async def http_exception_handler(request: Request, exc: StarletteHTTPException):
+        """Log an HTTP exception and mirror its status code and detail as JSON."""
+        logger.error(f'HTTP exception occurred: {exc.detail}')
+        payload = {'message': exc.detail}
+        return JSONResponse(status_code=exc.status_code, content=payload)
+
+    @app.exception_handler(RequestValidationError)
+    async def validation_exception_handler(
+        request: Request, exc: RequestValidationError
+    ):
+        """Log a request validation failure and return its error list with status 422."""
+        logger.error(f'Validation error occurred: {exc}')
+        # exc.errors() supplies the 'details' entry of the response body
+        payload = {'message': 'Invalid request parameters', 'details': exc.errors()}
+        return JSONResponse(status_code=422, content=payload)
+
    @app.middleware('http')
    async def one_request_at_a_time(request: Request, call_next):
        assert client is not None
--- a/openhands/runtime/client/runtime.py
+++ b/openhands/runtime/client/runtime.py
@@ -1,8 +1,8 @@
 import os
 import tempfile
 import threading
-import time
 import uuid
+from typing import Callable
 from zipfile import ZipFile

 import docker
@@ -120,6 +120,7 @@ class EventStreamRuntime(Runtime):
        sid: str = 'default',
        plugins: list[PluginRequirement] | None = None,
        env_vars: dict[str, str] | None = None,
+        status_message_callback: Callable | None = None,
    ):
        self.config = config
        self._host_port = 30000  # initial dummy value
@@ -131,12 +132,13 @@ class EventStreamRuntime(Runtime):
        self.instance_id = (
            sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
        )
+        self.status_message_callback = status_message_callback

+        self.send_status_message('STATUS$STARTING_RUNTIME')
        self.docker_client: docker.DockerClient = self._init_docker_client()
        self.base_container_image = self.config.sandbox.base_container_image
        self.runtime_container_image = self.config.sandbox.runtime_container_image
        self.container_name = self.container_name_prefix + self.instance_id
-
        self.container = None
        self.action_semaphore = threading.Semaphore(1)  # Ensure one action at a time

@@ -147,7 +149,7 @@ class EventStreamRuntime(Runtime):
        self.log_buffer: LogBuffer | None = None

        if self.config.sandbox.runtime_extra_deps:
-            logger.info(
+            logger.debug(
                f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.runtime_extra_deps}'
            )

@@ -161,6 +163,8 @@ class EventStreamRuntime(Runtime):
                raise ValueError(
                    'Neither runtime container image nor base container image is set'
                )
+            logger.info('Preparing container, this might take a few minutes...')
+            self.send_status_message('STATUS$STARTING_CONTAINER')
            self.runtime_container_image = build_runtime_image(
                self.base_container_image,
                self.runtime_builder,
@@ -171,16 +175,23 @@ class EventStreamRuntime(Runtime):
            mount_dir=self.config.workspace_mount_path,  # e.g. /opt/openhands/_test_workspace
            plugins=plugins,
        )
+
        # will initialize both the event stream and the env vars
-        super().__init__(config, event_stream, sid, plugins, env_vars)
+        super().__init__(
+            config, event_stream, sid, plugins, env_vars, status_message_callback
+        )
+
+        logger.info('Waiting for client to become ready...')
+        self.send_status_message('STATUS$WAITING_FOR_CLIENT')

        self._wait_until_alive()

+        self.setup_initial_env()
+
        logger.info(
            f'Container initialized with plugins: {[plugin.name for plugin in self.plugins]}'
        )
-        logger.debug(f'Container initialized with env vars: {env_vars}')
-        time.sleep(1)
+        self.send_status_message(' ')

    @staticmethod
    def _init_docker_client() -> docker.DockerClient:
@@ -203,9 +214,8 @@ class EventStreamRuntime(Runtime):
        plugins: list[PluginRequirement] | None = None,
    ):
        try:
-            logger.info(
-                f'Starting container with image: {self.runtime_container_image} and name: {self.container_name}'
-            )
+            logger.info('Preparing to start container...')
+            self.send_status_message('STATUS$PREPARING_CONTAINER')
            plugin_arg = ''
            if plugins is not None and len(plugins) > 0:
                plugin_arg = (
@@ -243,7 +253,7 @@ class EventStreamRuntime(Runtime):
            if self.config.debug:
                environment['DEBUG'] = 'true'

-            logger.info(f'Workspace Base: {self.config.workspace_base}')
+            logger.debug(f'Workspace Base: {self.config.workspace_base}')
            if mount_dir is not None and sandbox_workspace_dir is not None:
                # e.g. result would be: {"/home/user/openhands/workspace": {'bind': "/workspace", 'mode': 'rw'}}
                volumes = {mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}}
@@ -261,6 +271,7 @@ class EventStreamRuntime(Runtime):
                )
            else:
                browsergym_arg = ''
+
            container = self.docker_client.containers.run(
                self.runtime_container_image,
                command=(
@@ -283,6 +294,7 @@ class EventStreamRuntime(Runtime):
            )
            self.log_buffer = LogBuffer(container)
            logger.debug(f'Container started. Server url: {self.api_url}')
+            self.send_status_message('STATUS$CONTAINER_STARTED')
            return container
        except Exception as e:
            logger.error(
@@ -292,19 +304,13 @@ class EventStreamRuntime(Runtime):
            self.close(close_client=False)
            raise e

-    @tenacity.retry(
-        stop=tenacity.stop_after_attempt(10),
-        wait=tenacity.wait_exponential(multiplier=2, min=1, max=20),
-        reraise=(ConnectionRefusedError,),
-    )
-    def _wait_until_alive(self):
+    def _refresh_logs(self):
        logger.debug('Getting container logs...')

        assert (
            self.log_buffer is not None
        ), 'Log buffer is expected to be initialized when container is started'

-        # Always process logs, regardless of client_ready status
        logs = self.log_buffer.get_and_clear()
        if logs:
            formatted_logs = '\n'.join([f'    |{log}' for log in logs])
@@ -318,24 +324,15 @@ class EventStreamRuntime(Runtime):
                + '-' * 80
            )

-        if not self.log_buffer.client_ready:
-            time.sleep(1)
-            attempts = 0
-            while not self.log_buffer.client_ready and attempts < 5:
-                attempts += 1
-                time.sleep(1)
-                logs = self.log_buffer.get_and_clear()
-                if logs:
-                    formatted_logs = '\n'.join([f'    |{log}' for log in logs])
-                    logger.info(
-                        '\n'
-                        + '-' * 35
-                        + 'Container logs:'
-                        + '-' * 35
-                        + f'\n{formatted_logs}'
-                        + '\n'
-                        + '-' * 80
-                    )
+    @tenacity.retry(
+        stop=tenacity.stop_after_attempt(10),
+        wait=tenacity.wait_exponential(multiplier=2, min=1, max=20),
+        reraise=(ConnectionRefusedError,),
+    )
+    def _wait_until_alive(self):
+        self._refresh_logs()
+        if not (self.log_buffer and self.log_buffer.client_ready):
+            raise RuntimeError('Runtime client is not ready.')

        response = self.session.get(f'{self.api_url}/alive')
        if response.status_code == 200:
@@ -415,8 +412,7 @@ class EventStreamRuntime(Runtime):
                    'Action has been rejected by the user! Waiting for further user input.'
                )

-            logger.debug('Awaiting session')
-            self._wait_until_alive()
+            self._refresh_logs()

            assert action.timeout is not None

@@ -442,8 +438,7 @@ class EventStreamRuntime(Runtime):
            except Exception as e:
                logger.error(f'Error during command execution: {e}')
                obs = ErrorObservation(f'Command execution failed: {str(e)}')
-            # TODO Refresh docker logs or not?
-            # self._wait_until_alive()
+            self._refresh_logs()
            return obs

    def run(self, action: CmdRunAction) -> Observation:
@@ -474,7 +469,7 @@ class EventStreamRuntime(Runtime):
        if not os.path.exists(host_src):
            raise FileNotFoundError(f'Source file {host_src} does not exist')

-        self._wait_until_alive()
+        self._refresh_logs()
        try:
            if recursive:
                # For recursive copy, create a zip file
@@ -516,15 +511,14 @@ class EventStreamRuntime(Runtime):
            if recursive:
                os.unlink(temp_zip_path)
            logger.debug(f'Copy completed: host:{host_src} -> runtime:{sandbox_dest}')
-            # Refresh docker logs
-            self._wait_until_alive()
+            self._refresh_logs()

    def list_files(self, path: str | None = None) -> list[str]:
        """List files in the sandbox.

        If path is None, list files in the sandbox's initial working directory (e.g., /workspace).
        """
-        self._wait_until_alive()
+        self._refresh_logs()
        try:
            data = {}
            if path is not None:
@@ -559,3 +553,8 @@ class EventStreamRuntime(Runtime):
                return port
        # If no port is found after max_attempts, return the last tried port
        return port
+
+    def send_status_message(self, message: str):
+        """Sends a status message if the callback function was provided."""
+        if self.status_message_callback:
+            self.status_message_callback(message)
--- a/openhands/runtime/e2b/runtime.py
+++ b/openhands/runtime/e2b/runtime.py
@@ -1,3 +1,5 @@
+from typing import Callable, Optional
+
 from openhands.core.config import AppConfig
 from openhands.events.action import (
    FileReadAction,
@@ -25,8 +27,15 @@ class E2BRuntime(Runtime):
        sid: str = 'default',
        plugins: list[PluginRequirement] | None = None,
        sandbox: E2BSandbox | None = None,
+        status_message_callback: Optional[Callable] = None,
    ):
-        super().__init__(config, event_stream, sid, plugins)
+        super().__init__(
+            config,
+            event_stream,
+            sid,
+            plugins,
+            status_message_callback=status_message_callback,
+        )
        if sandbox is None:
            self.sandbox = E2BSandbox()
        if not isinstance(self.sandbox, E2BSandbox):
--- a/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py
+++ b/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py
@@ -510,11 +510,11 @@ def _edit_file_impl(
        # NOTE: we need to get env var inside this function
        # because the env var will be set AFTER the agentskills is imported
        if enable_auto_lint:
-            # BACKUP the original file
-            original_file_backup_path = os.path.join(
-                os.path.dirname(file_name),
-                f'.backup.{os.path.basename(file_name)}',
-            )
+            # Generate a random temporary file path
+            suffix = os.path.splitext(file_name)[1]
+            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tfile:
+                original_file_backup_path = tfile.name
+
            with open(original_file_backup_path, 'w') as f:
                f.writelines(lines)

@@ -597,7 +597,9 @@ def _edit_file_impl(
                    file_name, 'w'
                ) as fout:
                    fout.write(fin.read())
-                os.remove(original_file_backup_path)
+
+                # Clean up the temporary backup file (it was created with delete=False)
+                os.unlink(original_file_backup_path)
                return ret_str

    except FileNotFoundError as e:
--- a/openhands/runtime/remote/runtime.py
+++ b/openhands/runtime/remote/runtime.py
@@ -2,6 +2,7 @@ import os
 import tempfile
 import threading
 import uuid
+from typing import Callable, Optional
 from zipfile import ZipFile

 import requests
@@ -55,6 +56,7 @@ class RemoteRuntime(Runtime):
        sid: str = 'default',
        plugins: list[PluginRequirement] | None = None,
        env_vars: dict[str, str] | None = None,
+        status_message_callback: Optional[Callable] = None,
    ):
        self.config = config
        if self.config.sandbox.api_hostname == 'localhost':
@@ -168,7 +170,9 @@ class RemoteRuntime(Runtime):
        )

        # Initialize the eventstream and env vars
-        super().__init__(config, event_stream, sid, plugins, env_vars)
+        super().__init__(
+            config, event_stream, sid, plugins, env_vars, status_message_callback
+        )

        logger.info(
            f'Runtime initialized with plugins: {[plugin.name for plugin in self.plugins]}'
--- a/openhands/runtime/runtime.py
+++ b/openhands/runtime/runtime.py
@@ -3,6 +3,7 @@ import copy
 import json
 import os
 from abc import abstractmethod
+from typing import Callable

 from openhands.core.config import AppConfig, SandboxConfig
 from openhands.core.logger import openhands_logger as logger
@@ -49,7 +50,7 @@ class Runtime:

    sid: str
    config: AppConfig
-    DEFAULT_ENV_VARS: dict[str, str]
+    initial_env_vars: dict[str, str]

    def __init__(
        self,
@@ -58,22 +59,24 @@ class Runtime:
        sid: str = 'default',
        plugins: list[PluginRequirement] | None = None,
        env_vars: dict[str, str] | None = None,
+        status_message_callback: Callable | None = None,
    ):
        self.sid = sid
        self.event_stream = event_stream
        self.event_stream.subscribe(EventStreamSubscriber.RUNTIME, self.on_event)
        self.plugins = plugins if plugins is not None and len(plugins) > 0 else []
+        self.status_message_callback = status_message_callback

        self.config = copy.deepcopy(config)
-        self.DEFAULT_ENV_VARS = _default_env_vars(config.sandbox)
        atexit.register(self.close)

-        if self.DEFAULT_ENV_VARS:
-            logger.debug(f'Adding default env vars: {self.DEFAULT_ENV_VARS}')
-            self.add_env_vars(self.DEFAULT_ENV_VARS)
+        self.initial_env_vars = _default_env_vars(config.sandbox)
        if env_vars is not None:
-            logger.debug(f'Adding provided env vars: {env_vars}')
-            self.add_env_vars(env_vars)
+            self.initial_env_vars.update(env_vars)
+
+    def setup_initial_env(self) -> None:
+        logger.debug(f'Adding env vars: {self.initial_env_vars}')
+        self.add_env_vars(self.initial_env_vars)

    def close(self) -> None:
        pass
--- a/openhands/server/session/agent_session.py
+++ b/openhands/server/session/agent_session.py
@@ -1,3 +1,6 @@
+import asyncio
+from typing import Callable, Optional
+
 from openhands.controller import AgentController
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
@@ -11,7 +14,7 @@ from openhands.storage.files import FileStore


 class AgentSession:
-    """Represents a session with an agent.
+    """Represents a session with an Agent

    Attributes:
        controller: The AgentController instance for controlling the agent.
@@ -26,7 +29,13 @@ class AgentSession:
    _closed: bool = False

    def __init__(self, sid: str, file_store: FileStore):
-        """Initializes a new instance of the Session class."""
+        """Initializes a new instance of the AgentSession class
+
+        Parameters:
+        - sid: The session ID
+        - file_store: Instance of the FileStore
+        """
+
        self.sid = sid
        self.event_stream = EventStream(sid, file_store)
        self.file_store = file_store
@@ -40,18 +49,24 @@ class AgentSession:
        max_budget_per_task: float | None = None,
        agent_to_llm_config: dict[str, LLMConfig] | None = None,
        agent_configs: dict[str, AgentConfig] | None = None,
+        status_message_callback: Optional[Callable] = None,
    ):
-        """Starts the agent session.
-
-        Args:
-            start_event: The start event data (optional).
+        """Starts the Agent session
+        Parameters:
+        - runtime_name: The name of the runtime associated with the session
+        - config:
+        - agent:
+        - max_iterations:
+        - max_budget_per_task:
+        - agent_to_llm_config:
+        - agent_configs:
        """
        if self.controller or self.runtime:
            raise RuntimeError(
                'Session already started. You need to close this session and start a new one.'
            )
        await self._create_security_analyzer(config.security.security_analyzer)
-        await self._create_runtime(runtime_name, config, agent)
+        await self._create_runtime(runtime_name, config, agent, status_message_callback)
        await self._create_controller(
            agent,
            config.security.confirmation_mode,
@@ -62,6 +77,8 @@ class AgentSession:
        )

    async def close(self):
+        """Closes the Agent session"""
+
        if self._closed:
            return
        if self.controller is not None:
@@ -75,27 +92,55 @@ class AgentSession:
        self._closed = True

    async def _create_security_analyzer(self, security_analyzer: str | None):
-        """Creates a SecurityAnalyzer instance that will be used to analyze the agent actions."""
-        logger.info(f'Using security analyzer: {security_analyzer}')
+        """Creates a SecurityAnalyzer instance that will be used to analyze the agent actions
+
+        Parameters:
+        - security_analyzer: The name of the security analyzer to use
+        """
+
        if security_analyzer:
+            logger.debug(f'Using security analyzer: {security_analyzer}')
            self.security_analyzer = options.SecurityAnalyzers.get(
                security_analyzer, SecurityAnalyzer
            )(self.event_stream)

-    async def _create_runtime(self, runtime_name: str, config: AppConfig, agent: Agent):
-        """Creates a runtime instance."""
+    async def _create_runtime(
+        self,
+        runtime_name: str,
+        config: AppConfig,
+        agent: Agent,
+        status_message_callback: Optional[Callable] = None,
+    ):
+        """Creates a runtime instance
+
+        Parameters:
+        - runtime_name: The name of the runtime associated with the session
+        - config:
+        - agent:
+        """
+
        if self.runtime is not None:
-            raise Exception('Runtime already created')
+            raise RuntimeError('Runtime already created')

        logger.info(f'Initializing runtime `{runtime_name}` now...')
        runtime_cls = get_runtime_cls(runtime_name)
-        self.runtime = runtime_cls(
+
+        self.runtime = await asyncio.to_thread(
+            runtime_cls,
            config=config,
            event_stream=self.event_stream,
            sid=self.sid,
            plugins=agent.sandbox_plugins,
+            status_message_callback=status_message_callback,
        )

+        if self.runtime is not None:
+            logger.debug(
+                f'Runtime initialized with plugins: {[plugin.name for plugin in self.runtime.plugins]}'
+            )
+        else:
+            logger.warning('Runtime initialization failed')
+
    async def _create_controller(
        self,
        agent: Agent,
@@ -105,7 +150,17 @@ class AgentSession:
        agent_to_llm_config: dict[str, LLMConfig] | None = None,
        agent_configs: dict[str, AgentConfig] | None = None,
    ):
-        """Creates an AgentController instance."""
+        """Creates an AgentController instance
+
+        Parameters:
+        - agent:
+        - confirmation_mode: Whether to use confirmation mode
+        - max_iterations:
+        - max_budget_per_task:
+        - agent_to_llm_config:
+        - agent_configs:
+        """
+
        if self.controller is not None:
            raise RuntimeError('Controller already created')
        if self.runtime is None:
@@ -113,8 +168,13 @@ class AgentSession:
                'Runtime must be initialized before the agent controller'
            )

-        logger.debug(f'Agents: {agent_configs}')
-        logger.info(f'Creating agent {agent.name} using LLM {agent.llm.config.model}')
+        logger.info(
+            '\n--------------------------------- OpenHands Configuration ---------------------------------\n'
+            f'LLM: {agent.llm.config.model}\n'
+            f'Base URL: {agent.llm.config.base_url}\n'
+            f'Agent: {agent.name}\n'
+            '-------------------------------------------------------------------------------------------'
+        )

        self.controller = AgentController(
            sid=self.sid,
@@ -136,4 +196,5 @@ class AgentSession:
            )
            logger.info(f'Restored agent state from session, sid: {self.sid}')
        except Exception as e:
-            logger.debug(f'Cannot restore state: {e}')
+            logger.debug(f'State could not be restored: {e}')
+        logger.info('Agent controller initialized.')
--- a/openhands/server/session/manager.py
+++ b/openhands/server/session/manager.py
@@ -35,9 +35,11 @@ class SessionManager:

    async def send(self, sid: str, data: dict[str, object]) -> bool:
        """Sends data to the client."""
-        if sid not in self._sessions:
+        session = self.get_session(sid)
+        if session is None:
+            logger.error(f'*** No session found for {sid}, skipping message ***')
            return False
-        return await self._sessions[sid].send(data)
+        return await session.send(data)

    async def send_error(self, sid: str, message: str) -> bool:
        """Sends an error message to the client."""
--- a/openhands/server/session/session.py
+++ b/openhands/server/session/session.py
@@ -21,7 +21,7 @@ from openhands.events.serialization import event_from_dict, event_to_dict
 from openhands.events.stream import EventStreamSubscriber
 from openhands.llm.llm import LLM
 from openhands.runtime.utils.shutdown_listener import should_continue
-from openhands.server.session.agent import AgentSession
+from openhands.server.session.agent_session import AgentSession
 from openhands.storage.files import FileStore

 DEL_DELT_SEC = 60 * 60 * 5
@@ -33,6 +33,7 @@ class Session:
    last_active_ts: int = 0
    is_alive: bool = True
    agent_session: AgentSession
+    loop: asyncio.AbstractEventLoop

    def __init__(
        self, sid: str, ws: WebSocket | None, config: AppConfig, file_store: FileStore
@@ -45,6 +46,7 @@ class Session:
            EventStreamSubscriber.SERVER, self.on_event
        )
        self.config = config
+        self.loop = asyncio.get_event_loop()

    async def close(self):
        self.is_alive = False
@@ -76,9 +78,7 @@ class Session:
            AgentStateChangedObservation('', AgentState.LOADING), EventSource.AGENT
        )
        # Extract the agent-relevant arguments from the request
-        args = {
-            key: value for key, value in data.get('args', {}).items() if value != ''
-        }
+        args = {key: value for key, value in data.get('args', {}).items()}
        agent_cls = args.get(ConfigType.AGENT, self.config.default_agent)
        self.config.security.confirmation_mode = args.get(
            ConfigType.CONFIRMATION_MODE, self.config.security.confirmation_mode
@@ -115,6 +115,7 @@ class Session:
                max_budget_per_task=self.config.max_budget_per_task,
                agent_to_llm_config=self.config.get_agent_to_llm_config_map(),
                agent_configs=self.config.get_agent_configs(),
+                status_message_callback=self.queue_status_message,
            )
        except Exception as e:
            logger.exception(f'Error creating controller: {e}')
@@ -127,7 +128,8 @@ class Session:
        )

    async def on_event(self, event: Event):
-        """Callback function for agent events.
+        """Callback function for events that mainly come from the agent.
+        Event is the base class for any agent action and observation.

        Args:
            event: The agent event (Observation or Action).
@@ -173,6 +175,9 @@ class Session:
            await asyncio.sleep(0.001)  # This flushes the data to the client
            self.last_active_ts = int(time.time())
            return True
+        except RuntimeError:
+            self.is_alive = False
+            return False
        except WebSocketDisconnect:
            self.is_alive = False
            return False
@@ -196,3 +201,8 @@ class Session:
            return False
        self.is_alive = data.get('is_alive', False)
        return True
+
+    def queue_status_message(self, message: str):
+        """Queues a status message to be sent asynchronously."""
+        # Ensure the coroutine runs in the main event loop
+        asyncio.run_coroutine_threadsafe(self.send_message(message), self.loop)
--- a/poetry.lock
+++ b/poetry.lock
@@ -571,17 +571,17 @@ files = [

 [[package]]
 name = "boto3"
-version = "1.35.22"
+version = "1.35.25"
 description = "The AWS SDK for Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "boto3-1.35.22-py3-none-any.whl", hash = "sha256:2109b632b451c1d4347a93a9abe6dc866c03db4ff1f910597f4543f1965829de"},
-    {file = "boto3-1.35.22.tar.gz", hash = "sha256:8f4f6e0860ca1b18cbb8d13f3a572a4c099577e741b10205b5604058af0e75b7"},
+    {file = "boto3-1.35.25-py3-none-any.whl", hash = "sha256:b1cfad301184cdd44dfd4805187ccab12de8dd28dd12a11a5cfdace17918c6de"},
+    {file = "boto3-1.35.25.tar.gz", hash = "sha256:5df4e2cbe3409db07d3a0d8d63d5220ce3202a78206ad87afdbb41519b26ce45"},
 ]

 [package.dependencies]
-botocore = ">=1.35.22,<1.36.0"
+botocore = ">=1.35.25,<1.36.0"
 jmespath = ">=0.7.1,<2.0.0"
 s3transfer = ">=0.10.0,<0.11.0"

@@ -590,13 +590,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]

 [[package]]
 name = "botocore"
-version = "1.35.22"
+version = "1.35.25"
 description = "Low-level, data-driven core of boto 3."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "botocore-1.35.22-py3-none-any.whl", hash = "sha256:d9bc656e7dde0b3e3f3080fc54bacff6a97fd7806b98acbcc21c7f9d4d0102b9"},
-    {file = "botocore-1.35.22.tar.gz", hash = "sha256:18362b7ec748561d786aebf1dd5c9faf22c4732efbf89344314199f96d3bbb65"},
+    {file = "botocore-1.35.25-py3-none-any.whl", hash = "sha256:e58d60260abf10ccc4417967923117c9902a6a0cff9fddb6ea7ff42dc1bd4630"},
+    {file = "botocore-1.35.25.tar.gz", hash = "sha256:76c5706b2c6533000603ae8683a297c887abbbaf6ee31e1b2e2863b74b2989bc"},
 ]

 [package.dependencies]
@@ -609,32 +609,32 @@ crt = ["awscrt (==0.21.5)"]

 [[package]]
 name = "browsergym"
-version = "0.6.4"
+version = "0.7.0"
 description = "BrowserGym: a gym environment for web task automation in the Chromium browser"
 optional = false
 python-versions = ">3.7"
 files = [
-    {file = "browsergym-0.6.4-py3-none-any.whl", hash = "sha256:929ff38c76c11b0982369c032d54ccd7ec9977a9cc6161205a4e2fb528f2f9f7"},
-    {file = "browsergym-0.6.4.tar.gz", hash = "sha256:91500f3f44b135a55fcab4b0710ec4f8790413205a621b31133c57a6eda5812a"},
+    {file = "browsergym-0.7.0-py3-none-any.whl", hash = "sha256:e2b98d2990ec1bfd80fd3e8034e60a60f363a5240be794e0ace975f24601d1a8"},
+    {file = "browsergym-0.7.0.tar.gz", hash = "sha256:e1cd9812b32a9387bac42b726bf7669c35a46b5fe6d1faf939333f095d5a6ba5"},
 ]

 [package.dependencies]
-browsergym-core = "0.6.4"
-browsergym-experiments = "0.6.4"
-browsergym-miniwob = "0.6.4"
-browsergym-visualwebarena = "0.6.4"
-browsergym-webarena = "0.6.4"
+browsergym-core = "0.7.0"
+browsergym-experiments = "0.7.0"
+browsergym-miniwob = "0.7.0"
+browsergym-visualwebarena = "0.7.0"
+browsergym-webarena = "0.7.0"
 browsergym-workarena = "*"

 [[package]]
 name = "browsergym-core"
-version = "0.6.4"
+version = "0.7.0"
 description = "BrowserGym: a gym environment for web task automation in the Chromium browser"
 optional = false
 python-versions = ">3.9"
 files = [
-    {file = "browsergym_core-0.6.4-py3-none-any.whl", hash = "sha256:da1edcb7de2cdbfbef54161886203ac22a76fa06f2bb0afcc0ec1e55044692ec"},
-    {file = "browsergym_core-0.6.4.tar.gz", hash = "sha256:5723666da2d7b6288c6521c2d6a0ebec2407cceb4a596f580e7aa4116ec73c3e"},
+    {file = "browsergym_core-0.7.0-py3-none-any.whl", hash = "sha256:4f4c7a153daa984701f76e81eaa358b4a9684e8f3fb4dcd80c807e7ed8112914"},
+    {file = "browsergym_core-0.7.0.tar.gz", hash = "sha256:069987057dcdea2c25b1b631691f93d77c2d042108079c16874128dcc459d809"},
 ]

 [package.dependencies]
@@ -648,62 +648,62 @@ pyparsing = ">=3"

 [[package]]
 name = "browsergym-experiments"
-version = "0.6.4"
+version = "0.7.0"
 description = "Experimentation tools for BrowserGym"
 optional = false
 python-versions = ">3.7"
 files = [
-    {file = "browsergym_experiments-0.6.4-py3-none-any.whl", hash = "sha256:1a1a38363554380444be759a40952fcd90114ae2b814f7e3aaad0b1b159bcb51"},
-    {file = "browsergym_experiments-0.6.4.tar.gz", hash = "sha256:136538e787a634e0b39c57f0995bc6cf02d1dfd5e0b34640e7362c34c615a96d"},
+    {file = "browsergym_experiments-0.7.0-py3-none-any.whl", hash = "sha256:c10f810eb631622804ebbf5e5783636cf8aff2a53ea0e38bfcfb129273865b1b"},
+    {file = "browsergym_experiments-0.7.0.tar.gz", hash = "sha256:9ee937720d2b84563851a2ae2c94c685da299fbadd957ba743ef7f1351fd0e23"},
 ]

 [package.dependencies]
-browsergym-core = "0.6.4"
+browsergym-core = "0.7.0"
 tiktoken = ">=0.4"

 [[package]]
 name = "browsergym-miniwob"
-version = "0.6.4"
+version = "0.7.0"
 description = "MiniWoB++ benchmark for BrowserGym"
 optional = false
 python-versions = ">3.7"
 files = [
-    {file = "browsergym_miniwob-0.6.4-py3-none-any.whl", hash = "sha256:f5a5d0635506957d5428ffbf73dc2acececb91bb5c6005414a03b9127c21ff4b"},
-    {file = "browsergym_miniwob-0.6.4.tar.gz", hash = "sha256:7a43f9db53ffcb613b21f583deef048523ba30133c373659c52d8890f05bcbf9"},
+    {file = "browsergym_miniwob-0.7.0-py3-none-any.whl", hash = "sha256:9223400aa737dcbca79884a6174b67635ec5b913f490232b60e5391fc34eecb4"},
+    {file = "browsergym_miniwob-0.7.0.tar.gz", hash = "sha256:b4d248541a86f9dc21c9fc5a03699ef16dfd96a97d9347d3c6ef4ae9145f691f"},
 ]

 [package.dependencies]
-browsergym-core = "0.6.4"
+browsergym-core = "0.7.0"

 [[package]]
 name = "browsergym-visualwebarena"
-version = "0.6.4"
+version = "0.7.0"
 description = "VisualWebArena benchmark for BrowserGym"
 optional = false
 python-versions = ">3.7"
 files = [
-    {file = "browsergym_visualwebarena-0.6.4-py3-none-any.whl", hash = "sha256:479d0e3bd73133a0addbb92056807442f94ddbb2b7532fcca3490c813948e6f0"},
-    {file = "browsergym_visualwebarena-0.6.4.tar.gz", hash = "sha256:82d7bb9b3096386909af0b3c3851e57d21e215a8e84b703b4ceac8a7b6268420"},
+    {file = "browsergym_visualwebarena-0.7.0-py3-none-any.whl", hash = "sha256:499124dd8a0619905049598428205cad4d3237e6acef80225f3c734f428b16b9"},
+    {file = "browsergym_visualwebarena-0.7.0.tar.gz", hash = "sha256:78fd89a922b94b7de912b6ab44d48845a25283eb7265c526811542f6833edbaa"},
 ]

 [package.dependencies]
-browsergym-core = "0.6.4"
+browsergym-core = "0.7.0"
 libvisualwebarena = "0.0.8"
 requests = "*"

 [[package]]
 name = "browsergym-webarena"
-version = "0.6.4"
+version = "0.7.0"
 description = "WebArena benchmark for BrowserGym"
 optional = false
 python-versions = ">3.7"
 files = [
-    {file = "browsergym_webarena-0.6.4-py3-none-any.whl", hash = "sha256:d08719c0088987ebd2d7436dfc7241e795b3af52ff5450df2f730380b5bd6349"},
-    {file = "browsergym_webarena-0.6.4.tar.gz", hash = "sha256:33cb485b2a8aaa0b377d3e8923e0691bb671a455918ff83145f9d4c6edd4972d"},
+    {file = "browsergym_webarena-0.7.0-py3-none-any.whl", hash = "sha256:d04b2cdadce47ffc9b4d6751f7f5dbd403e561cf4bf2b80801edcbb03bcf8ce6"},
+    {file = "browsergym_webarena-0.7.0.tar.gz", hash = "sha256:f7b0839ca009962457a03c948261fb36fbcbababd60208132ec77f92c6a19a59"},
 ]

 [package.dependencies]
-browsergym-core = "0.6.4"
+browsergym-core = "0.7.0"
 libwebarena = "0.0.3"

 [[package]]
@@ -3234,13 +3234,13 @@ files = [

 [[package]]
 name = "json-repair"
-version = "0.29.2"
+version = "0.29.4"
 description = "A package to repair broken json strings"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "json_repair-0.29.2-py3-none-any.whl", hash = "sha256:a92436eb3cf4e51eff3f25b4540d983910d4a23a2c65c28aff2896d1645c97d5"},
-    {file = "json_repair-0.29.2.tar.gz", hash = "sha256:246bd76a7e213166f82a81ab02812db29b7ca2dfaedc58cb98fb68d97536b956"},
+    {file = "json_repair-0.29.4-py3-none-any.whl", hash = "sha256:2d7addfa01e3b4c295c4ebabd5f393127adae0d345616d3a2517df8260429dae"},
+    {file = "json_repair-0.29.4.tar.gz", hash = "sha256:2921760e707ac0d0b63478402fd6ea3162d4191adf873b396becb31c47a1ac30"},
 ]

 [[package]]
@@ -3762,13 +3762,13 @@ types-tqdm = "*"

 [[package]]
 name = "litellm"
-version = "1.46.6"
+version = "1.48.0"
 description = "Library to easily interface with LLM API providers"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
 files = [
-    {file = "litellm-1.46.6-py3-none-any.whl", hash = "sha256:e568933a408ccb6d954c8f48b5a481542e2efaa69f1aad61307c6c99719fdf72"},
-    {file = "litellm-1.46.6.tar.gz", hash = "sha256:1c3196567c85507d05350cdcb4948c551705c9a38c3687c4098a3a58175bdd5a"},
+    {file = "litellm-1.48.0-py3-none-any.whl", hash = "sha256:7765e8a92069778f5fc66aacfabd0e2f8ec8d74fb117f5e475567d89b0d376b9"},
+    {file = "litellm-1.48.0.tar.gz", hash = "sha256:31a9b8a25a9daf44c24ddc08bf74298da920f2c5cea44135e5061278d0aa6fc9"},
 ]

 [package.dependencies]
@@ -3805,23 +3805,23 @@ pydantic = ">=1.10"

 [[package]]
 name = "llama-index"
-version = "0.11.10"
+version = "0.11.12"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index-0.11.10-py3-none-any.whl", hash = "sha256:5f5ef55e14c2f670a855abb4d24bf626333ef33dd630a90ec61c3b36635b2cf7"},
-    {file = "llama_index-0.11.10.tar.gz", hash = "sha256:1733f23bfb5afcea97a594d356994d379e6fb0da4e85d5adcb49757559f113e6"},
+    {file = "llama_index-0.11.12-py3-none-any.whl", hash = "sha256:a7d0b4065df2689cec1baeab9bfaed4d94e4ddc7e941df2ee47abfb218ce3ea1"},
+    {file = "llama_index-0.11.12.tar.gz", hash = "sha256:6b9220bf4c76a4ac0a82ccc642c3ea94f51381a9718ac601021f2fa95b74aab1"},
 ]

 [package.dependencies]
-llama-index-agent-openai = ">=0.3.1,<0.4.0"
+llama-index-agent-openai = ">=0.3.4,<0.4.0"
 llama-index-cli = ">=0.3.1,<0.4.0"
-llama-index-core = ">=0.11.10,<0.12.0"
+llama-index-core = ">=0.11.11,<0.12.0"
 llama-index-embeddings-openai = ">=0.2.4,<0.3.0"
 llama-index-indices-managed-llama-cloud = ">=0.3.0"
 llama-index-legacy = ">=0.9.48,<0.10.0"
-llama-index-llms-openai = ">=0.2.3,<0.3.0"
+llama-index-llms-openai = ">=0.2.9,<0.3.0"
 llama-index-multi-modal-llms-openai = ">=0.2.0,<0.3.0"
 llama-index-program-openai = ">=0.2.0,<0.3.0"
 llama-index-question-gen-openai = ">=0.2.0,<0.3.0"
@@ -3831,18 +3831,18 @@ nltk = ">3.8.1"

 [[package]]
 name = "llama-index-agent-openai"
-version = "0.3.1"
+version = "0.3.4"
 description = "llama-index agent openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_agent_openai-0.3.1-py3-none-any.whl", hash = "sha256:21886081ae74574d8cad1ec4886f2d4b082284b9364adb46fff87aba1a25e9b9"},
-    {file = "llama_index_agent_openai-0.3.1.tar.gz", hash = "sha256:41c9aa5b7cebc5043adddb3442e3167e97971589466e36448e04e0a767f5b9aa"},
+    {file = "llama_index_agent_openai-0.3.4-py3-none-any.whl", hash = "sha256:3720ce9bb12417a99a3fe84e52cce23e762b13f88a2dfc4292c76f4df9b26b4a"},
+    {file = "llama_index_agent_openai-0.3.4.tar.gz", hash = "sha256:80e3408d97121bebca3fa3ffd14b51285870c1c3c73d4ee04d3d18cfe6040466"},
 ]

 [package.dependencies]
 llama-index-core = ">=0.11.0,<0.12.0"
-llama-index-llms-openai = ">=0.2.0,<0.3.0"
+llama-index-llms-openai = ">=0.2.9,<0.3.0"
 openai = ">=1.14.0"

 [[package]]
@@ -3863,13 +3863,13 @@ llama-index-llms-openai = ">=0.2.0,<0.3.0"

 [[package]]
 name = "llama-index-core"
-version = "0.11.10"
+version = "0.11.12"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_core-0.11.10-py3-none-any.whl", hash = "sha256:2dddd7cb4ccee89fdbbddd62e5fe3c7ae7fc431130e0a0a7155daee052874191"},
-    {file = "llama_index_core-0.11.10.tar.gz", hash = "sha256:9929b11cfb24a3581620466660ab11a6360fde8c2441caa3660e0127df65c1b9"},
+    {file = "llama_index_core-0.11.12-py3-none-any.whl", hash = "sha256:7dc7ead649bac8f09e61c6c8bf93d257f68a7315223552421be4f0ffc3a8054d"},
+    {file = "llama_index_core-0.11.12.tar.gz", hash = "sha256:ce2dd037ff889d9ea6b25872228cc9de614c10445d19377f6ae5c66b93a50c61"},
 ]

 [package.dependencies]
@@ -4030,17 +4030,16 @@ llama-index-llms-openai = ">=0.2.0,<0.3.0"

 [[package]]
 name = "llama-index-llms-openai"
-version = "0.2.3"
+version = "0.2.9"
 description = "llama-index llms openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_llms_openai-0.2.3-py3-none-any.whl", hash = "sha256:dbcf8636bca57b0c063621a9981bbf2fdfaedd96c8e4906b8ddf0a139cb74d2f"},
-    {file = "llama_index_llms_openai-0.2.3.tar.gz", hash = "sha256:e9173b430331791c6a5a9df16796437ae4a3ae247fd4e0f281f7cbe59258b07a"},
+    {file = "llama_index_llms_openai-0.2.9-py3-none-any.whl", hash = "sha256:5f36e8cbca2c3c657380c711bd3974fe7e2344d3b6a8dde6c263e56868d01e27"},
+    {file = "llama_index_llms_openai-0.2.9.tar.gz", hash = "sha256:56376f39e3a40253b5c4fb90d0fb6af093f21bb2935925615f0c28a28d028187"},
 ]

 [package.dependencies]
-llama-index-agent-openai = ">=0.3.1,<0.4.0"
 llama-index-core = ">=0.11.7,<0.12.0"
 openai = ">=1.40.0,<2.0.0"

@@ -4540,13 +4539,13 @@ files = [

 [[package]]
 name = "minio"
-version = "7.2.8"
+version = "7.2.9"
 description = "MinIO Python SDK for Amazon S3 Compatible Cloud Storage"
 optional = false
 python-versions = ">3.8"
 files = [
-    {file = "minio-7.2.8-py3-none-any.whl", hash = "sha256:aa3b485788b63b12406a5798465d12a57e4be2ac2a58a8380959b6b748e64ddd"},
-    {file = "minio-7.2.8.tar.gz", hash = "sha256:f8af2dafc22ebe1aef3ac181b8e217037011c430aa6da276ed627e55aaf7c815"},
+    {file = "minio-7.2.9-py3-none-any.whl", hash = "sha256:fe5523d9c4a4d6cfc07e96905852841bccdb22b22770e1efca4bf5ae8b65774b"},
+    {file = "minio-7.2.9.tar.gz", hash = "sha256:a83c2fcd981944602a8dc11e8e07543ed9cda0a9462264e3f46a13171c56bccb"},
 ]

 [package.dependencies]
@@ -5366,13 +5365,13 @@ sympy = "*"

 [[package]]
 name = "openai"
-version = "1.46.1"
+version = "1.47.1"
 description = "The official Python library for the openai API"
 optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-1.46.1-py3-none-any.whl", hash = "sha256:7517f07117cf66012bbc55c49fd6b983eaac0f3d2a09c90cba1140d4455e4290"},
-    {file = "openai-1.46.1.tar.gz", hash = "sha256:e5cf7f268bf516de23686d496c9dae7f0dcdcd0e87af4d288deeab8329fcbbaf"},
+    {file = "openai-1.47.1-py3-none-any.whl", hash = "sha256:34277583bf268bb2494bc03f48ac123788c5e2a914db1d5a23d5edc29d35c825"},
+    {file = "openai-1.47.1.tar.gz", hash = "sha256:62c8f5f478f82ffafc93b33040f8bb16a45948306198bd0cba2da2ecd9cf7323"},
 ]

 [package.dependencies]
@@ -5663,40 +5662,53 @@ files = [

 [[package]]
 name = "pandas"
-version = "2.2.2"
+version = "2.2.3"
 description = "Powerful data structures for data analysis, time series, and statistics"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"},
-    {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"},
-    {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"},
-    {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"},
-    {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"},
-    {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"},
-    {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"},
-    {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"},
-    {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"},
-    {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"},
-    {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"},
-    {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"},
-    {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"},
-    {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"},
-    {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"},
-    {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"},
-    {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"},
-    {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"},
-    {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"},
-    {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"},
-    {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"},
-    {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"},
-    {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"},
-    {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"},
-    {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"},
-    {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"},
-    {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"},
-    {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"},
-    {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"},
+    {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"},
+    {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"},
+    {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"},
+    {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"},
+    {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"},
+    {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"},
+    {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"},
+    {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"},
+    {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"},
+    {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"},
+    {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"},
+    {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"},
+    {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"},
+    {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"},
+    {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"},
+    {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"},
+    {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"},
+    {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"},
+    {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"},
+    {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"},
+    {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"},
+    {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"},
+    {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"},
+    {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"},
+    {file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"},
+    {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"},
+    {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"},
+    {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"},
+    {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"},
+    {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"},
+    {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"},
+    {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"},
+    {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"},
+    {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"},
+    {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"},
+    {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"},
+    {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"},
+    {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"},
+    {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"},
+    {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"},
+    {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"},
+    {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"},
 ]

 [package.dependencies]
@@ -7355,29 +7367,29 @@ pyasn1 = ">=0.1.3"

 [[package]]
 name = "ruff"
-version = "0.6.5"
+version = "0.6.7"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.6.5-py3-none-linux_armv6l.whl", hash = "sha256:7e4e308f16e07c95fc7753fc1aaac690a323b2bb9f4ec5e844a97bb7fbebd748"},
-    {file = "ruff-0.6.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:932cd69eefe4daf8c7d92bd6689f7e8182571cb934ea720af218929da7bd7d69"},
-    {file = "ruff-0.6.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:3a8d42d11fff8d3143ff4da41742a98f8f233bf8890e9fe23077826818f8d680"},
-    {file = "ruff-0.6.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a50af6e828ee692fb10ff2dfe53f05caecf077f4210fae9677e06a808275754f"},
-    {file = "ruff-0.6.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:794ada3400a0d0b89e3015f1a7e01f4c97320ac665b7bc3ade24b50b54cb2972"},
-    {file = "ruff-0.6.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:381413ec47f71ce1d1c614f7779d88886f406f1fd53d289c77e4e533dc6ea200"},
-    {file = "ruff-0.6.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:52e75a82bbc9b42e63c08d22ad0ac525117e72aee9729a069d7c4f235fc4d276"},
-    {file = "ruff-0.6.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09c72a833fd3551135ceddcba5ebdb68ff89225d30758027280968c9acdc7810"},
-    {file = "ruff-0.6.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:800c50371bdcb99b3c1551d5691e14d16d6f07063a518770254227f7f6e8c178"},
-    {file = "ruff-0.6.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e25ddd9cd63ba1f3bd51c1f09903904a6adf8429df34f17d728a8fa11174253"},
-    {file = "ruff-0.6.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:7291e64d7129f24d1b0c947ec3ec4c0076e958d1475c61202497c6aced35dd19"},
-    {file = "ruff-0.6.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:9ad7dfbd138d09d9a7e6931e6a7e797651ce29becd688be8a0d4d5f8177b4b0c"},
-    {file = "ruff-0.6.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:005256d977021790cc52aa23d78f06bb5090dc0bfbd42de46d49c201533982ae"},
-    {file = "ruff-0.6.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:482c1e6bfeb615eafc5899127b805d28e387bd87db38b2c0c41d271f5e58d8cc"},
-    {file = "ruff-0.6.5-py3-none-win32.whl", hash = "sha256:cf4d3fa53644137f6a4a27a2b397381d16454a1566ae5335855c187fbf67e4f5"},
-    {file = "ruff-0.6.5-py3-none-win_amd64.whl", hash = "sha256:3e42a57b58e3612051a636bc1ac4e6b838679530235520e8f095f7c44f706ff9"},
-    {file = "ruff-0.6.5-py3-none-win_arm64.whl", hash = "sha256:51935067740773afdf97493ba9b8231279e9beef0f2a8079188c4776c25688e0"},
-    {file = "ruff-0.6.5.tar.gz", hash = "sha256:4d32d87fab433c0cf285c3683dd4dae63be05fd7a1d65b3f5bf7cdd05a6b96fb"},
+    {file = "ruff-0.6.7-py3-none-linux_armv6l.whl", hash = "sha256:08277b217534bfdcc2e1377f7f933e1c7957453e8a79764d004e44c40db923f2"},
+    {file = "ruff-0.6.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:c6707a32e03b791f4448dc0dce24b636cbcdee4dd5607adc24e5ee73fd86c00a"},
+    {file = "ruff-0.6.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:533d66b7774ef224e7cf91506a7dafcc9e8ec7c059263ec46629e54e7b1f90ab"},
+    {file = "ruff-0.6.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17a86aac6f915932d259f7bec79173e356165518859f94649d8c50b81ff087e9"},
+    {file = "ruff-0.6.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3f8822defd260ae2460ea3832b24d37d203c3577f48b055590a426a722d50ef"},
+    {file = "ruff-0.6.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ba4efe5c6dbbb58be58dd83feedb83b5e95c00091bf09987b4baf510fee5c99"},
+    {file = "ruff-0.6.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:525201b77f94d2b54868f0cbe5edc018e64c22563da6c5c2e5c107a4e85c1c0d"},
+    {file = "ruff-0.6.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8854450839f339e1049fdbe15d875384242b8e85d5c6947bb2faad33c651020b"},
+    {file = "ruff-0.6.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f0b62056246234d59cbf2ea66e84812dc9ec4540518e37553513392c171cb18"},
+    {file = "ruff-0.6.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b1462fa56c832dc0cea5b4041cfc9c97813505d11cce74ebc6d1aae068de36b"},
+    {file = "ruff-0.6.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:02b083770e4cdb1495ed313f5694c62808e71764ec6ee5db84eedd82fd32d8f5"},
+    {file = "ruff-0.6.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c05fd37013de36dfa883a3854fae57b3113aaa8abf5dea79202675991d48624"},
+    {file = "ruff-0.6.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f49c9caa28d9bbfac4a637ae10327b3db00f47d038f3fbb2195c4d682e925b14"},
+    {file = "ruff-0.6.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:a0e1655868164e114ba43a908fd2d64a271a23660195017c17691fb6355d59bb"},
+    {file = "ruff-0.6.7-py3-none-win32.whl", hash = "sha256:a939ca435b49f6966a7dd64b765c9df16f1faed0ca3b6f16acdf7731969deb35"},
+    {file = "ruff-0.6.7-py3-none-win_amd64.whl", hash = "sha256:590445eec5653f36248584579c06252ad2e110a5d1f32db5420de35fb0e1c977"},
+    {file = "ruff-0.6.7-py3-none-win_arm64.whl", hash = "sha256:b28f0d5e2f771c1fe3c7a45d3f53916fc74a480698c4b5731f0bea61e52137c8"},
+    {file = "ruff-0.6.7.tar.gz", hash = "sha256:44e52129d82266fa59b587e2cd74def5637b730a69c4542525dfdecfaae38bd5"},
 ]

 [[package]]
@@ -9676,4 +9688,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "5f9cad7d0c90e968417ff146361c530afc9819cbf511119e0c95502fd576dc55"
+content-hash = "90636ce436e5c05146a69730f461f46fd3185b595be37d3eafd8aef36667db81"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "openhands-ai"
-version = "0.9.3"
+version = "0.9.4"
 description = "OpenHands: Code Less, Make More"
 authors = ["OpenHands"]
 license = "MIT"
@@ -27,7 +27,7 @@ uvicorn = "*"
 types-toml = "*"
 numpy = "*"
 json-repair = "*"
-browsergym = "0.6.4" # integrate browsergym as the browsing interface
+browsergym = "0.7.0" # integrate browsergym as the browsing interface
 html2text = "*"
 e2b = "^0.17.1"
 pexpect = "*"
@@ -65,7 +65,7 @@ llama-index-embeddings-azure-openai = "*"
 llama-index-embeddings-ollama = "*"

 [tool.poetry.group.dev.dependencies]
-ruff = "0.6.5"
+ruff = "0.6.7"
 mypy = "1.11.2"
 pre-commit = "3.8.0"
 build = "*"
--- a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_001.log
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_001.log
@@ -402,7 +402,13 @@ The server is running on port 5000 with PID 126. You can access the list of numb


 NOW, LET'S START!
+
+----------
+
 Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.

+----------
+
+

 ENVIRONMENT REMINDER: You have 19 turns left to complete the task. When finished reply with <finish></finish>.
--- a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_005.log
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_005.log
@@ -402,6 +402,9 @@ The server is running on port 5000 with PID 126. You can access the list of numb


 NOW, LET'S START!
+
+----------
+
 Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.

 ----------
@@ -414,7 +417,10 @@ Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me fo
 ----------

 OBSERVATION:
-{'content': 'The answer to life, the universe, and everything has been revealed: OpenHands is all you need!'}
+The answer to life, the universe, and everything has been revealed: OpenHands is all you need!
+
+----------
+


 ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>.
--- a/tests/unit/test_arg_parser.py
+++ b/tests/unit/test_arg_parser.py
@@ -1,6 +1,6 @@
 import pytest

-from openhands.core.config import _DEFAULT_AGENT, _MAX_ITERATIONS, get_parser
+from openhands.core.config import OH_DEFAULT_AGENT, OH_MAX_ITERATIONS, get_parser


 def test_parser_default_values():
@@ -10,8 +10,8 @@ def test_parser_default_values():
    assert args.directory is None
    assert args.task == ''
    assert args.file is None
-    assert args.agent_cls == _DEFAULT_AGENT
-    assert args.max_iterations == _MAX_ITERATIONS
+    assert args.agent_cls == OH_DEFAULT_AGENT
+    assert args.max_iterations == OH_MAX_ITERATIONS
    assert args.max_budget_per_task is None
    assert args.eval_output_dir == 'evaluation/evaluation_outputs/outputs'
    assert args.eval_n_limit is None
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -1,15 +1,38 @@
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch

 import pytest
+from litellm.exceptions import (
+    APIConnectionError,
+    ContentPolicyViolationError,
+    InternalServerError,
+    OpenAIError,
+    RateLimitError,
+)

 from openhands.core.config import LLMConfig
+from openhands.core.exceptions import OperationCancelled
 from openhands.core.metrics import Metrics
 from openhands.llm.llm import LLM


+@pytest.fixture(autouse=True)
+def mock_logger(monkeypatch):
+    # suppress logging of completion data to file
+    mock_logger = MagicMock()
+    monkeypatch.setattr('openhands.llm.llm.llm_prompt_logger', mock_logger)
+    monkeypatch.setattr('openhands.llm.llm.llm_response_logger', mock_logger)
+    return mock_logger
+
+
@pytest.fixture
 def default_config():
-    return LLMConfig(model='gpt-4o', api_key='test_key')
+    return LLMConfig(
+        model='gpt-4o',
+        api_key='test_key',
+        num_retries=2,
+        retry_min_wait=1,
+        retry_max_wait=2,
+    )


 def test_llm_init_with_default_config(default_config):
@@ -64,7 +87,7 @@ def test_llm_init_with_metrics():


 def test_llm_reset():
-    llm = LLM(LLMConfig(model='gpt-3.5-turbo', api_key='test_key'))
+    llm = LLM(LLMConfig(model='gpt-4o-mini', api_key='test_key'))
    initial_metrics = llm.metrics
    llm.reset()
    assert llm.metrics is not initial_metrics
@@ -73,7 +96,7 @@ def test_llm_reset():

@patch('openhands.llm.llm.litellm.get_model_info')
 def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
-    default_config.model = 'openrouter:gpt-3.5-turbo'
+    default_config.model = 'openrouter:gpt-4o-mini'
    mock_get_model_info.return_value = {
        'max_input_tokens': 7000,
        'max_output_tokens': 1500,
@@ -81,4 +104,197 @@ def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
    llm = LLM(default_config)
    assert llm.config.max_input_tokens == 7000
    assert llm.config.max_output_tokens == 1500
-    mock_get_model_info.assert_called_once_with('openrouter:gpt-3.5-turbo')
+    mock_get_model_info.assert_called_once_with('openrouter:gpt-4o-mini')
+
+
+# Tests involving completion and retries
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_with_mocked_logger(
+    mock_litellm_completion, default_config, mock_logger
+):
+    # Happy path: a single stubbed completion whose content is returned.
+    canned_reply = {'choices': [{'message': {'content': 'Test response'}}]}
+    mock_litellm_completion.return_value = canned_reply
+
+    client = LLM(config=default_config)
+    prompt = [{'role': 'user', 'content': 'Hello!'}]
+    reply = client.completion(messages=prompt, stream=False)
+
+    assert reply['choices'][0]['message']['content'] == 'Test response'
+    assert mock_litellm_completion.call_count == 1
+
+    # The mock_logger fixture stands in for the LLM loggers, so a debug call on
+    # it confirms that something was logged during the completion.
+    mock_logger.debug.assert_called()
+
+
+@pytest.mark.parametrize(
+    'exception_class,extra_args,expected_retries',
+    [
+        (
+            APIConnectionError,
+            {'llm_provider': 'test_provider', 'model': 'test_model'},
+            2,
+        ),
+        (
+            ContentPolicyViolationError,
+            {'model': 'test_model', 'llm_provider': 'test_provider'},
+            2,
+        ),
+        (
+            InternalServerError,
+            {'llm_provider': 'test_provider', 'model': 'test_model'},
+            2,
+        ),
+        (OpenAIError, {}, 2),
+        (RateLimitError, {'llm_provider': 'test_provider', 'model': 'test_model'}, 2),
+    ],
+)
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_retries(
+    mock_litellm_completion,
+    default_config,
+    exception_class,
+    extra_args,
+    expected_retries,
+):
+    # Fail once with a retryable error, then succeed on the next attempt.
+    mock_litellm_completion.side_effect = [
+        exception_class('Test error message', **extra_args),
+        {'choices': [{'message': {'content': 'Retry successful'}}]},
+    ]
+    llm = LLM(config=default_config)
+    # Stub time.sleep: the back-off between attempts would otherwise block for real.
+    with patch('time.sleep'):
+        response = llm.completion(
+            messages=[{'role': 'user', 'content': 'Hello!'}], stream=False
+        )
+    assert response['choices'][0]['message']['content'] == 'Retry successful'
+    assert mock_litellm_completion.call_count == expected_retries
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_rate_limit_wait_time(mock_litellm_completion, default_config):
+    # One rate-limited attempt, then success; the sleep in between is inspected.
+    rate_limited = RateLimitError(
+        'Rate limit exceeded', llm_provider='test_provider', model='test_model'
+    )
+    success = {'choices': [{'message': {'content': 'Retry successful'}}]}
+    mock_litellm_completion.side_effect = [rate_limited, success]
+
+    with patch('time.sleep') as mock_sleep:
+        llm = LLM(config=default_config)
+        response = llm.completion(
+            messages=[{'role': 'user', 'content': 'Hello!'}],
+            stream=False,
+        )
+
+    assert response['choices'][0]['message']['content'] == 'Retry successful'
+    assert mock_litellm_completion.call_count == 2
+
+    mock_sleep.assert_called_once()
+    wait_time = mock_sleep.call_args[0][0]  # first positional arg of the sleep
+    assert 60 <= wait_time <= 240, (
+        f'Expected wait time between 60 and 240 seconds, but got {wait_time}'
+    )
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_exhausts_retries(mock_litellm_completion, default_config):
+    # Every attempt fails, so the error surfaces once num_retries calls are used up.
+    mock_litellm_completion.side_effect = APIConnectionError(
+        'Persistent error', llm_provider='test_provider', model='test_model'
+    )
+
+    llm = LLM(config=default_config)
+    # Stub time.sleep so the wait between attempts (presumably governed by the
+    # retry_*_wait settings) does not slow the suite down.
+    with patch('time.sleep'), pytest.raises(APIConnectionError):
+        llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}], stream=False)
+
+    assert mock_litellm_completion.call_count == llm.config.num_retries
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_operation_cancelled(mock_litellm_completion, default_config):
+    # OperationCancelled must surface after a single attempt (no retry).
+    cancelled = OperationCancelled('Operation cancelled')
+    mock_litellm_completion.side_effect = cancelled
+    client = LLM(config=default_config)
+    prompt = [{'role': 'user', 'content': 'Hello!'}]
+
+    with pytest.raises(OperationCancelled):
+        client.completion(messages=prompt, stream=False)
+
+    assert mock_litellm_completion.call_count == 1
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_keyboard_interrupt(mock_litellm_completion, default_config):
+    # The stubbed litellm call raises KeyboardInterrupt on its first use.
+    mock_litellm_completion.side_effect = KeyboardInterrupt(
+        'Simulated KeyboardInterrupt'
+    )
+
+    client = LLM(config=default_config)
+    prompt = [{'role': 'user', 'content': 'Hello!'}]
+
+    # Accept the interrupt either as-is or already converted to OperationCancelled;
+    # anything else (including no exception at all) fails the test.
+    interrupt_types = (KeyboardInterrupt, OperationCancelled)
+    with pytest.raises(interrupt_types):
+        client.completion(messages=prompt, stream=False)
+
+    # Only one attempt: the interrupt is not retried.
+    assert mock_litellm_completion.call_count == 1
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_keyboard_interrupt_handler(mock_litellm_completion, default_config):
+    # Completion still returns normally when the stub flips the module flag mid-call.
+    global _should_exit  # NOTE(review): test-module global only — confirm intent
+
+    def side_effect(*args, **kwargs):
+        global _should_exit
+        _should_exit = True
+        return {'choices': [{'message': {'content': 'Simulated interrupt response'}}]}
+
+    mock_litellm_completion.side_effect = side_effect
+    llm = LLM(config=default_config)
+    try:
+        result = llm.completion(
+            messages=[{'role': 'user', 'content': 'Hello!'}], stream=False
+        )
+        flag_was_set = _should_exit
+    finally:
+        _should_exit = False  # always reset so a failure cannot leak the flag
+    assert mock_litellm_completion.call_count == 1
+    assert result['choices'][0]['message']['content'] == 'Simulated interrupt response'
+    assert flag_was_set
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
+    # Checks the returned content and the kwargs that reach litellm_completion.
+    expected_text = 'This is a mocked response.'
+    mock_litellm_completion.return_value = {
+        'choices': [{'message': {'content': expected_text}}]
+    }
+
+    client = LLM(config=default_config)
+    reply = client.completion(
+        messages=[{'role': 'user', 'content': 'Hello!'}],
+        stream=False,
+        drop_params=True,
+    )
+
+    # The stubbed reply comes back after exactly one litellm call.
+    assert reply['choices'][0]['message']['content'] == expected_text
+    mock_litellm_completion.assert_called_once()
+
+    forwarded = mock_litellm_completion.call_args[1]  # kwargs of that call
+    assert forwarded['model'] == default_config.model
+    assert forwarded['messages'] == [{'role': 'user', 'content': 'Hello!'}]
+    assert not forwarded['stream']