diff --git a/.github/ISSUE_TEMPLATE/bug_template.yml b/.github/ISSUE_TEMPLATE/bug_template.yml
index 0ae2968397..6089f1c2ff 100644
--- a/.github/ISSUE_TEMPLATE/bug_template.yml
+++ b/.github/ISSUE_TEMPLATE/bug_template.yml
@@ -12,7 +12,7 @@ body:
       label: Is there an existing issue for the same bug?
       description: Please check if an issue already exists for the bug you encountered.
       options:
-      - label: I have checked the troubleshooting document at https://github.com/OpenDevin/OpenDevin/blob/main/docs/guides/Troubleshooting.md
+      - label: I have checked the troubleshooting document at https://opendevin.github.io/OpenDevin/modules/usage/troubleshooting
         required: true
       - label: I have checked the existing issues.
         required: true
diff --git a/.github/workflows/dummy-agent-test.yml b/.github/workflows/dummy-agent-test.yml
new file mode 100644
index 0000000000..0a853a7b4b
--- /dev/null
+++ b/.github/workflows/dummy-agent-test.yml
@@ -0,0 +1,21 @@
+name: Run e2e test with dummy agent
+
+on: [push]  # triggered by every push event
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'  # quoted so the version stays a string
+      - name: Set up environment  # installs Poetry + project deps (minus the "evaluation" group), then downloads the bge-small-en-v1.5 pooling config into /tmp/llama_index
+        run: |
+          curl -sSL https://install.python-poetry.org | python3 -
+          poetry install --without evaluation
+          wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
+      - name: Run tests  # runs opendevin/main.py with -c DummyAgent; the -m value is presumably a placeholder rather than a real model -- TODO confirm
+        run: |
+          poetry run python opendevin/main.py -t "do a flip" -m ollama/not-a-model -d ./workspace/ -c DummyAgent
diff --git a/.github/workflows/ghcr.yml b/.github/workflows/ghcr.yml
index d6cb826edc..7d8405ebaf 100644
--- a/.github/workflows/ghcr.yml
+++ b/.github/workflows/ghcr.yml
@@ -42,8 +42,21 @@ jobs:
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Delete huge unnecessary tools folder
-        run: rm -rf /opt/hostedtoolcache
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@v1.3.1  # pinned to a release tag instead of the moving `main` branch
+        with:
+          # setting this to "true" frees about 6 GB, but it might
+          # remove tools that are actually needed
+          tool-cache: true
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
 
       - name: Build and push ${{ matrix.image }}
         if: github.event.pull_request.head.repo.full_name == github.repository
diff --git a/.github/workflows/run-unit-tests.yml b/.github/workflows/run-unit-tests.yml
index 4e34ec2474..47697da73d 100644
--- a/.github/workflows/run-unit-tests.yml
+++ b/.github/workflows/run-unit-tests.yml
@@ -34,11 +34,16 @@ jobs:
           brew install colima docker
           colima start
 
+          # For testcontainers to find the Colima socket
+          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
+          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
+
       - name: Build Environment
         run: make build
 
       - name: Run Tests
         run: poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:
@@ -70,6 +75,7 @@ jobs:
 
       - name: Run Tests
         run: poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:
diff --git a/Development.md b/Development.md
index f98e3e437d..c67b569014 100644
--- a/Development.md
+++ b/Development.md
@@ -82,3 +82,15 @@ If you encounter any issues with the Language Model (LM) or you're simply curiou
     ```bash
     make help
     ```
+
+### 8. Testing
+
+#### Unit tests
+
+```bash
+poetry run pytest ./tests/unit/test_sandbox.py
+```
+
+#### Integration tests
+
+Please refer to [this README](./tests/integration/README.md) for details.
diff --git a/Makefile b/Makefile
index ab1062114e..0a2ce42ce3 100644
--- a/Makefile
+++ b/Makefile
@@ -200,12 +200,22 @@ setup-config-prompts:
 	@read -p "Enter your LLM Base URL [mostly used for local LLMs, leave blank if not needed - example: http://localhost:5001/v1/]: " llm_base_url; \
 	 if [[ ! -z "$$llm_base_url" ]]; then echo "LLM_BASE_URL=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; fi
 
-	@echo "Enter your LLM Embedding Model\nChoices are openai, azureopenai, llama2 or leave blank to default to 'BAAI/bge-small-en-v1.5' via huggingface"; \
-	 read -p "> " llm_embedding_model; \
-	 	echo "LLM_EMBEDDING_MODEL=\"$$llm_embedding_model\"" >> $(CONFIG_FILE).tmp; \
-		if [ "$$llm_embedding_model" = "llama2" ]; then \
-			read -p "Enter the local model URL (will overwrite LLM_BASE_URL): " llm_base_url; \
-				echo "LLM_BASE_URL=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; \
+	@echo "Enter your LLM Embedding Model"; \
+		echo "Choices are:"; \
+		echo "  - openai"; \
+		echo "  - azureopenai"; \
+		echo "  - Embeddings available only with OllamaEmbedding:"; \
+		echo "    - llama2"; \
+		echo "    - mxbai-embed-large"; \
+		echo "    - nomic-embed-text"; \
+		echo "    - all-minilm"; \
+		echo "    - stable-code"; \
+		echo "  - Leave blank to default to 'BAAI/bge-small-en-v1.5' via huggingface"; \
+		read -p "> " llm_embedding_model; \
+		echo "LLM_EMBEDDING_MODEL=\"$$llm_embedding_model\"" >> $(CONFIG_FILE).tmp; \
+		if [ "$$llm_embedding_model" = "llama2" ] || [ "$$llm_embedding_model" = "mxbai-embed-large" ] || [ "$$llm_embedding_model" = "nomic-embed-text" ] || [ "$$llm_embedding_model" = "all-minilm" ] || [ "$$llm_embedding_model" = "stable-code" ]; then \
+			read -p "Enter the local model URL for the embedding model (will set LLM_EMBEDDING_BASE_URL): " llm_embedding_base_url; \
+				echo "LLM_EMBEDDING_BASE_URL=\"$$llm_embedding_base_url\"" >> $(CONFIG_FILE).tmp; \
 		elif [ "$$llm_embedding_model" = "azureopenai" ]; then \
 			read -p "Enter the Azure endpoint URL (will overwrite LLM_BASE_URL): " llm_base_url; \
 				echo "LLM_BASE_URL=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; \
diff --git a/agenthub/README.md b/agenthub/README.md
index c2c756a819..d4c80e8214 100644
--- a/agenthub/README.md
+++ b/agenthub/README.md
@@ -26,6 +26,7 @@ The `state` contains:
 Here is a list of available Actions, which can be returned by `agent.step()`:
 - [`CmdRunAction`](../opendevin/action/bash.py) - Runs a command inside a sandboxed terminal
 - [`CmdKillAction`](../opendevin/action/bash.py) - Kills a background command
+- [`IPythonRunCellAction`](../opendevin/action/bash.py) - Executes a block of Python code interactively (in a Jupyter notebook) and receives a `CmdOutputObservation`. Requires the `jupyter` [plugin](../opendevin/sandbox/plugins) to be set up as a requirement.
 - [`FileReadAction`](../opendevin/action/fileop.py) - Reads the content of a file
 - [`FileWriteAction`](../opendevin/action/fileop.py) - Writes new content to a file
 - [`BrowseURLAction`](../opendevin/action/browse.py) - Gets the content of a URL
@@ -33,6 +34,7 @@ Here is a list of available Actions, which can be returned by `agent.step()`:
 - [`AddTaskAction`](../opendevin/action/tasks.py) - Adds a subtask to the plan
 - [`ModifyTaskAction`](../opendevin/action/tasks.py) - Changes the state of a subtask
 - [`AgentThinkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history (as well as the chat log)
+- [`AgentTalkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history and talk to the user.
 - [`AgentFinishAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user to enter a new task
 
 You can use `action.to_dict()` and `action_from_dict` to serialize and deserialize actions.
diff --git a/agenthub/SWE_agent/__init__.py b/agenthub/SWE_agent/__init__.py
index 8f0f418a72..58e5f72038 100644
--- a/agenthub/SWE_agent/__init__.py
+++ b/agenthub/SWE_agent/__init__.py
@@ -1,4 +1,5 @@
 from opendevin.agent import Agent
+
 from .agent import SWEAgent
 
 Agent.register('SWEAgent', SWEAgent)
diff --git a/agenthub/SWE_agent/agent.py b/agenthub/SWE_agent/agent.py
index 80e443f98b..808063eaff 100644
--- a/agenthub/SWE_agent/agent.py
+++ b/agenthub/SWE_agent/agent.py
@@ -1,23 +1,23 @@
 from typing import List
-from opendevin.agent import Agent
-from opendevin.llm.llm import LLM
-from opendevin.state import State
+
 from opendevin.action import (
     Action,
     AgentThinkAction,
     FileReadAction,
     FileWriteAction,
 )
+from opendevin.agent import Agent
+from opendevin.llm.llm import LLM
 from opendevin.observation import Observation
+from opendevin.state import State
 
 from .parser import parse_command
-
 from .prompts import (
-    SYSTEM_MESSAGE,
-    STEP_PROMPT,
+    CONTEXT_PROMPT,
     MEMORY_FORMAT,
     NO_ACTION,
-    CONTEXT_PROMPT
+    STEP_PROMPT,
+    SYSTEM_MESSAGE,
 )
 
 
diff --git a/agenthub/SWE_agent/parser.py b/agenthub/SWE_agent/parser.py
index 3a8ca180b5..d4600291ef 100644
--- a/agenthub/SWE_agent/parser.py
+++ b/agenthub/SWE_agent/parser.py
@@ -1,17 +1,17 @@
+import re
+
 from opendevin.action import (
     Action,
+    AgentEchoAction,
     AgentFinishAction,
+    AgentThinkAction,
+    BrowseURLAction,
     CmdRunAction,
     FileReadAction,
     FileWriteAction,
-    BrowseURLAction,
-    AgentEchoAction,
-    AgentThinkAction,
 )
 
-import re
-
-from .prompts import CUSTOM_DOCS, COMMAND_USAGE
+from .prompts import COMMAND_USAGE, CUSTOM_DOCS
 
 # commands: exit, read, write, browse, kill, search_file, search_dir
 
diff --git a/agenthub/__init__.py b/agenthub/__init__.py
index c39cfa3b6f..8c7c4027e0 100644
--- a/agenthub/__init__.py
+++ b/agenthub/__init__.py
@@ -1,20 +1,27 @@
-from .micro.registry import all_microagents
-from .micro.agent import MicroAgent
+from dotenv import load_dotenv
+
 from opendevin.agent import Agent
 
-from dotenv import load_dotenv
+from .micro.agent import MicroAgent
+from .micro.registry import all_microagents
+
 load_dotenv()
 
 
-# Import agents after environment variables are loaded
-from . import monologue_agent  # noqa: E402
-from . import codeact_agent    # noqa: E402
-from . import planner_agent    # noqa: E402
-from . import SWE_agent        # noqa: E402
-from . import delegator_agent  # noqa: E402
+
+from . import (  # noqa: E402
+    SWE_agent,
+    codeact_agent,
+    delegator_agent,
+    dummy_agent,
+    monologue_agent,
+    planner_agent,
+)
 
 __all__ = ['monologue_agent', 'codeact_agent',
-           'planner_agent', 'SWE_agent', 'delegator_agent']
+           'planner_agent', 'SWE_agent',
+           'delegator_agent',
+           'dummy_agent']
 
 for agent in all_microagents.values():
     name = agent['name']
diff --git a/agenthub/codeact_agent/__init__.py b/agenthub/codeact_agent/__init__.py
index a07c920950..c8d08d364d 100644
--- a/agenthub/codeact_agent/__init__.py
+++ b/agenthub/codeact_agent/__init__.py
@@ -1,4 +1,5 @@
 from opendevin.agent import Agent
+
 from .codeact_agent import CodeActAgent
 
 Agent.register('CodeActAgent', CodeActAgent)
diff --git a/agenthub/codeact_agent/codeact_agent.py b/agenthub/codeact_agent/codeact_agent.py
index 9ff503f3bd..3154db5daf 100644
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -1,54 +1,37 @@
 import re
 from typing import List, Mapping
 
+from agenthub.codeact_agent.prompt import EXAMPLES, SYSTEM_MESSAGE
 from opendevin.action import (
     Action,
     AgentEchoAction,
     AgentFinishAction,
+    AgentTalkAction,
     CmdRunAction,
+    IPythonRunCellAction,
+    NullAction,
 )
 from opendevin.agent import Agent
 from opendevin.llm.llm import LLM
 from opendevin.observation import (
     AgentMessageObservation,
     CmdOutputObservation,
+    IPythonRunCellObservation,
+    UserMessageObservation,
+)
+from opendevin.sandbox.plugins import (
+    JupyterRequirement,
+    PluginRequirement,
+    SWEAgentCommandsRequirement,
 )
 from opendevin.state import State
-from opendevin.sandbox.plugins import PluginRequirement, JupyterRequirement
-
-SYSTEM_MESSAGE = """You are a helpful assistant. You will be provided access (as root) to a bash shell to complete user-provided tasks.
-You will be able to execute commands in the bash shell, interact with the file system, install packages, and receive the output of your commands.
-
-DO NOT provide code in ```triple backticks```. Instead, you should execute bash command on behalf of the user by wrapping them with <execute> and </execute>.
-For example:
-
-You can list the files in the current directory by executing the following command:
-<execute>ls</execute>
-
-You can also install packages using pip:
-<execute> pip install numpy </execute>
-
-You can also write a block of code to a file:
-<execute>
-echo "import math
-print(math.pi)" > math.py
-</execute>
-
-When you are done, execute the following to close the shell and end the conversation:
-<execute>exit</execute>
-"""
-
-INVALID_INPUT_MESSAGE = (
-    "I don't understand your input. \n"
-    'If you want to execute command, please use <execute> YOUR_COMMAND_HERE </execute>.\n'
-    'If you already completed the task, please exit the shell by generating: <execute> exit </execute>.'
-)
 
 
 def parse_response(response) -> str:
     action = response.choices[0].message.content
-    if '<execute>' in action and '</execute>' not in action:
-        action += '</execute>'
+    for lang in ['bash', 'ipython']:
+        if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
+            action += f'</execute_{lang}>'
     return action
 
 
@@ -58,7 +41,20 @@ class CodeActAgent(Agent):
     The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
     """
 
-    sandbox_plugins: List[PluginRequirement] = [JupyterRequirement()]
+    sandbox_plugins: List[PluginRequirement] = [JupyterRequirement(), SWEAgentCommandsRequirement()]
+    SUPPORTED_ACTIONS = (
+        CmdRunAction,
+        IPythonRunCellAction,
+        AgentEchoAction,
+        AgentTalkAction,
+        NullAction
+    )
+    SUPPORTED_OBSERVATIONS = (
+        AgentMessageObservation,
+        UserMessageObservation,
+        CmdOutputObservation,
+        IPythonRunCellObservation
+    )
 
     def __init__(
         self,
@@ -93,56 +89,82 @@ class CodeActAgent(Agent):
             assert state.plan.main_goal, 'Expecting instruction to be set'
             self.messages = [
                 {'role': 'system', 'content': SYSTEM_MESSAGE},
-                {'role': 'user', 'content': state.plan.main_goal},
+                {
+                    'role': 'user',
+                    'content': (
+                        f'Here is an example of how you can interact with the environment for task solving:\n{EXAMPLES}\n\n'
+                        f"NOW, LET'S START!\n\n{state.plan.main_goal}"
+                    )
+                },
             ]
         updated_info = state.updated_info
         if updated_info:
             for prev_action, obs in updated_info:
                 assert isinstance(
-                    prev_action, (CmdRunAction, AgentEchoAction)
-                ), 'Expecting CmdRunAction or AgentEchoAction for Action'
-                if isinstance(
-                    obs, AgentMessageObservation
-                ):  # warning message from itself
+                    prev_action, self.SUPPORTED_ACTIONS
+                ), f'{prev_action.__class__} is not supported (supported: {self.SUPPORTED_ACTIONS})'
+                # prev_action is already added to self.messages when returned
+
+                # handle observations
+                assert isinstance(
+                    obs, self.SUPPORTED_OBSERVATIONS
+                ), f'{obs.__class__} is not supported (supported: {self.SUPPORTED_OBSERVATIONS})'
+                if isinstance(obs, (AgentMessageObservation, UserMessageObservation)):
                     self.messages.append(
                         {'role': 'user', 'content': obs.content})
+
+                    # User wants to exit
+                    if obs.content.strip() == '/exit':
+                        return AgentFinishAction()
                 elif isinstance(obs, CmdOutputObservation):
                     content = 'OBSERVATION:\n' + obs.content
                     content += f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]]'
                     self.messages.append({'role': 'user', 'content': content})
+                elif isinstance(obs, IPythonRunCellObservation):
+                    content = 'OBSERVATION:\n' + obs.content
+                    # replace base64 images with a placeholder
+                    splited = content.split('\n')
+                    for i, line in enumerate(splited):
+                        if '![image](data:image/png;base64,' in line:
+                            splited[i] = '![image](data:image/png;base64, ...) already displayed to user'
+                    content = '\n'.join(splited)
+                    self.messages.append({'role': 'user', 'content': content})
                 else:
                     raise NotImplementedError(
                         f'Unknown observation type: {obs.__class__}'
                     )
+
         response = self.llm.completion(
             messages=self.messages,
-            stop=['</execute>'],
+            stop=[
+                '</execute_ipython>',
+                '</execute_bash>',
+            ],
             temperature=0.0
         )
         action_str: str = parse_response(response)
-        state.num_of_chars += sum(len(message['content'])
-                                  for message in self.messages) + len(action_str)
+        state.num_of_chars += sum(
+            len(message['content']) for message in self.messages
+        ) + len(action_str)
         self.messages.append({'role': 'assistant', 'content': action_str})
 
-        command = re.search(r'<execute>(.*)</execute>', action_str, re.DOTALL)
-        if command is not None:
+        if bash_command := re.search(r'<execute_bash>(.*)</execute_bash>', action_str, re.DOTALL):
+            # remove the command from the action string to get thought
+            thought = action_str.replace(bash_command.group(0), '').strip()
             # a command was found
-            command_group = command.group(1)
+            command_group = bash_command.group(1).strip()
             if command_group.strip() == 'exit':
                 return AgentFinishAction()
-            return CmdRunAction(command=command_group)
-            # # execute the code
-            # # TODO: does exit_code get loaded into Message?
-            # exit_code, observation = self.env.execute(command_group)
-            # self._history.append(Message(Role.ASSISTANT, observation))
+            return CmdRunAction(command=command_group, thought=thought)
+        elif python_code := re.search(r'<execute_ipython>(.*)</execute_ipython>', action_str, re.DOTALL):
+            # a code block was found
+            code_group = python_code.group(1).strip()
+            thought = action_str.replace(python_code.group(0), '').strip()
+            return IPythonRunCellAction(code=code_group, thought=thought)
         else:
-            # we could provide a error message for the model to continue similar to
-            # https://github.com/xingyaoww/mint-bench/blob/main/mint/envs/general_env.py#L18-L23
-            # observation = INVALID_INPUT_MESSAGE
-            # self._history.append(Message(Role.ASSISTANT, observation))
-            return AgentEchoAction(
-                content=INVALID_INPUT_MESSAGE
-            )  # warning message to itself
+            # We assume the LLM is GOOD enough that when it returns pure natural language
+            # it wants to talk to the user
+            return AgentTalkAction(content=action_str)
 
     def search_memory(self, query: str) -> List[str]:
         raise NotImplementedError('Implement this abstract method')
diff --git a/agenthub/codeact_agent/prompt.py b/agenthub/codeact_agent/prompt.py
new file mode 100644
index 0000000000..b504a12951
--- /dev/null
+++ b/agenthub/codeact_agent/prompt.py
@@ -0,0 +1,226 @@
+from opendevin.sandbox.plugins import SWEAgentCommandsRequirement
+
+# Keep every documentation line except those that start with 'submit'.
+_SWEAGENT_BASH_DOCS = '\n'.join(
+    doc_line
+    for doc_line in SWEAgentCommandsRequirement.documentation.split('\n')
+    if not doc_line.startswith('submit')
+)
+# _SWEAGENT_BASH_DOCS content below:
+"""
+open <path> [<line_number>] - opens the file at the given path in the editor. If line_number is provided, the window will be move to include that line
+goto <line_number> - moves the window to show <line_number>
+scroll_down - moves the window down {WINDOW} lines
+scroll_up - moves the window down {WINDOW} lines
+create <filename> - creates and opens a new file with the given name
+search_dir <search_term> [<dir>] - searches for search_term in all files in dir. If dir is not provided, searches in the current directory
+search_file <search_term> [<file>] - searches for search_term in file. If file is not provided, searches in the current open file
+find_file <file_name> [<dir>] - finds all files with the given name in dir. If dir is not provided, searches in the current directory
+edit <start_line>:<end_line>
+<replacement_text>
+end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+"""
+
+_COMMAND_DOCS = (
+    '\nApart from the standard bash commands, you can also use the following special commands in <execute_bash> environment:\n'
+    f'{_SWEAGENT_BASH_DOCS}'
+    "Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
+)
+
+SYSTEM_MESSAGE = f"""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
+The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
+The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
+
+{_COMMAND_DOCS}
+
+Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+The assistant's response should be concise, but do express their thoughts.
+Try to include one of <execute_ipython> or <execute_bash> in each of your responses, unless it is a direct answer to a question OR a message to the user.
+"""
+
+EXAMPLES = """
+--- START OF EXAMPLE ---
+
+USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+
+ASSISTANT:
+Sure! Let me write the Python code for starting a web server and save it to a file `app.py`:
+<execute_ipython>
+CODE='''
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+'''
+with open('app.py', 'w') as f:
+    f.write(CODE)
+</execute_ipython>
+
+USER:
+Observation:
+[Code executed successfully with no output]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+Observation:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+Observation:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_bash>
+open app.py
+</execute_bash>
+
+USER:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return str(numbers)
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. Let me do that for you:
+
+<execute_bash>
+edit 8:8 <<EOF
+    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+EOF
+</execute_bash>
+
+USER:
+Observation:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+
+ASSISTANT:
+The file has been updated. Let me run the Python file again with the new changes:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Feel free to let me know if you have any further requests!
+
+--- END OF EXAMPLE ---
+"""
+
+INVALID_INPUT_MESSAGE = (
+    "I don't understand your input. \n"
+    'If you want to execute a bash command, please use <execute_bash> YOUR_COMMAND_HERE </execute_bash>.\n'
+    'If you want to execute a block of Python code, please use <execute_ipython> YOUR_COMMAND_HERE </execute_ipython>.\n'
+)
diff --git a/agenthub/delegator_agent/__init__.py b/agenthub/delegator_agent/__init__.py
index ffc2af8b24..d25d295e3a 100644
--- a/agenthub/delegator_agent/__init__.py
+++ b/agenthub/delegator_agent/__init__.py
@@ -1,4 +1,5 @@
 from opendevin.agent import Agent
+
 from .agent import DelegatorAgent
 
 Agent.register('DelegatorAgent', DelegatorAgent)
diff --git a/agenthub/delegator_agent/agent.py b/agenthub/delegator_agent/agent.py
index 8481d17d31..57ecd7bcd4 100644
--- a/agenthub/delegator_agent/agent.py
+++ b/agenthub/delegator_agent/agent.py
@@ -1,11 +1,10 @@
 from typing import List
 
+from opendevin.action import Action, AgentDelegateAction, AgentFinishAction
 from opendevin.agent import Agent
-from opendevin.action import AgentFinishAction, AgentDelegateAction
-from opendevin.observation import AgentDelegateObservation
 from opendevin.llm.llm import LLM
+from opendevin.observation import AgentDelegateObservation
 from opendevin.state import State
-from opendevin.action import Action
 
 
 class DelegatorAgent(Agent):
diff --git a/agenthub/dummy_agent/__init__.py b/agenthub/dummy_agent/__init__.py
index e69de29bb2..1c8698ccd1 100644
--- a/agenthub/dummy_agent/__init__.py
+++ b/agenthub/dummy_agent/__init__.py
@@ -0,0 +1,5 @@
+from opendevin.agent import Agent
+
+from .agent import DummyAgent
+
+Agent.register('DummyAgent', DummyAgent)
diff --git a/agenthub/dummy_agent/agent.py b/agenthub/dummy_agent/agent.py
index f1cd85cb3c..64fe3bfe91 100644
--- a/agenthub/dummy_agent/agent.py
+++ b/agenthub/dummy_agent/agent.py
@@ -1,21 +1,118 @@
-"""Module for a Dummy agent."""
+import time
+from typing import List, TypedDict
 
-from opendevin.action.base import NullAction
-from opendevin.state import State
-from opendevin.action import Action
-from typing import List
+from opendevin.action import (
+    Action,
+    AddTaskAction,
+    AgentFinishAction,
+    AgentRecallAction,
+    AgentThinkAction,
+    BrowseURLAction,
+    CmdRunAction,
+    FileReadAction,
+    FileWriteAction,
+    ModifyTaskAction,
+)
 from opendevin.agent import Agent
-from opendevin.controller.agent_controller import AgentController
-from opendevin.observation.base import NullObservation, Observation
+from opendevin.llm.llm import LLM
+from opendevin.observation import (
+    AgentRecallObservation,
+    CmdOutputObservation,
+    FileReadObservation,
+    FileWriteObservation,
+    NullObservation,
+    Observation,
+)
+from opendevin.state import State
+
+"""
+FIXME: There are a few problems this surfaced
+* FileWrites seem to add an unintended newline at the end of the file
+* command_id is sometimes a number, sometimes a string
+* Why isn't the output of the background command split between two steps?
+* Browser not working
+"""
+
+ActionObs = TypedDict('ActionObs', {'action': Action, 'observations': List[Observation]})
+
+BACKGROUND_CMD = 'echo "This is in the background" && sleep .1 && echo "This too"'
+
 
 class DummyAgent(Agent):
-    """A dummy agent that does nothing but can be used in testing."""
+    """
+    The DummyAgent is used for e2e testing. It just sends the same set of actions deterministically,
+    without making any LLM calls.
+    """
 
-    async def run(self, controller: AgentController) -> Observation:
-        return NullObservation('')
+    def __init__(self, llm: LLM):
+        super().__init__(llm)
+        self.steps: List[ActionObs] = [{
+            'action': AddTaskAction(parent='0', goal='check the current directory'),
+            'observations': [NullObservation('')],
+        }, {
+            'action': AddTaskAction(parent='0.0', goal='run ls'),
+            'observations': [NullObservation('')],
+        }, {
+            'action': ModifyTaskAction(id='0.0', state='in_progress'),
+            'observations': [NullObservation('')],
+        }, {
+            'action': AgentThinkAction(thought='Time to get started!'),
+            'observations': [NullObservation('')],
+        }, {
+            'action': CmdRunAction(command='echo "foo"'),
+            'observations': [CmdOutputObservation('foo', command_id=-1, command='echo "foo"')],
+        }, {
+            'action': FileWriteAction(content='echo "Hello, World!"', path='hello.sh'),
+            'observations': [FileWriteObservation('', path='hello.sh')],
+        }, {
+            'action': FileReadAction(path='hello.sh'),
+            'observations': [FileReadObservation('echo "Hello, World!"\n', path='hello.sh')],
+        }, {
+            'action': CmdRunAction(command='bash hello.sh'),
+            'observations': [CmdOutputObservation('Hello, World!', command_id=-1, command='bash hello.sh')],
+        }, {
+            'action': CmdRunAction(command=BACKGROUND_CMD, background=True),
+            'observations': [
+                CmdOutputObservation('Background command started. To stop it, send a `kill` action with id 42', command_id='42', command=BACKGROUND_CMD),  # type: ignore[arg-type]
+                CmdOutputObservation('This is in the background\nThis too\n', command_id='42', command=BACKGROUND_CMD),  # type: ignore[arg-type]
+            ]
+        }, {
+            'action': AgentRecallAction(query='who am I?'),
+            'observations': [
+                AgentRecallObservation('', memories=['I am a computer.']),
+                # CmdOutputObservation('This too\n', command_id='42', command=BACKGROUND_CMD),
+            ],
+        }, {
+            'action': BrowseURLAction(url='https://google.com'),
+            'observations': [
+                # BrowserOutputObservation('<html></html>', url='https://google.com', screenshot=""),
+            ],
+        }, {
+            'action': AgentFinishAction(),
+            'observations': [],
+        }]
 
     def step(self, state: State) -> Action:
-        return NullAction('')
+        """Return the scripted action for this iteration, after checking that the
+        previous action produced the observations the script expects."""
+        time.sleep(0.1)
+        if state.iteration > 0:
+            expected_observations = self.steps[state.iteration - 1]['observations']
+            hist_start = len(state.history) - len(expected_observations)
+            assert hist_start >= 0, 'History has fewer entries than the expected observations'
+            for i, expected in enumerate(expected_observations):
+                hist_obs = state.history[hist_start + i][1].to_dict()
+                expected_obs = expected.to_dict()
+                for obs in (hist_obs, expected_obs):
+                    # a command_id other than -1 is ignored together with the content
+                    if obs['extras'].get('command_id', -1) != -1:
+                        del obs['extras']['command_id']
+                        obs['content'] = ''
+                if hist_obs != expected_obs:
+                    print('\nactual', hist_obs)
+                    print('\nexpect', expected_obs)
+                assert hist_obs == expected_obs, f'Expected observation {expected_obs}, got {hist_obs}'
+        return self.steps[state.iteration]['action']
 
     def search_memory(self, query: str) -> List[str]:
-        return []
+        return ['I am a computer.']
diff --git a/agenthub/micro/README.md b/agenthub/micro/README.md
new file mode 100644
index 0000000000..68be0a8a9a
--- /dev/null
+++ b/agenthub/micro/README.md
@@ -0,0 +1,14 @@
+## Introduction
+
+This package contains definitions of micro-agents. A micro-agent is defined
+in the following structure:
+
+```
+[AgentName]
+├── agent.yaml
+└── prompt.md
+```
+
+Note that `prompt.md` can use Jinja2 template syntax. At runtime, `prompt.md`
+is loaded and rendered, and used together with `agent.yaml` to initialize a
+micro-agent.
diff --git a/agenthub/micro/agent.py b/agenthub/micro/agent.py
index 4445221d25..a0f6de3795 100644
--- a/agenthub/micro/agent.py
+++ b/agenthub/micro/agent.py
@@ -1,13 +1,13 @@
 import json
-from typing import List, Dict
+from typing import Dict, List
 
-from jinja2 import Environment, BaseLoader
+from jinja2 import BaseLoader, Environment
 
+from opendevin.action import Action, action_from_dict
 from opendevin.agent import Agent
+from opendevin.exceptions import LLMOutputError
 from opendevin.llm.llm import LLM
 from opendevin.state import State
-from opendevin.action import Action, action_from_dict
-from opendevin.exceptions import LLMOutputError
 
 from .instructions import instructions
 from .registry import all_microagents
diff --git a/agenthub/micro/coder/prompt.md b/agenthub/micro/coder/prompt.md
index bfdb3d8706..3e6028c944 100644
--- a/agenthub/micro/coder/prompt.md
+++ b/agenthub/micro/coder/prompt.md
@@ -4,9 +4,11 @@ need to modify to complete this task:
 
 {{ state.plan.main_goal }}
 
+{% if state.inputs.summary %}
 Here's a summary of the codebase, as it relates to this task:
 
 {{ state.inputs.summary }}
+{% endif %}
 
 ## Available Actions
 {{ instructions.actions.run }}
diff --git a/agenthub/micro/instructions.py b/agenthub/micro/instructions.py
index 856c78024a..022e08511b 100644
--- a/agenthub/micro/instructions.py
+++ b/agenthub/micro/instructions.py
@@ -1,5 +1,5 @@
-from typing import Dict
 import os
+from typing import Dict
 
 instructions: Dict = {}
 
diff --git a/agenthub/micro/registry.py b/agenthub/micro/registry.py
index fd52cc8a7a..2fc4060dc4 100644
--- a/agenthub/micro/registry.py
+++ b/agenthub/micro/registry.py
@@ -1,4 +1,5 @@
 import os
+
 import yaml
 
 all_microagents = {}
diff --git a/agenthub/monologue_agent/__init__.py b/agenthub/monologue_agent/__init__.py
index 1cfef46f11..b60cb48bb5 100644
--- a/agenthub/monologue_agent/__init__.py
+++ b/agenthub/monologue_agent/__init__.py
@@ -1,4 +1,5 @@
 from opendevin.agent import Agent
+
 from .agent import MonologueAgent
 
 Agent.register('MonologueAgent', MonologueAgent)
diff --git a/agenthub/monologue_agent/agent.py b/agenthub/monologue_agent/agent.py
index 2be6c26039..59c5510d24 100644
--- a/agenthub/monologue_agent/agent.py
+++ b/agenthub/monologue_agent/agent.py
@@ -1,35 +1,34 @@
 from typing import List
-from opendevin.agent import Agent
-from opendevin.state import State
-from opendevin.llm.llm import LLM
-from opendevin.schema import ActionType
-from opendevin.exceptions import AgentNoInstructionError
-from opendevin.schema.config import ConfigType
-from opendevin import config
-
-from opendevin.action import (
-    Action,
-    NullAction,
-    CmdRunAction,
-    FileWriteAction,
-    FileReadAction,
-    AgentRecallAction,
-    BrowseURLAction,
-    GitHubPushAction,
-    AgentThinkAction,
-)
-
-from opendevin.observation import (
-    Observation,
-    NullObservation,
-    CmdOutputObservation,
-    FileReadObservation,
-    AgentRecallObservation,
-    BrowserOutputObservation,
-)
 
 import agenthub.monologue_agent.utils.prompts as prompts
 from agenthub.monologue_agent.utils.monologue import Monologue
+from opendevin import config
+from opendevin.action import (
+    Action,
+    AgentRecallAction,
+    AgentThinkAction,
+    BrowseURLAction,
+    CmdRunAction,
+    FileReadAction,
+    FileWriteAction,
+    GitHubPushAction,
+    NullAction,
+)
+from opendevin.agent import Agent
+from opendevin.exceptions import AgentNoInstructionError
+from opendevin.llm.llm import LLM
+from opendevin.observation import (
+    AgentRecallObservation,
+    BrowserOutputObservation,
+    CmdOutputObservation,
+    FileReadObservation,
+    NullObservation,
+    Observation,
+)
+from opendevin.schema import ActionType
+from opendevin.schema.config import ConfigType
+from opendevin.state import State
+
 if config.get(ConfigType.AGENT_MEMORY_ENABLED):
     from agenthub.monologue_agent.utils.memory import LongTermMemory
 
@@ -137,6 +136,7 @@ class MonologueAgent(Agent):
         Utilizes the INITIAL_THOUGHTS list to give the agent a context for it's capabilities
         and how to navigate the WORKSPACE_MOUNT_PATH_IN_SANDBOX in `config` (e.g., /workspace by default).
         Short circuited to return when already initialized.
+        Will execute again when called after reset.
 
         Parameters:
         - task (str): The initial goal statement provided by the user
@@ -157,6 +157,10 @@ class MonologueAgent(Agent):
         else:
             self.memory = None
 
+        self._add_initial_thoughts(task)
+        self._initialized = True
+
+    def _add_initial_thoughts(self, task):
         previous_action = ''
         for thought in INITIAL_THOUGHTS:
             thought = thought.replace('$TASK', task)
@@ -208,7 +212,6 @@ class MonologueAgent(Agent):
                 else:
                     action = AgentThinkAction(thought=thought)
                 self._add_event(action.to_memory())
-        self._initialized = True
 
     def step(self, state: State) -> Action:
         """
@@ -257,8 +260,6 @@ class MonologueAgent(Agent):
 
     def reset(self) -> None:
         super().reset()
-        self.monologue = Monologue()
-        if config.get(ConfigType.AGENT_MEMORY_ENABLED):
-            self.memory = LongTermMemory()
-        else:
-            self.memory = None
+
+        # Clear the flag so the initial monologue and memory are rebuilt on the next initialization
+        self._initialized = False
diff --git a/agenthub/monologue_agent/utils/json.py b/agenthub/monologue_agent/utils/json.py
index 455a42ee0b..13b7416086 100644
--- a/agenthub/monologue_agent/utils/json.py
+++ b/agenthub/monologue_agent/utils/json.py
@@ -1,4 +1,5 @@
 import json
+
 from json_repair import repair_json
 
 
diff --git a/agenthub/monologue_agent/utils/memory.py b/agenthub/monologue_agent/utils/memory.py
index 9500d78287..9f5433ff73 100644
--- a/agenthub/monologue_agent/utils/memory.py
+++ b/agenthub/monologue_agent/utils/memory.py
@@ -1,17 +1,22 @@
-import llama_index.embeddings.openai.base as llama_openai
 import threading
 
 import chromadb
-from llama_index.core import Document
+import llama_index.embeddings.openai.base as llama_openai
+from llama_index.core import Document, VectorStoreIndex
 from llama_index.core.retrievers import VectorIndexRetriever
-from llama_index.core import VectorStoreIndex
 from llama_index.vector_stores.chroma import ChromaVectorStore
-from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
-from openai._exceptions import APIConnectionError, RateLimitError, InternalServerError
+from openai._exceptions import APIConnectionError, InternalServerError, RateLimitError
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_random_exponential,
+)
 
 from opendevin import config
 from opendevin.logger import opendevin_logger as logger
 from opendevin.schema.config import ConfigType
+
 from . import json
 
 num_retries = config.get(ConfigType.LLM_NUM_RETRIES)
@@ -51,11 +56,12 @@ embedding_strategy = config.get(ConfigType.LLM_EMBEDDING_MODEL)
 
 # TODO: More embeddings: https://docs.llamaindex.ai/en/stable/examples/embeddings/OpenAI/
 # There's probably a more programmatic way to do this.
-if embedding_strategy == 'llama2':
+supported_ollama_embed_models = ['llama2', 'mxbai-embed-large', 'nomic-embed-text', 'all-minilm', 'stable-code']
+if embedding_strategy in supported_ollama_embed_models:
     from llama_index.embeddings.ollama import OllamaEmbedding
     embed_model = OllamaEmbedding(
-        model_name='llama2',
-        base_url=config.get(ConfigType.LLM_BASE_URL, required=True),
+        model_name=embedding_strategy,
+        base_url=config.get(ConfigType.LLM_EMBEDDING_BASE_URL, required=True),
         ollama_additional_kwargs={'mirostat': 0},
     )
 elif embedding_strategy == 'openai':
diff --git a/agenthub/monologue_agent/utils/monologue.py b/agenthub/monologue_agent/utils/monologue.py
index 410b49c17a..545498d7b5 100644
--- a/agenthub/monologue_agent/utils/monologue.py
+++ b/agenthub/monologue_agent/utils/monologue.py
@@ -1,8 +1,8 @@
 
-from opendevin.llm.llm import LLM
-from opendevin.exceptions import AgentEventTypeError
 import agenthub.monologue_agent.utils.json as json
 import agenthub.monologue_agent.utils.prompts as prompts
+from opendevin.exceptions import AgentEventTypeError
+from opendevin.llm.llm import LLM
 from opendevin.logger import opendevin_logger as logger
 
 
diff --git a/agenthub/monologue_agent/utils/prompts.py b/agenthub/monologue_agent/utils/prompts.py
index 4bf0e78931..8767aecaa3 100644
--- a/agenthub/monologue_agent/utils/prompts.py
+++ b/agenthub/monologue_agent/utils/prompts.py
@@ -1,21 +1,20 @@
+import re
+from json import JSONDecodeError
 from typing import List
 
-from . import json
-from json import JSONDecodeError
-
-import re
-
+from opendevin import config
 from opendevin.action import (
-    action_from_dict,
     Action,
+    action_from_dict,
 )
+from opendevin.exceptions import LLMOutputError
 from opendevin.observation import (
     CmdOutputObservation,
 )
-from opendevin.exceptions import LLMOutputError
-from opendevin import config
 from opendevin.schema.config import ConfigType
 
+from . import json
+
 ACTION_PROMPT = """
 You're a thoughtful robot. Your main task is this:
 %(task)s
diff --git a/agenthub/planner_agent/__init__.py b/agenthub/planner_agent/__init__.py
index 77bed3e686..d81ba6cc26 100644
--- a/agenthub/planner_agent/__init__.py
+++ b/agenthub/planner_agent/__init__.py
@@ -1,4 +1,5 @@
 from opendevin.agent import Agent
+
 from .agent import PlannerAgent
 
 Agent.register('PlannerAgent', PlannerAgent)
diff --git a/agenthub/planner_agent/agent.py b/agenthub/planner_agent/agent.py
index de0672ef57..44413dfeea 100644
--- a/agenthub/planner_agent/agent.py
+++ b/agenthub/planner_agent/agent.py
@@ -1,11 +1,11 @@
 from typing import List
-from .prompt import get_prompt, parse_response
 
+from opendevin.action import Action, AgentFinishAction
 from opendevin.agent import Agent
-from opendevin.action import AgentFinishAction
 from opendevin.llm.llm import LLM
 from opendevin.state import State
-from opendevin.action import Action
+
+from .prompt import get_prompt, parse_response
 
 
 class PlannerAgent(Agent):
diff --git a/agenthub/planner_agent/prompt.py b/agenthub/planner_agent/prompt.py
index 2b97b88348..234b958590 100644
--- a/agenthub/planner_agent/prompt.py
+++ b/agenthub/planner_agent/prompt.py
@@ -1,29 +1,29 @@
 import json
-from typing import List, Tuple, Dict, Type
-from opendevin.plan import Plan
-from opendevin.action import Action, action_from_dict
-from opendevin.observation import Observation
-from opendevin.schema import ActionType
-from opendevin.logger import opendevin_logger as logger
+from typing import Dict, List, Tuple, Type
 
 from opendevin.action import (
-    NullAction,
-    CmdRunAction,
-    CmdKillAction,
+    Action,
+    AddTaskAction,
+    AgentFinishAction,
+    AgentRecallAction,
+    AgentSummarizeAction,
+    AgentThinkAction,
     BrowseURLAction,
+    CmdKillAction,
+    CmdRunAction,
     FileReadAction,
     FileWriteAction,
-    AgentRecallAction,
-    AgentThinkAction,
-    AgentFinishAction,
-    AgentSummarizeAction,
-    AddTaskAction,
     ModifyTaskAction,
+    NullAction,
+    action_from_dict,
 )
-
+from opendevin.logger import opendevin_logger as logger
 from opendevin.observation import (
     NullObservation,
+    Observation,
 )
+from opendevin.plan import Plan
+from opendevin.schema import ActionType
 
 ACTION_TYPE_TO_CLASS: Dict[str, Type[Action]] = {
     ActionType.RUN: CmdRunAction,
diff --git a/containers/app/Dockerfile b/containers/app/Dockerfile
index 9d456a6271..be434419f0 100644
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -32,7 +32,8 @@ FROM python:3.12-slim as runtime
 
 WORKDIR /app
 
-ENV RUN_AS_DEVIN=false
+ENV RUN_AS_DEVIN=true
+ENV SANDBOX_USER_ID=1000
 ENV USE_HOST_NETWORK=false
 ENV SSH_HOSTNAME=host.docker.internal
 ENV WORKSPACE_BASE=/opt/workspace_base
@@ -40,13 +41,23 @@ ENV OPEN_DEVIN_BUILD_VERSION=$OPEN_DEVIN_BUILD_VERSION
 RUN mkdir -p $WORKSPACE_BASE
 
 RUN apt-get update -y \
-    && apt-get install -y curl ssh
+    && apt-get install -y curl ssh sudo
+
+RUN useradd -m -u $SANDBOX_USER_ID -s /bin/bash opendevin && \
+    usermod -aG sudo opendevin && \
+    echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN chown -R opendevin:opendevin /app
+USER opendevin
 
 ENV VIRTUAL_ENV=/app/.venv \
     PATH="/app/.venv/bin:$PATH" \
     PYTHONPATH='/app'
 
 COPY --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
+# change ownership of the virtual environment to the sandbox user
+USER root
+RUN chown -R opendevin:opendevin ${VIRTUAL_ENV}
+USER opendevin
 
 COPY ./opendevin ./opendevin
 COPY ./agenthub ./agenthub
@@ -55,4 +66,17 @@ RUN playwright install --with-deps chromium
 
 COPY --from=frontend-builder /app/dist ./frontend/dist
 
-CMD ["uvicorn", "opendevin.server.listen:app", "--host", "0.0.0.0", "--port", "3000"]
+USER root
+# change ownership of the app directory to the sandbox user
+RUN chown -R opendevin:opendevin /app
+# make group permissions the same as user permissions
+RUN chmod -R g=u /app
+USER opendevin
+
+COPY ./containers/app/entrypoint.sh /app/entrypoint.sh
+
+# run the script as root
+USER root
+RUN chown opendevin:opendevin /app/entrypoint.sh
+RUN chmod 755 /app/entrypoint.sh
+CMD ["/app/entrypoint.sh"]
diff --git a/containers/app/entrypoint.sh b/containers/app/entrypoint.sh
new file mode 100755
index 0000000000..843790b352
--- /dev/null
+++ b/containers/app/entrypoint.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Container entrypoint: aligns the opendevin user with the host user and the
+# Docker socket's group, then starts the server as that user.
+
+# check user is root
+if [ "$(id -u)" -ne 0 ]; then
+  echo "Please run as root" >&2
+  exit 1
+fi
+
+if [ -z "$SANDBOX_USER_ID" ]; then
+  echo "SANDBOX_USER_ID is not set" >&2
+  exit 1
+fi
+
+# change uid of opendevin user to match the host user
+# but the group id is not changed, so the user can still access everything under /app
+usermod -u "$SANDBOX_USER_ID" opendevin
+
+# get the user group of /var/run/docker.sock and set opendevin to that group
+DOCKER_SOCKET=/var/run/docker.sock
+if [ -e "$DOCKER_SOCKET" ]; then
+  DOCKER_SOCKET_GID=$(stat -c '%g' "$DOCKER_SOCKET")
+  echo "Docker socket group id: $DOCKER_SOCKET_GID"
+  # usermod -G only accepts groups that exist in the container, so create one
+  # for the socket's gid when no group has it yet
+  if ! getent group "$DOCKER_SOCKET_GID" > /dev/null; then
+    groupadd -g "$DOCKER_SOCKET_GID" docker
+  fi
+  usermod -aG "$DOCKER_SOCKET_GID" opendevin
+else
+  echo "Warning: $DOCKER_SOCKET not found; skipping docker group setup" >&2
+fi
+
+# switch to the user and start the server; exec so the server's exit status
+# becomes the container's and no idle shell stays in between
+exec su opendevin -c "cd /app && uvicorn opendevin.server.listen:app --host 0.0.0.0 --port 3000"
diff --git a/containers/sandbox/Dockerfile b/containers/sandbox/Dockerfile
index 5c0a2c9ea7..1f81645f99 100644
--- a/containers/sandbox/Dockerfile
+++ b/containers/sandbox/Dockerfile
@@ -27,3 +27,7 @@ RUN mkdir -p -m0755 /var/run/sshd
 
 # symlink python3 to python
 RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# install basic dependencies for CodeActAgent
+RUN pip3 install --upgrade pip
+RUN pip3 install jupyterlab notebook jupyter_kernel_gateway flake8
diff --git a/dev_config/python/ruff.toml b/dev_config/python/ruff.toml
index 06ba2d3ace..4efe096a30 100644
--- a/dev_config/python/ruff.toml
+++ b/dev_config/python/ruff.toml
@@ -7,6 +7,7 @@ select = [
     "E",
     "W",
     "F",
+    "I",
     "Q",
 ]
 
diff --git a/docs/modules/python/agenthub/dummy_agent/agent.md b/docs/modules/python/agenthub/dummy_agent/agent.md
index c783e7061f..e2738fb8b6 100644
--- a/docs/modules/python/agenthub/dummy_agent/agent.md
+++ b/docs/modules/python/agenthub/dummy_agent/agent.md
@@ -3,13 +3,12 @@ sidebar_label: agent
 title: agenthub.dummy_agent.agent
 ---
 
-Module for a Dummy agent.
-
 ## DummyAgent Objects
 
 ```python
 class DummyAgent(Agent)
 ```
 
-A dummy agent that does nothing but can be used in testing.
+The DummyAgent is used for e2e testing. It just sends the same set of actions deterministically,
+without making any LLM calls.
 
diff --git a/docs/modules/python/opendevin/observation/run.md b/docs/modules/python/opendevin/observation/run.md
index a311d40a7f..c0914c70ac 100644
--- a/docs/modules/python/opendevin/observation/run.md
+++ b/docs/modules/python/opendevin/observation/run.md
@@ -12,3 +12,12 @@ class CmdOutputObservation(Observation)
 
 This data class represents the output of a command.
 
+## IPythonRunCellObservation Objects
+
+```python
+@dataclass
+class IPythonRunCellObservation(Observation)
+```
+
+This data class represents the output of an IPythonRunCellAction.
+
diff --git a/docs/modules/python/opendevin/schema/action.md b/docs/modules/python/opendevin/schema/action.md
index 8046dc23e7..0c35c47186 100644
--- a/docs/modules/python/opendevin/schema/action.md
+++ b/docs/modules/python/opendevin/schema/action.md
@@ -13,9 +13,13 @@ class ActionTypeSchema(BaseModel)
 
 Initializes the agent. Only sent by client.
 
+#### USER\_MESSAGE
+
+Sends a message from the user. Only sent by the client.
+
 #### START
 
-Starts a new development task. Only sent by the client.
+Starts a new development task OR sends a chat message from the user. Only sent by the client.
 
 #### READ
 
@@ -29,6 +33,10 @@ Writes the content to a file.
 
 Runs a command.
 
+#### RUN\_IPYTHON
+
+Runs an IPython cell.
+
 #### KILL
 
 Kills a background command.
@@ -45,6 +53,10 @@ Searches long-term memory
 
 Allows the agent to make a plan, set a goal, or record thoughts
 
+#### TALK
+
+Allows the agent to respond to the user.
+
 #### DELEGATE
 
 Delegates a task to another agent.
diff --git a/docs/modules/python/opendevin/schema/observation.md b/docs/modules/python/opendevin/schema/observation.md
index db6a80ab83..c327e61d5f 100644
--- a/docs/modules/python/opendevin/schema/observation.md
+++ b/docs/modules/python/opendevin/schema/observation.md
@@ -21,6 +21,10 @@ The HTML content of a URL
 
 The output of a command
 
+#### RUN\_IPYTHON
+
+The output of an IPython cell.
+
 #### RECALL
 
 The result of a search
diff --git a/docs/modules/python/opendevin/schema/task.md b/docs/modules/python/opendevin/schema/task.md
index 4f21dc2559..ba1900bcce 100644
--- a/docs/modules/python/opendevin/schema/task.md
+++ b/docs/modules/python/opendevin/schema/task.md
@@ -17,6 +17,10 @@ Initial state of the task.
 
 The task is running.
 
+#### AWAITING\_USER\_INPUT
+
+The task is awaiting user input.
+
 #### PAUSED
 
 The task is paused.
diff --git a/evaluation/SWE-bench/notebooks/devin_eval_analysis.ipynb b/evaluation/SWE-bench/notebooks/devin_eval_analysis.ipynb
index b6633aac7c..3cc21c912a 100644
--- a/evaluation/SWE-bench/notebooks/devin_eval_analysis.ipynb
+++ b/evaluation/SWE-bench/notebooks/devin_eval_analysis.ipynb
@@ -11,12 +11,12 @@
    },
    "outputs": [],
    "source": [
-    "import requests\n",
+    "import matplotlib.pyplot as plt\n",
     "import pandas as pd\n",
-    "from tqdm import tqdm\n",
-    "from datasets import load_dataset\n",
+    "import requests\n",
     "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt"
+    "from datasets import load_dataset\n",
+    "from tqdm import tqdm"
    ]
   },
   {
diff --git a/evaluation/SWE-bench/scripts/prepare_devin_outputs_for_evaluation.py b/evaluation/SWE-bench/scripts/prepare_devin_outputs_for_evaluation.py
index 829e25faf2..428f1061ca 100644
--- a/evaluation/SWE-bench/scripts/prepare_devin_outputs_for_evaluation.py
+++ b/evaluation/SWE-bench/scripts/prepare_devin_outputs_for_evaluation.py
@@ -11,9 +11,10 @@ Outputs:
 '''
 
 # fetch devin's outputs into a json file for evaluation
+import json
 import os
 import sys
-import json
+
 import requests
 from tqdm import tqdm
 
diff --git a/evaluation/regression/cases/hello-world/test_hello_world.py b/evaluation/regression/cases/hello-world/test_hello_world.py
index ed33cb45f1..bb6110f934 100644
--- a/evaluation/regression/cases/hello-world/test_hello_world.py
+++ b/evaluation/regression/cases/hello-world/test_hello_world.py
@@ -1,4 +1,5 @@
 import os
+
 import pytest
 from conftest import agents
 
diff --git a/evaluation/regression/conftest.py b/evaluation/regression/conftest.py
index f1bf1f6443..977fc7f559 100644
--- a/evaluation/regression/conftest.py
+++ b/evaluation/regression/conftest.py
@@ -1,9 +1,10 @@
-import os
-import pytest
-import subprocess
-import logging
-import shutil
 import datetime
+import logging
+import os
+import shutil
+import subprocess
+
+import pytest
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 CASES_DIR = os.path.join(SCRIPT_DIR, 'cases')
diff --git a/evaluation/regression/run_tests.py b/evaluation/regression/run_tests.py
index bfb7185ed3..2887b44507 100644
--- a/evaluation/regression/run_tests.py
+++ b/evaluation/regression/run_tests.py
@@ -1,4 +1,5 @@
 import argparse
+
 import pytest
 
 from opendevin import config
diff --git a/frontend/package-lock.json b/frontend/package-lock.json
index 231da53a4b..e4fe6f1161 100644
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -26,6 +26,7 @@
         "react": "^18.2.0",
         "react-accessible-treeview": "^2.8.3",
         "react-dom": "^18.2.0",
+        "react-highlight": "^0.15.0",
         "react-hot-toast": "^2.4.1",
         "react-i18next": "^14.1.0",
         "react-icons": "^5.0.1",
@@ -44,6 +45,7 @@
         "@types/node": "^18.0.0 ",
         "@types/react": "^18.2.66",
         "@types/react-dom": "^18.2.22",
+        "@types/react-highlight": "^0.12.8",
         "@types/react-syntax-highlighter": "^15.5.11",
         "@typescript-eslint/eslint-plugin": "^7.4.0",
         "@typescript-eslint/parser": "^7.0.0",
@@ -4878,6 +4880,15 @@
         "@types/react": "*"
       }
     },
+    "node_modules/@types/react-highlight": {
+      "version": "0.12.8",
+      "resolved": "https://registry.npmjs.org/@types/react-highlight/-/react-highlight-0.12.8.tgz",
+      "integrity": "sha512-V7O7zwXUw8WSPd//YUO8sz489J/EeobJljASGhP0rClrvq+1Y1qWEpToGu+Pp7YuChxhAXSgkLkrOYpZX5A62g==",
+      "dev": true,
+      "dependencies": {
+        "@types/react": "*"
+      }
+    },
     "node_modules/@types/react-syntax-highlighter": {
       "version": "15.5.11",
       "resolved": "https://registry.npmjs.org/@types/react-syntax-highlighter/-/react-syntax-highlighter-15.5.11.tgz",
@@ -12799,6 +12810,14 @@
         "react": "^18.2.0"
       }
     },
+    "node_modules/react-highlight": {
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/react-highlight/-/react-highlight-0.15.0.tgz",
+      "integrity": "sha512-5uV/b/N4Z421GSVVe05fz+OfTsJtFzx/fJBdafZyw4LS70XjIZwgEx3Lrkfc01W/RzZ2Dtfb0DApoaJFAIKBtA==",
+      "dependencies": {
+        "highlight.js": "^10.5.0"
+      }
+    },
     "node_modules/react-hot-toast": {
       "version": "2.4.1",
       "resolved": "https://registry.npmjs.org/react-hot-toast/-/react-hot-toast-2.4.1.tgz",
diff --git a/frontend/package.json b/frontend/package.json
index 154310d5a8..56f2d04f15 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -25,6 +25,7 @@
     "react": "^18.2.0",
     "react-accessible-treeview": "^2.8.3",
     "react-dom": "^18.2.0",
+    "react-highlight": "^0.15.0",
     "react-hot-toast": "^2.4.1",
     "react-i18next": "^14.1.0",
     "react-icons": "^5.0.1",
@@ -64,6 +65,7 @@
     "@types/node": "^18.0.0 ",
     "@types/react": "^18.2.66",
     "@types/react-dom": "^18.2.22",
+    "@types/react-highlight": "^0.12.8",
     "@types/react-syntax-highlighter": "^15.5.11",
     "@typescript-eslint/eslint-plugin": "^7.4.0",
     "@typescript-eslint/parser": "^7.0.0",
diff --git a/frontend/src/components/AgentStatusBar.tsx b/frontend/src/components/AgentStatusBar.tsx
index f1ac56aa7b..253164b64e 100644
--- a/frontend/src/components/AgentStatusBar.tsx
+++ b/frontend/src/components/AgentStatusBar.tsx
@@ -15,6 +15,10 @@ const AgentStatusMap: { [k: string]: { message: string; indicator: string } } =
       message: "Agent is running task...",
       indicator: "bg-green-500",
     },
+    [AgentTaskState.AWAITING_USER_INPUT]: {
+      message: "Agent is awaiting user input...",
+      indicator: "bg-orange-500",
+    },
     [AgentTaskState.PAUSED]: {
       message: "Agent has paused.",
       indicator: "bg-yellow-500",
diff --git a/frontend/src/components/Browser.tsx b/frontend/src/components/Browser.tsx
index f56f7b4d26..d3fe8ad28c 100644
--- a/frontend/src/components/Browser.tsx
+++ b/frontend/src/components/Browser.tsx
@@ -1,25 +1,13 @@
 import React from "react";
+import { useTranslation } from "react-i18next";
+import { IoIosGlobe } from "react-icons/io";
 import { useSelector } from "react-redux";
-import { HiOutlineMagnifyingGlass } from "react-icons/hi2";
-import { HiCursorClick } from "react-icons/hi";
+import { I18nKey } from "#/i18n/declaration";
 import { RootState } from "#/store";
 
-import logo from "../assets/logo.png";
-
-function BlankPage(): JSX.Element {
-  return (
-    <div className="h-full bg-slate-200 flex flex-col items-center justify-center">
-      <img src={logo} alt="Blank Page" className="w-28 h-28" />
-      <div className="h-8 flex items-center bg-slate-900 px-2 rounded-3xl ml-3 space-x-2">
-        <HiOutlineMagnifyingGlass size={20} />
-        <span>OpenDevin: Code Less, Make More.</span>
-        <HiCursorClick size={20} />
-      </div>
-    </div>
-  );
-}
-
 function Browser(): JSX.Element {
+  const { t } = useTranslation();
+
   const { url, screenshotSrc } = useSelector(
     (state: RootState) => state.browser,
   );
@@ -30,15 +18,18 @@ function Browser(): JSX.Element {
       : `data:image/png;base64,${screenshotSrc || ""}`;
 
   return (
-    <div className="h-full w-full flex flex-col justify-evenly p-2 space-y-2">
-      <div className="w-full py-2 px-5 rounded-3xl bg-neutral-700 text-gray-200 truncate">
+    <div className="h-full w-full flex flex-col text-neutral-400">
+      <div className="w-full p-2 truncate border-b border-neutral-600">
         {url}
       </div>
-      <div className="overflow-y-auto h-4/5 scrollbar-hide rounded-xl">
+      <div className="overflow-y-auto grow scrollbar-hide rounded-xl">
         {screenshotSrc ? (
           <img src={imgSrc} className="rounded-xl" alt="Browser Screenshot" />
         ) : (
-          <BlankPage />
+          <div className="flex flex-col items-center h-full justify-center">
+            <IoIosGlobe size={100} />
+            {t(I18nKey.BROWSER$EMPTY_MESSAGE)}
+          </div>
         )}
       </div>
     </div>
diff --git a/frontend/src/components/ChatInterface.tsx b/frontend/src/components/ChatInterface.tsx
index e113244f5c..982ac99cb1 100644
--- a/frontend/src/components/ChatInterface.tsx
+++ b/frontend/src/components/ChatInterface.tsx
@@ -3,6 +3,7 @@ import { IoMdChatbubbles } from "react-icons/io";
 import Markdown from "react-markdown";
 import { useSelector } from "react-redux";
 import { useTypingEffect } from "#/hooks/useTypingEffect";
+import AgentTaskState from "../types/AgentTaskState";
 import {
   addAssistantMessageToChat,
   sendChatMessage,
@@ -117,6 +118,12 @@ function MessageList(): JSX.Element {
 
 function ChatInterface(): JSX.Element {
   const { initialized } = useSelector((state: RootState) => state.task);
+  const { curTaskState } = useSelector((state: RootState) => state.agent);
+
+  const onUserMessage = (msg: string) => {
+    const isNewTask = curTaskState === AgentTaskState.INIT;
+    sendChatMessage(msg, isNewTask);
+  };
 
   return (
     <div className="flex flex-col h-full p-0 bg-neutral-800">
@@ -125,7 +132,7 @@ function ChatInterface(): JSX.Element {
         Chat
       </div>
       <MessageList />
-      <ChatInput disabled={!initialized} onSendMessage={sendChatMessage} />
+      <ChatInput disabled={!initialized} onSendMessage={onUserMessage} />
     </div>
   );
 }
diff --git a/frontend/src/components/CodeEditor.tsx b/frontend/src/components/CodeEditor.tsx
index 17286b5cf4..8190c8c580 100644
--- a/frontend/src/components/CodeEditor.tsx
+++ b/frontend/src/components/CodeEditor.tsx
@@ -2,13 +2,17 @@ import Editor, { Monaco } from "@monaco-editor/react";
 import { Tab, Tabs } from "@nextui-org/react";
 import type { editor } from "monaco-editor";
 import React, { useState } from "react";
+import { useTranslation } from "react-i18next";
+import { VscCode } from "react-icons/vsc";
 import { useDispatch, useSelector } from "react-redux";
+import { I18nKey } from "#/i18n/declaration";
 import { selectFile } from "#/services/fileService";
 import { setCode } from "#/state/codeSlice";
 import { RootState } from "#/store";
 import FileExplorer from "./file-explorer/FileExplorer";
 
 function CodeEditor(): JSX.Element {
+  const { t } = useTranslation();
   const [selectedFileName, setSelectedFileName] = useState("");
 
   const dispatch = useDispatch();
@@ -64,14 +68,21 @@ function CodeEditor(): JSX.Element {
             title={selectedFileName}
           />
         </Tabs>
-        <div className="flex grow">
-          <Editor
-            height="100%"
-            path={selectedFileName.toLocaleLowerCase()}
-            defaultValue=""
-            value={code}
-            onMount={handleEditorDidMount}
-          />
+        <div className="flex grow items-center justify-center">
+          {selectedFileName === "" ? (
+            <div className="flex flex-col items-center text-neutral-400">
+              <VscCode size={100} />
+              {t(I18nKey.CODE_EDITOR$EMPTY_MESSAGE)}
+            </div>
+          ) : (
+            <Editor
+              height="100%"
+              path={selectedFileName.toLocaleLowerCase()}
+              defaultValue=""
+              value={code}
+              onMount={handleEditorDidMount}
+            />
+          )}
         </div>
       </div>
     </div>
diff --git a/frontend/src/components/Jupyter.tsx b/frontend/src/components/Jupyter.tsx
new file mode 100644
index 0000000000..22b27e80c0
--- /dev/null
+++ b/frontend/src/components/Jupyter.tsx
@@ -0,0 +1,77 @@
+import React from "react";
+import { useSelector } from "react-redux";
+import SyntaxHighlighter from "react-syntax-highlighter";
+import Markdown from "react-markdown";
+import { atomOneDark } from "react-syntax-highlighter/dist/esm/styles/hljs";
+import { RootState } from "#/store";
+import { Cell } from "#/state/jupyterSlice";
+
+interface IJupyterCell {
+  cell: Cell;
+}
+
+function JupyterCell({ cell }: IJupyterCell): JSX.Element {
+  const code = cell.content;
+
+  if (cell.type === "input") {
+    return (
+      <div className="rounded-lg bg-gray-800 dark:bg-gray-900 p-2 text-xs">
+        <div className="mb-1 text-gray-400">EXECUTE</div>
+        <pre
+          className="scrollbar-custom scrollbar-thumb-gray-500 hover:scrollbar-thumb-gray-400 dark:scrollbar-thumb-white/10 dark:hover:scrollbar-thumb-white/20 overflow-auto px-5"
+          style={{ padding: 0, marginBottom: 0, fontSize: "0.75rem" }}
+        >
+          <SyntaxHighlighter language="python" style={atomOneDark}>
+            {code}
+          </SyntaxHighlighter>
+        </pre>
+      </div>
+    );
+  }
+  return (
+    <div className="rounded-lg bg-gray-800 dark:bg-gray-900 p-2 text-xs">
+      <div className="mb-1 text-gray-400">STDOUT/STDERR</div>
+      <pre
+        className="scrollbar-custom scrollbar-thumb-gray-500 hover:scrollbar-thumb-gray-400 dark:scrollbar-thumb-white/10 dark:hover:scrollbar-thumb-white/20 overflow-auto px-5 max-h-[60vh] bg-gray-800"
+        style={{ padding: 0, marginBottom: 0, fontSize: "0.75rem" }}
+      >
+        {/* Split the content on newlines and render each line as plaintext, except lines that start with an inline base64 PNG (`![image](data:image/png;base64,`), which are rendered as markdown. */}
+        {code.split("\n").map((line, index) => {
+          if (line.startsWith("![image](data:image/png;base64,")) {
+            // Identity urlTransform keeps the data: URL as-is; a <br /> follows the image
+            return (
+              <div key={index}>
+                <Markdown urlTransform={(value: string) => value}>
+                  {line}
+                </Markdown>
+                <br />
+              </div>
+            );
+          }
+          return (
+            <div key={index}>
+              <SyntaxHighlighter language="plaintext" style={atomOneDark}>
+                {line}
+              </SyntaxHighlighter>
+              <br />
+            </div>
+          );
+        })}
+      </pre>
+    </div>
+  );
+}
+
+function Jupyter(): JSX.Element {
+  // Cells are rendered in store order and keyed by their position in the list.
+  const { cells } = useSelector((root: RootState) => root.jupyter);
+  const renderedCells = cells.map((cell, position) => (
+    <JupyterCell key={position} cell={cell} />
+  ));
+
+  return (
+    <div className="flex-1 overflow-y-auto flex flex-col">{renderedCells}</div>
+  );
+}
+
+export default Jupyter;
diff --git a/frontend/src/components/Planner.tsx b/frontend/src/components/Planner.tsx
index bb6afb76d4..c7f41919e7 100644
--- a/frontend/src/components/Planner.tsx
+++ b/frontend/src/components/Planner.tsx
@@ -1,4 +1,5 @@
 import React from "react";
+import { useTranslation } from "react-i18next";
 import {
   FaCheckCircle,
   FaQuestionCircle,
@@ -7,7 +8,9 @@ import {
   FaRegClock,
   FaRegTimesCircle,
 } from "react-icons/fa";
+import { VscListOrdered } from "react-icons/vsc";
 import { useSelector } from "react-redux";
+import { I18nKey } from "#/i18n/declaration";
 import { Plan, Task, TaskState } from "#/services/planService";
 import { RootState } from "#/store";
 
@@ -55,10 +58,13 @@ interface PlanProps {
 }
 
 function PlanContainer({ plan }: PlanProps): JSX.Element {
+  const { t } = useTranslation();
+
   if (plan.mainGoal === undefined) {
     return (
-      <div className="p-2">
-        Nothing is currently planned. Start a task for this to change.
+      <div className="w-full h-full flex flex-col text-neutral-400 items-center justify-center">
+        <VscListOrdered size={100} />
+        {t(I18nKey.PLANNER$EMPTY_MESSAGE)}
       </div>
     );
   }
diff --git a/frontend/src/components/Workspace.tsx b/frontend/src/components/Workspace.tsx
index 7d12b64d0f..b291e18c0b 100644
--- a/frontend/src/components/Workspace.tsx
+++ b/frontend/src/components/Workspace.tsx
@@ -12,6 +12,7 @@ import { AllTabs, TabOption, TabType } from "#/types/TabOption";
 import Browser from "./Browser";
 import CodeEditor from "./CodeEditor";
 import Planner from "./Planner";
+import Jupyter from "./Jupyter";
 
 function Workspace() {
   const { t } = useTranslation();
@@ -20,12 +21,13 @@ function Workspace() {
   const screenshotSrc = useSelector(
     (state: RootState) => state.browser.screenshotSrc,
   );
-
+  const jupyterCells = useSelector((state: RootState) => state.jupyter.cells);
   const [activeTab, setActiveTab] = useState<TabType>(TabOption.CODE);
   const [changes, setChanges] = useState<Record<TabType, boolean>>({
     [TabOption.PLANNER]: false,
     [TabOption.CODE]: false,
     [TabOption.BROWSER]: false,
+    [TabOption.JUPYTER]: false,
   });
 
   const tabData = useMemo(
@@ -45,6 +47,11 @@ function Workspace() {
         icon: <IoIosGlobe size={18} />,
         component: <Browser key="browser" />,
       },
+      [TabOption.JUPYTER]: {
+        name: t(I18nKey.WORKSPACE$JUPYTER_TAB_LABEL),
+        icon: <VscCode size={18} />,
+        component: <Jupyter key="jupyter" />,
+      },
     }),
     [t],
   );
@@ -73,6 +80,14 @@ function Workspace() {
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [screenshotSrc]);
 
+  useEffect(() => {
+    if (activeTab !== TabOption.JUPYTER && jupyterCells.length > 0) {
+      // FIXME: This is a temporary solution to show the jupyter tab when the first cell is added
+      // Only need to show the tab only when a cell is added
+      setChanges((prev) => ({ ...prev, [TabOption.JUPYTER]: true }));
+    }
+  }, [jupyterCells]);
+
   return (
     <div className="flex flex-col min-h-0 grow">
       <div
diff --git a/frontend/src/components/file-explorer/FileExplorer.tsx b/frontend/src/components/file-explorer/FileExplorer.tsx
index 89bc6c716f..51a39cd829 100644
--- a/frontend/src/components/file-explorer/FileExplorer.tsx
+++ b/frontend/src/components/file-explorer/FileExplorer.tsx
@@ -3,15 +3,17 @@ import {
   IoIosArrowBack,
   IoIosArrowForward,
   IoIosRefresh,
+  IoIosCloudUpload,
 } from "react-icons/io";
 import { twMerge } from "tailwind-merge";
-import { WorkspaceFile, getWorkspace } from "#/services/fileService";
+import { WorkspaceFile, getWorkspace, uploadFile } from "#/services/fileService";
 import IconButton from "../IconButton";
 import ExplorerTree from "./ExplorerTree";
 import { removeEmptyNodes } from "./utils";
 
 interface ExplorerActionsProps {
   onRefresh: () => void;
+  onUpload: () => void;
   toggleHidden: () => void;
   isHidden: boolean;
 }
@@ -19,6 +21,7 @@ interface ExplorerActionsProps {
 function ExplorerActions({
   toggleHidden,
   onRefresh,
+  onUpload,
   isHidden,
 }: ExplorerActionsProps) {
   return (
@@ -29,17 +32,30 @@ function ExplorerActions({
       )}
     >
       {!isHidden && (
-        <IconButton
-          icon={
-            <IoIosRefresh
-              size={16}
-              className="text-neutral-400 hover:text-neutral-100 transition"
-            />
-          }
-          testId="refresh"
-          ariaLabel="Refresh workspace"
-          onClick={onRefresh}
-        />
+        <>
+          <IconButton
+            icon={
+              <IoIosRefresh
+                size={16}
+                className="text-neutral-400 hover:text-neutral-100 transition"
+              />
+            }
+            testId="refresh"
+            ariaLabel="Refresh workspace"
+            onClick={onRefresh}
+          />
+          <IconButton
+            icon={
+              <IoIosCloudUpload
+                size={16}
+                className="text-neutral-400 hover:text-neutral-100 transition"
+              />
+            }
+            testId="upload"
+            ariaLabel="Upload File"
+            onClick={onUpload}
+          />
+        </>
       )}
 
       <IconButton
@@ -56,8 +72,8 @@ function ExplorerActions({
             />
           )
         }
-        testId="close"
-        ariaLabel="Close workspace"
+        testId="toggle"
+        ariaLabel={isHidden ? "Open workspace" : "Close workspace"}
         onClick={toggleHidden}
       />
     </div>
@@ -71,12 +87,33 @@ interface FileExplorerProps {
 function FileExplorer({ onFileClick }: FileExplorerProps) {
   const [workspace, setWorkspace] = React.useState<WorkspaceFile>();
   const [isHidden, setIsHidden] = React.useState(false);
+  const fileInputRef = React.useRef<HTMLInputElement | null>(null);
 
   const getWorkspaceData = async () => {
     const wsFile = await getWorkspace();
     setWorkspace(removeEmptyNodes(wsFile));
   };
 
+  // Opens the browser's file picker by programmatically clicking the hidden
+  // file <input> that `fileInputRef` points at (no-op while the ref is null).
+  const selectFileInput = () => fileInputRef.current?.click();
+
+  const uploadFileData = async (event: React.ChangeEvent<HTMLInputElement>) => {
+    const input = event.target; // captured now; used again after the awaits
+    const file = input.files ? input.files[0] : null;
+    if (!file) return;
+    try {
+      const response = await uploadFile(file);
+      console.log(response);
+      await getWorkspaceData(); // Refresh the workspace to show the new file
+    } catch (error) {
+      console.error("Error uploading file:", error);
+    } finally {
+      // Reset so picking the same file again still fires a `change` event.
+      input.value = "";
+    }
+  };
+
   React.useEffect(() => {
     (async () => {
       await getWorkspaceData();
@@ -105,8 +142,15 @@ function FileExplorer({ onFileClick }: FileExplorerProps) {
           isHidden={isHidden}
           toggleHidden={() => setIsHidden((prev) => !prev)}
           onRefresh={getWorkspaceData}
+          onUpload={selectFileInput}
         />
       </div>
+      <input
+        type="file"
+        ref={fileInputRef}
+        style={{ display: "none" }}
+        onChange={uploadFileData}
+      />
     </div>
   );
 }
diff --git a/frontend/src/components/modals/load-previous-session/LoadPreviousSessionModal.test.tsx b/frontend/src/components/modals/load-previous-session/LoadPreviousSessionModal.test.tsx
index 89d70d39f5..dc6a6516a0 100644
--- a/frontend/src/components/modals/load-previous-session/LoadPreviousSessionModal.test.tsx
+++ b/frontend/src/components/modals/load-previous-session/LoadPreviousSessionModal.test.tsx
@@ -3,7 +3,7 @@ import { act, render, screen, waitFor } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import LoadPreviousSessionModal from "./LoadPreviousSessionModal";
 import { clearMsgs, fetchMsgs } from "../../../services/session";
-import { sendChatMessageFromEvent } from "../../../services/chatService";
+import { addChatMessageFromEvent } from "../../../services/chatService";
 import { handleAssistantMessage } from "../../../services/actions";
 import toast from "../../../utils/toast";
 
@@ -37,7 +37,7 @@ vi.mock("../../../services/session", async (importOriginal) => ({
 
 vi.mock("../../../services/chatService", async (importOriginal) => ({
   ...(await importOriginal<typeof import("../../../services/chatService")>()),
-  sendChatMessageFromEvent: vi.fn(),
+  addChatMessageFromEvent: vi.fn(),
 }));
 
 vi.mock("../../../services/actions", async (importOriginal) => ({
@@ -94,7 +94,7 @@ describe("LoadPreviousSession", () => {
 
     await waitFor(() => {
       expect(fetchMsgs).toHaveBeenCalledTimes(1);
-      expect(sendChatMessageFromEvent).toHaveBeenCalledTimes(1);
+      expect(addChatMessageFromEvent).toHaveBeenCalledTimes(1);
       expect(handleAssistantMessage).toHaveBeenCalledTimes(1);
     });
     // modal should close right after fetching messages
@@ -117,7 +117,7 @@ describe("LoadPreviousSession", () => {
     await waitFor(async () => {
       await expect(() => fetchMsgs()).rejects.toThrow();
       expect(handleAssistantMessage).not.toHaveBeenCalled();
-      expect(sendChatMessageFromEvent).not.toHaveBeenCalled();
+      expect(addChatMessageFromEvent).not.toHaveBeenCalled();
       // error toast should be shown
       expect(toast.stickyError).toHaveBeenCalledWith(
         "ws",
diff --git a/frontend/src/components/modals/load-previous-session/LoadPreviousSessionModal.tsx b/frontend/src/components/modals/load-previous-session/LoadPreviousSessionModal.tsx
index 3d8beef582..2f29f05f10 100644
--- a/frontend/src/components/modals/load-previous-session/LoadPreviousSessionModal.tsx
+++ b/frontend/src/components/modals/load-previous-session/LoadPreviousSessionModal.tsx
@@ -2,7 +2,7 @@ import React from "react";
 import { useTranslation } from "react-i18next";
 import { I18nKey } from "#/i18n/declaration";
 import { handleAssistantMessage } from "#/services/actions";
-import { sendChatMessageFromEvent } from "#/services/chatService";
+import { addChatMessageFromEvent } from "#/services/chatService";
 import { clearMsgs, fetchMsgs } from "#/services/session";
 import toast from "#/utils/toast";
 import BaseModal from "../base-modal/BaseModal";
@@ -28,7 +28,7 @@ function LoadPreviousSessionModal({
 
       messages.forEach((message) => {
         if (message.role === "user") {
-          sendChatMessageFromEvent(message.payload);
+          addChatMessageFromEvent(message.payload);
         }
 
         if (message.role === "assistant") {
diff --git a/frontend/src/components/modals/settings/SettingsForm.test.tsx b/frontend/src/components/modals/settings/SettingsForm.test.tsx
index 35c0a4c011..c9b8414e59 100644
--- a/frontend/src/components/modals/settings/SettingsForm.test.tsx
+++ b/frontend/src/components/modals/settings/SettingsForm.test.tsx
@@ -1,9 +1,9 @@
-import { Settings } from "#/services/settings";
-import AgentTaskState from "#/types/AgentTaskState";
 import { act, screen } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import React from "react";
 import { renderWithProviders } from "test-utils";
+import AgentTaskState from "#/types/AgentTaskState";
+import { Settings } from "#/services/settings";
 import SettingsForm from "./SettingsForm";
 
 const onModelChangeMock = vi.fn();
@@ -106,7 +106,6 @@ describe("SettingsForm", () => {
       });
 
       expect(onModelChangeMock).toHaveBeenCalledWith("model3");
-      expect(onAPIKeyChangeMock).toHaveBeenCalledWith("");
     });
 
     it("should call the onAgentChange handler when the agent changes", () => {
diff --git a/frontend/src/components/modals/settings/SettingsForm.tsx b/frontend/src/components/modals/settings/SettingsForm.tsx
index 21042fbc22..7387c397d4 100644
--- a/frontend/src/components/modals/settings/SettingsForm.tsx
+++ b/frontend/src/components/modals/settings/SettingsForm.tsx
@@ -38,7 +38,8 @@ function SettingsForm({
   useEffect(() => {
     if (
       curTaskState === AgentTaskState.RUNNING ||
-      curTaskState === AgentTaskState.PAUSED
+      curTaskState === AgentTaskState.PAUSED ||
+      curTaskState === AgentTaskState.AWAITING_USER_INPUT
     ) {
       setDisabled(true);
     } else {
diff --git a/frontend/src/components/modals/settings/SettingsModal.test.tsx b/frontend/src/components/modals/settings/SettingsModal.test.tsx
index 8530f185f0..ba540b2a20 100644
--- a/frontend/src/components/modals/settings/SettingsModal.test.tsx
+++ b/frontend/src/components/modals/settings/SettingsModal.test.tsx
@@ -1,13 +1,13 @@
-import { fetchAgents, fetchModels } from "#/api";
-import { initializeAgent } from "#/services/agent";
-import { Settings, getSettings, saveSettings } from "#/services/settings";
-import toast from "#/utils/toast";
 import { act, screen, waitFor } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import i18next from "i18next";
 import React from "react";
 import { renderWithProviders } from "test-utils";
 import { Mock } from "vitest";
+import toast from "#/utils/toast";
+import { Settings, getSettings, saveSettings } from "#/services/settings";
+import { initializeAgent } from "#/services/agent";
+import { fetchAgents, fetchModels } from "#/api";
 import SettingsModal from "./SettingsModal";
 
 const toastSpy = vi.spyOn(toast, "settingsChanged");
@@ -129,6 +129,7 @@ describe("SettingsModal", () => {
       expect(saveSettings).toHaveBeenCalledWith({
         ...initialSettings,
         LLM_MODEL: "model3",
+        LLM_API_KEY: "", // reset after model change
       });
     });
 
@@ -160,6 +161,7 @@ describe("SettingsModal", () => {
       expect(initializeAgent).toHaveBeenCalledWith({
         ...initialSettings,
         LLM_MODEL: "model3",
+        LLM_API_KEY: "", // reset after model change
       });
     });
 
diff --git a/frontend/src/components/modals/settings/SettingsModal.tsx b/frontend/src/components/modals/settings/SettingsModal.tsx
index 58e1660a28..cf815a7f05 100644
--- a/frontend/src/components/modals/settings/SettingsModal.tsx
+++ b/frontend/src/components/modals/settings/SettingsModal.tsx
@@ -1,3 +1,7 @@
+import { Spinner } from "@nextui-org/react";
+import i18next from "i18next";
+import React from "react";
+import { useTranslation } from "react-i18next";
 import { fetchAgents, fetchModels } from "#/api";
 import { AvailableLanguages } from "#/i18n";
 import { I18nKey } from "#/i18n/declaration";
@@ -9,10 +13,6 @@ import {
   saveSettings,
 } from "#/services/settings";
 import toast from "#/utils/toast";
-import { Spinner } from "@nextui-org/react";
-import i18next from "i18next";
-import React from "react";
-import { useTranslation } from "react-i18next";
 import BaseModal from "../base-modal/BaseModal";
 import SettingsForm from "./SettingsForm";
 
@@ -76,8 +76,14 @@ function SettingsModal({ isOpen, onOpenChange }: SettingsProps) {
     i18next.changeLanguage(settings.LANGUAGE);
     initializeAgent(settings); // reinitialize the agent with the new settings
 
+    const sensitiveKeys = ['LLM_API_KEY'];
+
     Object.entries(updatedSettings).forEach(([key, value]) => {
-      toast.settingsChanged(`${key} set to "${value}"`);
+        if (!sensitiveKeys.includes(key)) {
+            toast.settingsChanged(`${key} set to "${value}"`);
+        } else {
+            toast.settingsChanged(`${key} has been updated securely.`);
+        }
     });
 
     localStorage.setItem(
@@ -86,6 +92,13 @@ function SettingsModal({ isOpen, onOpenChange }: SettingsProps) {
     );
   };
 
+  const isDisabled =
+    Object.entries(settings)
+      // filter api key
+      .filter(([key]) => key !== "LLM_API_KEY")
+      .some(([, value]) => !value) ||
+    JSON.stringify(settings) === JSON.stringify(currentSettings);
+
   return (
     <BaseModal
       isOpen={isOpen}
@@ -96,9 +109,7 @@ function SettingsModal({ isOpen, onOpenChange }: SettingsProps) {
         {
           label: t(I18nKey.CONFIGURATION$MODAL_SAVE_BUTTON_LABEL),
           action: handleSaveSettings,
-          isDisabled:
-            Object.values(settings).some((value) => !value) ||
-            JSON.stringify(settings) === JSON.stringify(currentSettings),
+          isDisabled,
           closeAfterAction: true,
           className: "bg-primary rounded-lg",
         },
diff --git a/frontend/src/i18n/translation.json b/frontend/src/i18n/translation.json
index 953c14d8cf..c786544bff 100644
--- a/frontend/src/i18n/translation.json
+++ b/frontend/src/i18n/translation.json
@@ -39,6 +39,19 @@
     "pt": "Planejador",
     "es": "Planificador"
   },
+  "WORKSPACE$JUPYTER_TAB_LABEL": {
+    "en": "Jupyter IPython",
+    "zh-CN": "Jupyter IPython",
+    "de": "Jupyter IPython",
+    "ko-KR": "Jupyter IPython",
+    "no": "Jupyter IPython",
+    "zh-TW": "Jupyter IPython",
+    "ar": "Jupyter IPython",
+    "fr": "Jupyter IPython",
+    "it": "Jupyter IPython",
+    "pt": "Jupyter IPython",
+    "es": "Jupyter IPython"
+  },
   "WORKSPACE$CODE_EDITOR_TAB_LABEL": {
     "en": "Code Editor",
     "zh-CN": "代码编辑器",
@@ -328,5 +341,17 @@
   "SETTINGS$API_KEY_PLACEHOLDER": {
     "en": "Enter your API key.",
     "de": "Model API key."
+  },
+  "CODE_EDITOR$EMPTY_MESSAGE": {
+    "en": "No file selected.",
+    "de": "Keine Datei ausgewählt."
+  },
+  "BROWSER$EMPTY_MESSAGE": {
+    "en": "No page loaded.",
+    "de": "Keine Seite geladen."
+  },
+  "PLANNER$EMPTY_MESSAGE": {
+    "en": "No plan created.",
+    "de": "Kein Plan erstellt."
   }
 }
diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts
index f4551a8960..121ff0a678 100644
--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -3,6 +3,7 @@ import { setScreenshotSrc, setUrl } from "#/state/browserSlice";
 import { appendAssistantMessage } from "#/state/chatSlice";
 import { setCode, updatePath } from "#/state/codeSlice";
 import { appendInput } from "#/state/commandSlice";
+import { appendJupyterInput } from "#/state/jupyterSlice";
 import { setPlan } from "#/state/planSlice";
 import { setInitialized } from "#/state/taskSlice";
 import store from "#/store";
@@ -29,12 +30,24 @@ const messageActions = {
   [ActionType.THINK]: (message: ActionMessage) => {
     store.dispatch(appendAssistantMessage(message.args.thought));
   },
+  [ActionType.TALK]: (message: ActionMessage) => {
+    store.dispatch(appendAssistantMessage(message.args.content));
+  },
   [ActionType.FINISH]: (message: ActionMessage) => {
     store.dispatch(appendAssistantMessage(message.message));
   },
   [ActionType.RUN]: (message: ActionMessage) => {
+    if (message.args.thought) {
+      store.dispatch(appendAssistantMessage(message.args.thought));
+    }
     store.dispatch(appendInput(message.args.command));
   },
+  [ActionType.RUN_IPYTHON]: (message: ActionMessage) => {
+    if (message.args.thought) {
+      store.dispatch(appendAssistantMessage(message.args.thought));
+    }
+    store.dispatch(appendJupyterInput(message.args.code));
+  },
   [ActionType.ADD_TASK]: () => {
     getPlan().then((fetchedPlan) => store.dispatch(setPlan(fetchedPlan)));
   },
diff --git a/frontend/src/services/chatService.ts b/frontend/src/services/chatService.ts
index 18d4042f2a..8c01b0079f 100644
--- a/frontend/src/services/chatService.ts
+++ b/frontend/src/services/chatService.ts
@@ -11,14 +11,19 @@ import { SocketMessage } from "#/types/ResponseType";
 import { ActionMessage } from "#/types/Message";
 import Socket from "./socket";
 
-export function sendChatMessage(message: string): void {
+export function sendChatMessage(message: string, isTask: boolean = true): void {
   store.dispatch(appendUserMessage(message));
-  const event = { action: ActionType.START, args: { task: message } };
+  // `isTask` selects the event: START carries the text as `args.task`,
+  // USER_MESSAGE carries it as `args.message`. Either way the event is
+  // serialized and pushed over the socket below.
+  const event = isTask
+    ? { action: ActionType.START, args: { task: message } }
+    : { action: ActionType.USER_MESSAGE, args: { message } };
   const eventString = JSON.stringify(event);
   Socket.send(eventString);
 }
 
-export function sendChatMessageFromEvent(event: string | SocketMessage): void {
+export function addChatMessageFromEvent(event: string | SocketMessage): void {
   try {
     let data: ActionMessage;
     if (typeof event === "string") {
diff --git a/frontend/src/services/fileService.ts b/frontend/src/services/fileService.ts
index 3e23924c9b..e9c4a424cc 100644
--- a/frontend/src/services/fileService.ts
+++ b/frontend/src/services/fileService.ts
@@ -12,6 +12,24 @@ export async function selectFile(file: string): Promise<string> {
   return data.code as string;
 }
 
+// Uploads `file` to /api/upload-file as multipart form data; throws on failure.
+export async function uploadFile(file: File): Promise<string> {
+  const formData = new FormData();
+  formData.append("file", file);
+
+  const res = await fetch("/api/upload-file", {
+    method: "POST",
+    body: formData,
+  });
+  // Error bodies are not guaranteed to be JSON (e.g. an HTML error page), so a
+  // parse failure must not mask the upload failure reported below.
+  const data = await res.json().catch(() => null);
+  if (res.status !== 200 || !data) {
+    throw new Error(data?.error || "Failed to upload file.");
+  }
+  return `File uploaded: ${data.filename}, Location: ${data.location}`;
+}
+
 export async function getWorkspace(): Promise<WorkspaceFile> {
   const res = await fetch("/api/refresh-files");
   const data = await res.json();
diff --git a/frontend/src/services/observations.ts b/frontend/src/services/observations.ts
index 39cafafd93..956fc90669 100644
--- a/frontend/src/services/observations.ts
+++ b/frontend/src/services/observations.ts
@@ -3,6 +3,7 @@ import { setUrl, setScreenshotSrc } from "#/state/browserSlice";
 import store from "#/store";
 import { ObservationMessage } from "#/types/Message";
 import { appendOutput } from "#/state/commandSlice";
+import { appendJupyterOutput } from "#/state/jupyterSlice";
 import ObservationType from "#/types/ObservationType";
 
 export function handleObservationMessage(message: ObservationMessage) {
@@ -10,6 +11,10 @@ export function handleObservationMessage(message: ObservationMessage) {
     case ObservationType.RUN:
       store.dispatch(appendOutput(message.content));
       break;
+    case ObservationType.RUN_IPYTHON:
+      // FIXME: render this as markdown
+      store.dispatch(appendJupyterOutput(message.content));
+      break;
     case ObservationType.BROWSE:
       if (message.extras?.screenshot) {
         store.dispatch(setScreenshotSrc(message.extras.screenshot));
diff --git a/frontend/src/services/settings.test.ts b/frontend/src/services/settings.test.ts
index caef2486c2..6dc2d0597b 100644
--- a/frontend/src/services/settings.test.ts
+++ b/frontend/src/services/settings.test.ts
@@ -1,6 +1,7 @@
 import { describe, expect, it, vi, Mock } from "vitest";
 import {
   DEFAULT_SETTINGS,
+  Settings,
   getSettings,
   getSettingsDifference,
   saveSettings,
@@ -18,7 +19,8 @@ describe("getSettings", () => {
     (localStorage.getItem as Mock)
       .mockReturnValueOnce("llm_value")
       .mockReturnValueOnce("agent_value")
-      .mockReturnValueOnce("language_value");
+      .mockReturnValueOnce("language_value")
+      .mockReturnValueOnce("api_key");
 
     const settings = getSettings();
 
@@ -26,11 +28,13 @@ describe("getSettings", () => {
       LLM_MODEL: "llm_value",
       AGENT: "agent_value",
       LANGUAGE: "language_value",
+      LLM_API_KEY: "api_key",
     });
   });
 
   it("should handle return defaults if localStorage key does not exist", () => {
     (localStorage.getItem as Mock)
+      .mockReturnValueOnce(null)
       .mockReturnValueOnce(null)
       .mockReturnValueOnce(null)
       .mockReturnValueOnce(null);
@@ -41,16 +45,18 @@ describe("getSettings", () => {
       LLM_MODEL: DEFAULT_SETTINGS.LLM_MODEL,
       AGENT: DEFAULT_SETTINGS.AGENT,
       LANGUAGE: DEFAULT_SETTINGS.LANGUAGE,
+      LLM_API_KEY: "",
     });
   });
 });
 
 describe("saveSettings", () => {
   it("should save the settings", () => {
-    const settings = {
+    const settings: Settings = {
       LLM_MODEL: "llm_value",
       AGENT: "agent_value",
       LANGUAGE: "language_value",
+      LLM_API_KEY: "some_key",
     };
 
     saveSettings(settings);
@@ -61,6 +67,10 @@ describe("saveSettings", () => {
       "LANGUAGE",
       "language_value",
     );
+    expect(localStorage.setItem).toHaveBeenCalledWith(
+      "LLM_API_KEY",
+      "some_key",
+    );
   });
 
   it("should save partial settings", () => {
diff --git a/frontend/src/services/settings.ts b/frontend/src/services/settings.ts
index 29129b13f3..e5997c6f4d 100644
--- a/frontend/src/services/settings.ts
+++ b/frontend/src/services/settings.ts
@@ -17,15 +17,19 @@ const validKeys = Object.keys(DEFAULT_SETTINGS) as (keyof Settings)[];
 /**
  * Get the settings from local storage or use the default settings if not found
  */
-export const getSettings = (): Settings => ({
-  LLM_MODEL: localStorage.getItem("LLM_MODEL") || DEFAULT_SETTINGS.LLM_MODEL,
-  AGENT: localStorage.getItem("AGENT") || DEFAULT_SETTINGS.AGENT,
-  LANGUAGE: localStorage.getItem("LANGUAGE") || DEFAULT_SETTINGS.LANGUAGE,
-  LLM_API_KEY:
-    localStorage.getItem(
-      `API_KEY_${localStorage.getItem("LLM_MODEL") || DEFAULT_SETTINGS.LLM_MODEL}`,
-    ) || DEFAULT_SETTINGS.LLM_API_KEY,
-});
+export const getSettings = (): Settings => {
+  // Default the model first so the per-model API key lookup uses a real name.
+  const model = localStorage.getItem("LLM_MODEL") || DEFAULT_SETTINGS.LLM_MODEL;
+  const agent = localStorage.getItem("AGENT");
+  const language = localStorage.getItem("LANGUAGE");
+  const apiKey = localStorage.getItem(`API_KEY_${model}`);
+  return {
+    LLM_MODEL: model,
+    AGENT: agent || DEFAULT_SETTINGS.AGENT,
+    LANGUAGE: language || DEFAULT_SETTINGS.LANGUAGE,
+    LLM_API_KEY: apiKey || DEFAULT_SETTINGS.LLM_API_KEY,
+  };
+};
 
 /**
  * Save the settings to local storage. Only valid settings are saved.
diff --git a/frontend/src/state/jupyterSlice.ts b/frontend/src/state/jupyterSlice.ts
new file mode 100644
index 0000000000..241a8df6f9
--- /dev/null
+++ b/frontend/src/state/jupyterSlice.ts
@@ -0,0 +1,27 @@
+import { createSlice, PayloadAction } from "@reduxjs/toolkit";
+
+export type Cell = {
+  content: string; // text stored for the cell
+  type: "input" | "output"; // set by the reducer that appended the cell
+};
+
+const initialCells: Cell[] = [];
+
+export const cellSlice = createSlice({
+  name: "cell",
+  initialState: {
+    cells: initialCells,
+  },
+  reducers: {
+    appendJupyterInput: (state, action: PayloadAction<string>) => {
+      state.cells.push({ content: action.payload, type: "input" });
+    },
+    appendJupyterOutput: (state, action: PayloadAction<string>) => {
+      state.cells.push({ content: action.payload, type: "output" });
+    },
+  },
+});
+
+export const { appendJupyterInput, appendJupyterOutput } = cellSlice.actions;
+
+export default cellSlice.reducer;
diff --git a/frontend/src/store.ts b/frontend/src/store.ts
index 05ea2fe509..f675f929ec 100644
--- a/frontend/src/store.ts
+++ b/frontend/src/store.ts
@@ -7,6 +7,7 @@ import commandReducer from "./state/commandSlice";
 import errorsReducer from "./state/errorsSlice";
 import planReducer from "./state/planSlice";
 import taskReducer from "./state/taskSlice";
+import jupyterReducer from "./state/jupyterSlice";
 
 export const rootReducer = combineReducers({
   browser: browserReducer,
@@ -17,6 +18,7 @@ export const rootReducer = combineReducers({
   errors: errorsReducer,
   plan: planReducer,
   agent: agentReducer,
+  jupyter: jupyterReducer,
 });
 
 const store = configureStore({
diff --git a/frontend/src/types/ActionType.tsx b/frontend/src/types/ActionType.tsx
index 1d88324eb0..1862a13f2d 100644
--- a/frontend/src/types/ActionType.tsx
+++ b/frontend/src/types/ActionType.tsx
@@ -2,7 +2,10 @@ enum ActionType {
   // Initializes the agent. Only sent by client.
   INIT = "initialize",
 
-  // Starts a new development task. Only sent by the client.
+  // Sends a message from the user
+  USER_MESSAGE = "user_message",
+
+  // Starts a new development task
   START = "start",
 
   // Reads the contents of a file.
@@ -14,6 +17,9 @@ enum ActionType {
   // Runs a command.
   RUN = "run",
 
+  // Runs an IPython command.
+  RUN_IPYTHON = "run_ipython",
+
   // Kills a background command.
   KILL = "kill",
 
@@ -26,6 +32,9 @@ enum ActionType {
   // Allows the agent to make a plan, set a goal, or record thoughts.
   THINK = "think",
 
+  // Allows the agent to respond to the user. Only sent by the agent.
+  TALK = "talk",
+
   // If you're absolutely certain that you've completed your task and have tested your work,
   // use the finish action to stop working.
   FINISH = "finish",
diff --git a/frontend/src/types/AgentTaskState.tsx b/frontend/src/types/AgentTaskState.tsx
index d7aa9c80bd..b5cd1d5a52 100644
--- a/frontend/src/types/AgentTaskState.tsx
+++ b/frontend/src/types/AgentTaskState.tsx
@@ -1,6 +1,7 @@
 enum AgentTaskState {
   INIT = "init",
   RUNNING = "running",
+  AWAITING_USER_INPUT = "awaiting_user_input",
   PAUSED = "paused",
   STOPPED = "stopped",
   FINISHED = "finished",
diff --git a/frontend/src/types/ObservationType.tsx b/frontend/src/types/ObservationType.tsx
index b86bd58919..6ba8bda578 100644
--- a/frontend/src/types/ObservationType.tsx
+++ b/frontend/src/types/ObservationType.tsx
@@ -8,6 +8,9 @@ enum ObservationType {
   // The output of a command
   RUN = "run",
 
+  // The output of an IPython command
+  RUN_IPYTHON = "run_ipython",
+
   // The result of a search
   RECALL = "recall",
 
diff --git a/frontend/src/types/TabOption.tsx b/frontend/src/types/TabOption.tsx
index b114c0da28..a732acb2c7 100644
--- a/frontend/src/types/TabOption.tsx
+++ b/frontend/src/types/TabOption.tsx
@@ -2,10 +2,20 @@ enum TabOption {
   PLANNER = "planner",
   CODE = "code",
   BROWSER = "browser",
+  JUPYTER = "jupyter",
 }
 
-type TabType = TabOption.PLANNER | TabOption.CODE | TabOption.BROWSER;
+type TabType =
+  | TabOption.PLANNER
+  | TabOption.CODE
+  | TabOption.BROWSER
+  | TabOption.JUPYTER;
 
-const AllTabs = [TabOption.CODE, TabOption.BROWSER, TabOption.PLANNER];
+const AllTabs = [
+  TabOption.CODE,
+  TabOption.BROWSER,
+  TabOption.PLANNER,
+  TabOption.JUPYTER,
+];
 
 export { AllTabs, TabOption, type TabType };
diff --git a/opendevin/action/__init__.py b/opendevin/action/__init__.py
index 782400319f..9e3d939d3f 100644
--- a/opendevin/action/__init__.py
+++ b/opendevin/action/__init__.py
@@ -1,27 +1,30 @@
+from ..exceptions import AgentMalformedActionError
+from .agent import (
+    AgentDelegateAction,
+    AgentEchoAction,
+    AgentFinishAction,
+    AgentRecallAction,
+    AgentSummarizeAction,
+    AgentTalkAction,
+    AgentThinkAction,
+)
 from .base import Action, NullAction
-from .bash import CmdRunAction, CmdKillAction
+from .bash import CmdKillAction, CmdRunAction, IPythonRunCellAction
 from .browse import BrowseURLAction
 from .fileop import FileReadAction, FileWriteAction
 from .github import GitHubPushAction
-from .agent import (
-    AgentRecallAction,
-    AgentThinkAction,
-    AgentFinishAction,
-    AgentEchoAction,
-    AgentSummarizeAction,
-    AgentDelegateAction,
-)
 from .tasks import AddTaskAction, ModifyTaskAction
-from ..exceptions import AgentMalformedActionError
 
 actions = (
     CmdKillAction,
     CmdRunAction,
+    IPythonRunCellAction,
     BrowseURLAction,
     FileReadAction,
     FileWriteAction,
     AgentRecallAction,
     AgentThinkAction,
+    AgentTalkAction,
     AgentFinishAction,
     AgentDelegateAction,
     AddTaskAction,
@@ -61,10 +64,12 @@ __all__ = [
     'FileWriteAction',
     'AgentRecallAction',
     'AgentThinkAction',
+    'AgentTalkAction',
     'AgentFinishAction',
     'AgentDelegateAction',
     'AgentEchoAction',
     'AgentSummarizeAction',
     'AddTaskAction',
     'ModifyTaskAction',
+    'IPythonRunCellAction'
 ]
diff --git a/opendevin/action/agent.py b/opendevin/action/agent.py
index 1cb0b8cdf2..063d095bd7 100644
--- a/opendevin/action/agent.py
+++ b/opendevin/action/agent.py
@@ -2,12 +2,13 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict
 
 from opendevin.observation import (
-    AgentRecallObservation,
     AgentMessageObservation,
+    AgentRecallObservation,
     NullObservation,
     Observation,
 )
 from opendevin.schema import ActionType
+
 from .base import ExecutableAction, NotExecutableAction
 
 if TYPE_CHECKING:
@@ -17,11 +18,12 @@ if TYPE_CHECKING:
 @dataclass
 class AgentRecallAction(ExecutableAction):
     query: str
+    thought: str = ''
     action: str = ActionType.RECALL
 
     async def run(self, controller: 'AgentController') -> AgentRecallObservation:
         return AgentRecallObservation(
-            content='Recalling memories...',
+            content='',
             memories=controller.agent.search_memory(self.query),
         )
 
@@ -43,6 +45,22 @@ class AgentThinkAction(NotExecutableAction):
         return self.thought
 
 
+@dataclass
+class AgentTalkAction(NotExecutableAction):
+    content: str  # text of the agent's message; also returned by `message` and `__str__`
+    action: str = ActionType.TALK
+
+    async def run(self, controller: 'AgentController') -> 'Observation':
+        raise NotImplementedError  # this action has no executable behaviour of its own
+
+    @property
+    def message(self) -> str:
+        return self.content
+
+    def __str__(self) -> str:
+        return self.content
+
+
 @dataclass
 class AgentEchoAction(ExecutableAction):
     content: str
@@ -69,6 +87,7 @@ class AgentSummarizeAction(NotExecutableAction):
 @dataclass
 class AgentFinishAction(NotExecutableAction):
     outputs: Dict = field(default_factory=dict)
+    thought: str = ''
     action: str = ActionType.FINISH
 
     async def run(self, controller: 'AgentController') -> 'Observation':
@@ -83,6 +102,7 @@ class AgentFinishAction(NotExecutableAction):
 class AgentDelegateAction(ExecutableAction):
     agent: str
     inputs: dict
+    thought: str = ''
     action: str = ActionType.DELEGATE
 
     async def run(self, controller: 'AgentController') -> 'Observation':
diff --git a/opendevin/action/base.py b/opendevin/action/base.py
index f3273011aa..d459713f3f 100644
--- a/opendevin/action/base.py
+++ b/opendevin/action/base.py
@@ -1,5 +1,6 @@
-from dataclasses import dataclass, asdict
+from dataclasses import asdict, dataclass
 from typing import TYPE_CHECKING
+
 from opendevin.schema import ActionType
 
 if TYPE_CHECKING:
diff --git a/opendevin/action/bash.py b/opendevin/action/bash.py
index 1757c7ea3c..160a795491 100644
--- a/opendevin/action/bash.py
+++ b/opendevin/action/bash.py
@@ -1,18 +1,25 @@
+import os
+import pathlib
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
+from opendevin import config
+from opendevin.schema import ActionType, ConfigType
+
 from .base import ExecutableAction
-from opendevin.schema import ActionType
 
 if TYPE_CHECKING:
     from opendevin.controller import AgentController
     from opendevin.observation import CmdOutputObservation, Observation
 
+from opendevin.observation import IPythonRunCellObservation
+
 
 @dataclass
 class CmdRunAction(ExecutableAction):
     command: str
     background: bool = False
+    thought: str = ''
     action: str = ActionType.RUN
 
     async def run(self, controller: 'AgentController') -> 'Observation':
@@ -22,10 +29,18 @@ class CmdRunAction(ExecutableAction):
     def message(self) -> str:
         return f'Running command: {self.command}'
 
+    def __str__(self) -> str:
+        ret = '**CmdRunAction**\n'
+        if self.thought:  # the THOUGHT line is emitted only for a non-empty thought
+            ret += f'THOUGHT:{self.thought}\n'
+        ret += f'COMMAND:\n{self.command}'
+        return ret
+
 
 @dataclass
 class CmdKillAction(ExecutableAction):
     id: int
+    thought: str = ''
     action: str = ActionType.KILL
 
     async def run(self, controller: 'AgentController') -> 'CmdOutputObservation':
@@ -34,3 +49,48 @@ class CmdKillAction(ExecutableAction):
     @property
     def message(self) -> str:
         return f'Killing command: {self.id}'
+
+    def __str__(self) -> str:
+        return f'**CmdKillAction**\n{self.id}'  # `thought` is not part of this rendering
+
+
+@dataclass
+class IPythonRunCellAction(ExecutableAction):
+    code: str  # source text written to the temp file and fed to `execute_cli`
+    thought: str = ''  # optional; shown by __str__ only when non-empty
+    action: str = ActionType.RUN_IPYTHON
+
+    async def run(self, controller: 'AgentController') -> 'IPythonRunCellObservation':
+        # echo "import math" | execute_cli
+        # write code to a temporary file and pass it to `execute_cli` via stdin
+        tmp_filepath = os.path.join(
+            config.get(ConfigType.WORKSPACE_BASE),
+            '.tmp', '.ipython_execution_tmp.py'
+        )
+        pathlib.Path(os.path.dirname(tmp_filepath)).mkdir(parents=True, exist_ok=True)
+        with open(tmp_filepath, 'w') as tmp_file:  # fixed filename: overwritten on every call
+            tmp_file.write(self.code)
+
+        tmp_filepath_inside_sandbox = os.path.join(
+            config.get(ConfigType.WORKSPACE_MOUNT_PATH_IN_SANDBOX),  # presumably the same directory as seen from inside the sandbox — confirm
+            '.tmp', '.ipython_execution_tmp.py'
+        )
+        obs = controller.action_manager.run_command(
+            f'execute_cli < {tmp_filepath_inside_sandbox}',  # NOTE(review): path is interpolated without shell quoting
+            background=False
+        )
+        return IPythonRunCellObservation(
+            content=obs.content,  # output of the `execute_cli` command is passed through unchanged
+            code=self.code
+        )
+
+    def __str__(self) -> str:
+        ret = '**IPythonRunCellAction**\n'
+        if self.thought:  # the THOUGHT line is emitted only for a non-empty thought
+            ret += f'THOUGHT:{self.thought}\n'
+        ret += f'CODE:\n{self.code}'
+        return ret
+
+    @property
+    def message(self) -> str:
+        return f'Running Python code interactively: {self.code}'
diff --git a/opendevin/action/browse.py b/opendevin/action/browse.py
index 58639e984e..8ed1525289 100644
--- a/opendevin/action/browse.py
+++ b/opendevin/action/browse.py
@@ -1,10 +1,12 @@
-import os
 import base64
+import os
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from playwright.async_api import async_playwright
+
 from opendevin.observation import BrowserOutputObservation
 from opendevin.schema import ActionType
-from typing import TYPE_CHECKING
-from playwright.async_api import async_playwright
 
 from .base import ExecutableAction
 
@@ -15,6 +17,7 @@ if TYPE_CHECKING:
 @dataclass
 class BrowseURLAction(ExecutableAction):
     url: str
+    thought: str = ''
     action: str = ActionType.BROWSE
 
     async def run(self, controller: 'AgentController') -> BrowserOutputObservation:  # type: ignore
diff --git a/opendevin/action/fileop.py b/opendevin/action/fileop.py
index b1da4b240b..4e630c9383 100644
--- a/opendevin/action/fileop.py
+++ b/opendevin/action/fileop.py
@@ -1,18 +1,16 @@
 import os
-
 from dataclasses import dataclass
 from pathlib import Path
 
+from opendevin import config
 from opendevin.observation import (
-    Observation,
+    AgentErrorObservation,
     FileReadObservation,
     FileWriteObservation,
-    AgentErrorObservation,
+    Observation,
 )
-
-from opendevin.schema import ActionType
 from opendevin.sandbox import E2BBox
-from opendevin import config
+from opendevin.schema import ActionType
 from opendevin.schema.config import ConfigType
 
 from .base import ExecutableAction
@@ -52,7 +50,7 @@ class FileReadAction(ExecutableAction):
     path: str
     start: int = 0
     end: int = -1
-    thoughts: str = ''
+    thought: str = ''
     action: str = ActionType.READ
 
     def _read_lines(self, all_lines: list[str]):
@@ -102,7 +100,7 @@ class FileWriteAction(ExecutableAction):
     content: str
     start: int = 0
     end: int = -1
-    thoughts: str = ''
+    thought: str = ''
     action: str = ActionType.WRITE
 
     def _insert_lines(self, to_insert: list[str], original: list[str]):
diff --git a/opendevin/action/github.py b/opendevin/action/github.py
index 738adb0680..78dd81fba2 100644
--- a/opendevin/action/github.py
+++ b/opendevin/action/github.py
@@ -1,14 +1,15 @@
+import random
+import string
 from dataclasses import dataclass
-from opendevin.observation import Observation, AgentErrorObservation
+from typing import TYPE_CHECKING
+
+import requests
+
+from opendevin import config
+from opendevin.observation import AgentErrorObservation, Observation
 from opendevin.observation.message import AgentMessageObservation
 from opendevin.observation.run import CmdOutputObservation
 from opendevin.schema import ActionType
-from opendevin import config
-from typing import TYPE_CHECKING
-import requests
-import random
-import string
-
 from opendevin.schema.config import ConfigType
 
 from .base import ExecutableAction
diff --git a/opendevin/action/tasks.py b/opendevin/action/tasks.py
index 96c6fb8f53..5eff530e66 100644
--- a/opendevin/action/tasks.py
+++ b/opendevin/action/tasks.py
@@ -1,10 +1,11 @@
 from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from opendevin.observation import NullObservation
+from opendevin.schema import ActionType
 
 from .base import ExecutableAction, NotExecutableAction
-from opendevin.schema import ActionType
-from opendevin.observation import NullObservation
 
-from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from opendevin.controller import AgentController
 
@@ -14,6 +15,7 @@ class AddTaskAction(ExecutableAction):
     parent: str
     goal: str
     subtasks: list = field(default_factory=list)
+    thought: str = ''
     action: str = ActionType.ADD_TASK
 
     async def run(self, controller: 'AgentController') -> NullObservation:  # type: ignore
@@ -30,6 +32,7 @@ class AddTaskAction(ExecutableAction):
 class ModifyTaskAction(ExecutableAction):
     id: str
     state: str
+    thought: str = ''
     action: str = ActionType.MODIFY_TASK
 
     async def run(self, controller: 'AgentController') -> NullObservation:  # type: ignore
@@ -46,6 +49,7 @@ class ModifyTaskAction(ExecutableAction):
 class TaskStateChangedAction(NotExecutableAction):
     """Fake action, just to notify the client that a task state has changed."""
     task_state: str
+    thought: str = ''
     action: str = ActionType.CHANGE_TASK_STATE
 
     @property
diff --git a/opendevin/agent.py b/opendevin/agent.py
index 855d50ff7f..a51a214bd5 100644
--- a/opendevin/agent.py
+++ b/opendevin/agent.py
@@ -1,11 +1,11 @@
 from abc import ABC, abstractmethod
-from typing import List, Dict, Type, TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict, List, Type
 
 if TYPE_CHECKING:
     from opendevin.action import Action
     from opendevin.state import State
-from opendevin.llm.llm import LLM
 from opendevin.exceptions import AgentAlreadyRegisteredError, AgentNotRegisteredError
+from opendevin.llm.llm import LLM
 from opendevin.sandbox.plugins import PluginRequirement
 
 
diff --git a/opendevin/config.py b/opendevin/config.py
index 4f7482cac5..1cb3733d90 100644
--- a/opendevin/config.py
+++ b/opendevin/config.py
@@ -1,12 +1,13 @@
-import os
 import argparse
-import toml
+import logging
+import os
 import pathlib
 import platform
+
+import toml
 from dotenv import load_dotenv
 
 from opendevin.schema import ConfigType
-import logging
 
 logger = logging.getLogger(__name__)
 
@@ -30,6 +31,7 @@ DEFAULT_CONFIG: dict = {
     ConfigType.SANDBOX_CONTAINER_IMAGE: DEFAULT_CONTAINER_IMAGE,
     ConfigType.RUN_AS_DEVIN: 'true',
     ConfigType.LLM_EMBEDDING_MODEL: 'local',
+    ConfigType.LLM_EMBEDDING_BASE_URL: None,
     ConfigType.LLM_EMBEDDING_DEPLOYMENT_NAME: None,
     ConfigType.LLM_API_VERSION: None,
     ConfigType.LLM_NUM_RETRIES: 5,
@@ -50,8 +52,10 @@ DEFAULT_CONFIG: dict = {
     ConfigType.USE_HOST_NETWORK: 'false',
     ConfigType.SSH_HOSTNAME: 'localhost',
     ConfigType.DISABLE_COLOR: 'false',
+    # Default to the host uid where os.getuid exists (it is absent on Windows).
+    ConfigType.SANDBOX_USER_ID: os.getuid() if hasattr(os, 'getuid') else None,
     ConfigType.SANDBOX_TIMEOUT: 120,
-    ConfigType.GITHUB_TOKEN: None
+    ConfigType.GITHUB_TOKEN: None,
 }
 
 config_str = ''
@@ -153,6 +157,9 @@ def finalize_config():
     if config.get(ConfigType.WORKSPACE_MOUNT_PATH) is None:
         config[ConfigType.WORKSPACE_MOUNT_PATH] = os.path.abspath(config[ConfigType.WORKSPACE_BASE])
 
+    if config.get(ConfigType.LLM_EMBEDDING_BASE_URL) is None:
+        config[ConfigType.LLM_EMBEDDING_BASE_URL] = config.get(ConfigType.LLM_BASE_URL)
+
     USE_HOST_NETWORK = config[ConfigType.USE_HOST_NETWORK].lower() != 'false'
     if USE_HOST_NETWORK and platform.system() == 'Darwin':
         logger.warning(
@@ -164,7 +171,6 @@ def finalize_config():
     if config.get(ConfigType.WORKSPACE_MOUNT_PATH) is None:
         config[ConfigType.WORKSPACE_MOUNT_PATH] = config.get(ConfigType.WORKSPACE_BASE)
 
-
 finalize_config()
 
 
diff --git a/opendevin/controller/__init__.py b/opendevin/controller/__init__.py
index 2f6d436e01..005df25448 100644
--- a/opendevin/controller/__init__.py
+++ b/opendevin/controller/__init__.py
@@ -1,5 +1,5 @@
-from .agent_controller import AgentController
 from .action_manager import ActionManager
+from .agent_controller import AgentController
 
 __all__ = [
     'AgentController',
diff --git a/opendevin/controller/action_manager.py b/opendevin/controller/action_manager.py
index d9c91a2c76..ea5e0404f9 100644
--- a/opendevin/controller/action_manager.py
+++ b/opendevin/controller/action_manager.py
@@ -1,17 +1,18 @@
 from typing import List
 
 from opendevin import config
-from opendevin.observation import CmdOutputObservation, AgentErrorObservation
-from opendevin.sandbox import DockerExecBox, DockerSSHBox, Sandbox, LocalBox, E2BBox
-from opendevin.schema import ConfigType
 from opendevin.action import (
     Action,
 )
 from opendevin.observation import (
-    Observation,
+    AgentErrorObservation,
+    CmdOutputObservation,
     NullObservation,
+    Observation,
 )
+from opendevin.sandbox import DockerExecBox, DockerSSHBox, E2BBox, LocalBox, Sandbox
 from opendevin.sandbox.plugins import PluginRequirement
+from opendevin.schema import ConfigType
 
 
 class ActionManager:
diff --git a/opendevin/controller/agent_controller.py b/opendevin/controller/agent_controller.py
index 7e1da1e246..c4c23c11b7 100644
--- a/opendevin/controller/agent_controller.py
+++ b/opendevin/controller/agent_controller.py
@@ -1,30 +1,37 @@
 import asyncio
 from typing import Callable, List, Type
 
-
+from agenthub.codeact_agent.codeact_agent import CodeActAgent
 from opendevin import config
-from opendevin.schema.config import ConfigType
 from opendevin.action import (
     Action,
-    AgentFinishAction,
     AgentDelegateAction,
+    AgentFinishAction,
+    AgentTalkAction,
     NullAction,
 )
-from opendevin.observation import (
-    Observation,
-    AgentErrorObservation,
-    AgentDelegateObservation,
-    NullObservation,
-)
-from opendevin.agent import Agent
-from opendevin.exceptions import AgentMalformedActionError, AgentNoActionError, MaxCharsExceedError, LLMOutputError
-from opendevin.logger import opendevin_logger as logger
-from opendevin.plan import Plan
-from opendevin.state import State
-
 from opendevin.action.tasks import TaskStateChangedAction
-from opendevin.schema import TaskState
+from opendevin.agent import Agent
 from opendevin.controller.action_manager import ActionManager
+from opendevin.exceptions import (
+    AgentMalformedActionError,
+    AgentNoActionError,
+    LLMOutputError,
+    MaxCharsExceedError,
+)
+from opendevin.logger import opendevin_logger as logger
+from opendevin.observation import (
+    AgentDelegateObservation,
+    AgentErrorObservation,
+    NullObservation,
+    Observation,
+    UserMessageObservation,
+)
+from opendevin.plan import Plan
+from opendevin.sandbox import DockerSSHBox
+from opendevin.schema import TaskState
+from opendevin.schema.config import ConfigType
+from opendevin.state import State
 
 MAX_ITERATIONS = config.get(ConfigType.MAX_ITERATIONS)
 MAX_CHARS = config.get(ConfigType.MAX_CHARS)
@@ -61,6 +68,11 @@ class AgentController:
         # Initialize agent-required plugins for sandbox (if any)
         self.action_manager.init_sandbox_plugins(agent.sandbox_plugins)
 
+        if isinstance(agent, CodeActAgent) and not isinstance(self.action_manager.sandbox, DockerSSHBox):
+            logger.warning('CodeActAgent requires DockerSSHBox as sandbox! Using other sandbox that are not stateful (LocalBox, DockerExecBox) will not work properly.')
+
+        self._await_user_message_queue: asyncio.Queue = asyncio.Queue()
+
     def update_state_for_step(self, i):
         if self.state is None:
             return
@@ -171,6 +183,36 @@ class AgentController:
     async def notify_task_state_changed(self):
         await self._run_callbacks(TaskStateChangedAction(self._task_state))
 
+    async def add_user_message(self, message: UserMessageObservation) -> None:
+        if self.state is None:
+            return  # no state yet: the message is dropped without error
+
+        if self._task_state == TaskState.AWAITING_USER_INPUT:
+            self._await_user_message_queue.put_nowait(message)  # consumed by wait_for_user_input()
+
+            # set the task state to running
+            self._task_state = TaskState.RUNNING
+            await self.notify_task_state_changed()
+
+        elif self._task_state == TaskState.RUNNING:
+            self.add_history(NullAction(), message)  # recorded in history paired with a NullAction
+
+        else:
+            raise ValueError(f'Task (state: {self._task_state}) is not in a state to add user message')
+
+    async def wait_for_user_input(self) -> UserMessageObservation:
+        self._task_state = TaskState.AWAITING_USER_INPUT
+        await self.notify_task_state_changed()
+        # wait for the next user message
+        if len(self.callbacks) == 0:
+            logger.info('Use STDIN to request user message as no callbacks are registered', extra={'msg_type': 'INFO'})
+            message = input('Request user input [type /exit to stop interaction] >> ')  # blocking call: the event loop stalls until a line is entered
+            user_message_observation = UserMessageObservation(message)  # NOTE(review): task state is not set back to RUNNING on this path — confirm
+        else:
+            user_message_observation = await self._await_user_message_queue.get()  # filled by add_user_message()
+            self._await_user_message_queue.task_done()
+        return user_message_observation
+
     async def start_delegate(self, action: AgentDelegateAction):
         AgentCls: Type[Agent] = Agent.get_cls(action.agent)
         agent = AgentCls(llm=self.agent.llm)
@@ -198,7 +240,8 @@ class AgentController:
             return False
 
         logger.info(f'STEP {i}', extra={'msg_type': 'STEP'})
-        logger.info(self.state.plan.main_goal, extra={'msg_type': 'PLAN'})
+        if i == 0:
+            logger.info(self.state.plan.main_goal, extra={'msg_type': 'PLAN'})
         if self.state.num_of_chars > self.max_chars:
             raise MaxCharsExceedError(self.state.num_of_chars, self.max_chars)
 
@@ -223,6 +266,14 @@ class AgentController:
 
         await self._run_callbacks(action)
 
+        # whether to await for user messages
+        if isinstance(action, AgentTalkAction):
+            # await for the next user messages
+            user_message_observation = await self.wait_for_user_input()
+            logger.info(user_message_observation, extra={'msg_type': 'OBSERVATION'})
+            self.add_history(action, user_message_observation)
+            return False
+
         finished = isinstance(action, AgentFinishAction)
         if finished:
             self.state.outputs = action.outputs  # type: ignore[attr-defined]
diff --git a/opendevin/logger.py b/opendevin/logger.py
index 1e90f14577..709dcd2dc0 100644
--- a/opendevin/logger.py
+++ b/opendevin/logger.py
@@ -3,10 +3,11 @@ import os
 import sys
 import traceback
 from datetime import datetime
-from opendevin import config
 from typing import Literal, Mapping
+
 from termcolor import colored
 
+from opendevin import config
 from opendevin.schema.config import ConfigType
 
 DISABLE_COLOR_PRINTING = (
diff --git a/opendevin/main.py b/opendevin/main.py
index 686174e575..a97ab14477 100644
--- a/opendevin/main.py
+++ b/opendevin/main.py
@@ -3,8 +3,8 @@ import sys
 from typing import Type
 
 import agenthub  # noqa F401 (we import this to get the agents registered)
-from opendevin.config import args
 from opendevin.agent import Agent
+from opendevin.config import args
 from opendevin.controller import AgentController
 from opendevin.llm.llm import LLM
 
diff --git a/opendevin/observation/__init__.py b/opendevin/observation/__init__.py
index 2c4c29e5a9..174e90bc9c 100644
--- a/opendevin/observation/__init__.py
+++ b/opendevin/observation/__init__.py
@@ -1,11 +1,11 @@
-from .base import Observation, NullObservation
-from .run import CmdOutputObservation
+from .base import NullObservation, Observation
 from .browse import BrowserOutputObservation
-from .files import FileReadObservation, FileWriteObservation
-from .message import UserMessageObservation, AgentMessageObservation
-from .recall import AgentRecallObservation
 from .delegate import AgentDelegateObservation
 from .error import AgentErrorObservation
+from .files import FileReadObservation, FileWriteObservation
+from .message import AgentMessageObservation, UserMessageObservation
+from .recall import AgentRecallObservation
+from .run import CmdOutputObservation, IPythonRunCellObservation
 
 observations = (
     CmdOutputObservation,
@@ -40,6 +40,7 @@ __all__ = [
     'Observation',
     'NullObservation',
     'CmdOutputObservation',
+    'IPythonRunCellObservation',
     'BrowserOutputObservation',
     'FileReadObservation',
     'FileWriteObservation',
diff --git a/opendevin/observation/base.py b/opendevin/observation/base.py
index b1622848cf..078f5b289d 100644
--- a/opendevin/observation/base.py
+++ b/opendevin/observation/base.py
@@ -1,5 +1,6 @@
 import copy
 from dataclasses import dataclass
+
 from opendevin.schema import ObservationType
 
 
diff --git a/opendevin/observation/browse.py b/opendevin/observation/browse.py
index 81e7010e4f..4c7c21ec25 100644
--- a/opendevin/observation/browse.py
+++ b/opendevin/observation/browse.py
@@ -1,8 +1,9 @@
 from dataclasses import dataclass
 
-from .base import Observation
 from opendevin.schema import ObservationType
 
+from .base import Observation
+
 
 @dataclass
 class BrowserOutputObservation(Observation):
diff --git a/opendevin/observation/delegate.py b/opendevin/observation/delegate.py
index 5465fb5e2c..3b7b586320 100644
--- a/opendevin/observation/delegate.py
+++ b/opendevin/observation/delegate.py
@@ -1,8 +1,9 @@
 from dataclasses import dataclass
 
-from .base import Observation
 from opendevin.schema import ObservationType
 
+from .base import Observation
+
 
 @dataclass
 class AgentDelegateObservation(Observation):
diff --git a/opendevin/observation/error.py b/opendevin/observation/error.py
index 1b500dd8f3..ac47b4eafd 100644
--- a/opendevin/observation/error.py
+++ b/opendevin/observation/error.py
@@ -1,8 +1,9 @@
 from dataclasses import dataclass
 
-from .base import Observation
 from opendevin.schema import ObservationType
 
+from .base import Observation
+
 
 @dataclass
 class AgentErrorObservation(Observation):
diff --git a/opendevin/observation/files.py b/opendevin/observation/files.py
index c15ad5f08c..60112b8ef8 100644
--- a/opendevin/observation/files.py
+++ b/opendevin/observation/files.py
@@ -1,8 +1,9 @@
 from dataclasses import dataclass
 
-from .base import Observation
 from opendevin.schema import ObservationType
 
+from .base import Observation
+
 
 @dataclass
 class FileReadObservation(Observation):
diff --git a/opendevin/observation/message.py b/opendevin/observation/message.py
index 2848affbcc..3d82166caa 100644
--- a/opendevin/observation/message.py
+++ b/opendevin/observation/message.py
@@ -1,8 +1,9 @@
 from dataclasses import dataclass
 
-from .base import Observation
 from opendevin.schema import ObservationType
 
+from .base import Observation
+
 
 @dataclass
 class UserMessageObservation(Observation):
diff --git a/opendevin/observation/recall.py b/opendevin/observation/recall.py
index 126aee82e7..7fffbcdffc 100644
--- a/opendevin/observation/recall.py
+++ b/opendevin/observation/recall.py
@@ -1,9 +1,10 @@
 from dataclasses import dataclass
 from typing import List
 
-from .base import Observation
 from opendevin.schema import ObservationType
 
+from .base import Observation
+
 
 @dataclass
 class AgentRecallObservation(Observation):
diff --git a/opendevin/observation/run.py b/opendevin/observation/run.py
index 09293b848f..8de5fbac5c 100644
--- a/opendevin/observation/run.py
+++ b/opendevin/observation/run.py
@@ -1,8 +1,9 @@
 from dataclasses import dataclass
 
-from .base import Observation
 from opendevin.schema import ObservationType
 
+from .base import Observation
+
 
 @dataclass
 class CmdOutputObservation(Observation):
@@ -22,3 +23,21 @@ class CmdOutputObservation(Observation):
     @property
     def message(self) -> str:
         return f'Command `{self.command}` executed with exit code {self.exit_code}.'
+
+
+@dataclass
+class IPythonRunCellObservation(Observation):
+    """
+    This data class represents the output of an IPythonRunCellAction.
+    """
+
+    code: str  # the source that was run; IPythonRunCellAction.run passes its own `code` here
+    observation: str = ObservationType.RUN_IPYTHON
+
+    @property
+    def error(self) -> bool:
+        return False  # IPython cells do not return exit codes
+
+    @property
+    def message(self) -> str:
+        return 'Code executed in IPython cell.'
diff --git a/opendevin/plan.py b/opendevin/plan.py
index efdd3566c0..7c722a8acd 100644
--- a/opendevin/plan.py
+++ b/opendevin/plan.py
@@ -1,7 +1,7 @@
 from typing import List
 
-from opendevin.logger import opendevin_logger as logger
 from opendevin.exceptions import PlanInvalidStateError
+from opendevin.logger import opendevin_logger as logger
 
 OPEN_STATE = 'open'
 COMPLETED_STATE = 'completed'
diff --git a/opendevin/sandbox/__init__.py b/opendevin/sandbox/__init__.py
index 41611bb7e4..feedc6436d 100644
--- a/opendevin/sandbox/__init__.py
+++ b/opendevin/sandbox/__init__.py
@@ -1,8 +1,8 @@
-from .sandbox import Sandbox
-from .docker.ssh_box import DockerSSHBox
 from .docker.exec_box import DockerExecBox
 from .docker.local_box import LocalBox
+from .docker.ssh_box import DockerSSHBox
 from .e2b.sandbox import E2BBox
+from .sandbox import Sandbox
 
 __all__ = [
     'Sandbox',
diff --git a/opendevin/sandbox/docker/exec_box.py b/opendevin/sandbox/docker/exec_box.py
index fa48071a3d..c5cd1f5afc 100644
--- a/opendevin/sandbox/docker/exec_box.py
+++ b/opendevin/sandbox/docker/exec_box.py
@@ -2,22 +2,22 @@ import atexit
 import concurrent.futures
 import os
 import sys
+import tarfile
 import time
 import uuid
-import tarfile
-from glob import glob
 from collections import namedtuple
+from glob import glob
 from typing import Dict, List, Tuple
 
 import docker
 
 from opendevin import config
-from opendevin.logger import opendevin_logger as logger
-from opendevin.sandbox.sandbox import Sandbox
-from opendevin.sandbox.process import Process
-from opendevin.sandbox.docker.process import DockerProcess
-from opendevin.schema import ConfigType
 from opendevin.exceptions import SandboxInvalidBackgroundCommandError
+from opendevin.logger import opendevin_logger as logger
+from opendevin.sandbox.docker.process import DockerProcess
+from opendevin.sandbox.process import Process
+from opendevin.sandbox.sandbox import Sandbox
+from opendevin.schema import ConfigType
 
 InputType = namedtuple('InputType', ['content'])
 OutputType = namedtuple('OutputType', ['content'])
@@ -122,7 +122,10 @@ class DockerExecBox(Sandbox):
                     self.container.exec_run(
                         f'kill -9 {pid}', workdir=SANDBOX_WORKSPACE_DIR)
                 return -1, f'Command: "{cmd}" timed out'
-        return exit_code, logs.decode('utf-8').strip()
+        logs_out = logs.decode('utf-8')
+        if logs_out.endswith('\n'):
+            logs_out = logs_out[:-1]
+        return exit_code, logs_out
 
     def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False):
         # mkdir -p sandbox_dest if it doesn't exist
diff --git a/opendevin/sandbox/docker/local_box.py b/opendevin/sandbox/docker/local_box.py
index 44fa3e2b6f..2dcf1e923c 100644
--- a/opendevin/sandbox/docker/local_box.py
+++ b/opendevin/sandbox/docker/local_box.py
@@ -1,13 +1,14 @@
-import subprocess
 import atexit
 import os
+import subprocess
 import sys
-from typing import Tuple, Dict
-from opendevin.sandbox.sandbox import Sandbox
-from opendevin.sandbox.process import Process
-from opendevin.sandbox.docker.process import DockerProcess
-from opendevin.logger import opendevin_logger as logger
+from typing import Dict, Tuple
+
 from opendevin import config
+from opendevin.logger import opendevin_logger as logger
+from opendevin.sandbox.docker.process import DockerProcess
+from opendevin.sandbox.process import Process
+from opendevin.sandbox.sandbox import Sandbox
 from opendevin.schema.config import ConfigType
 
 # ===============================================================================
diff --git a/opendevin/sandbox/docker/ssh_box.py b/opendevin/sandbox/docker/ssh_box.py
index 8a6063cd40..71f46a7864 100644
--- a/opendevin/sandbox/docker/ssh_box.py
+++ b/opendevin/sandbox/docker/ssh_box.py
@@ -1,25 +1,25 @@
 import atexit
 import os
 import sys
+import tarfile
 import time
 import uuid
-import tarfile
-from glob import glob
 from collections import namedtuple
+from glob import glob
 from typing import Dict, List, Tuple, Union
 
 import docker
 from pexpect import pxssh
 
 from opendevin import config
+from opendevin.exceptions import SandboxInvalidBackgroundCommandError
 from opendevin.logger import opendevin_logger as logger
-from opendevin.sandbox.sandbox import Sandbox
-from opendevin.sandbox.process import Process
 from opendevin.sandbox.docker.process import DockerProcess
 from opendevin.sandbox.plugins import JupyterRequirement, SWEAgentCommandsRequirement
+from opendevin.sandbox.process import Process
+from opendevin.sandbox.sandbox import Sandbox
 from opendevin.schema import ConfigType
 from opendevin.utils import find_available_tcp_port
-from opendevin.exceptions import SandboxInvalidBackgroundCommandError
 
 InputType = namedtuple('InputType', ['content'])
 OutputType = namedtuple('OutputType', ['content'])
@@ -41,7 +41,6 @@ if SANDBOX_USER_ID := config.get(ConfigType.SANDBOX_USER_ID):
 elif hasattr(os, 'getuid'):
     USER_ID = os.getuid()
 
-
 class DockerSSHBox(Sandbox):
     instance_id: str
     container_image: str
@@ -62,6 +61,7 @@ class DockerSSHBox(Sandbox):
         timeout: int = 120,
         sid: str | None = None,
     ):
+        logger.info(f'SSHBox is running as {"opendevin" if RUN_AS_DEVIN else "root"} user with USER_ID={USER_ID} in the sandbox')
         # Initialize docker client. Throws an exception if Docker is not reachable.
         try:
             self.docker_client = docker.from_env()
@@ -150,8 +150,10 @@ class DockerSSHBox(Sandbox):
                 workdir=SANDBOX_WORKSPACE_DIR,
             )
             if exit_code != 0:
-                raise Exception(
-                    f'Failed to chown workspace directory for opendevin in sandbox: {logs}')
+                # This is not a fatal error, just a warning
+                logger.warning(
+                    f'Failed to chown workspace directory for opendevin in sandbox: {logs}. But this should be fine if the {SANDBOX_WORKSPACE_DIR=} is mounted by the app docker container.'
+                )
         else:
             exit_code, logs = self.container.exec_run(
                 # change password for root
@@ -169,14 +171,16 @@ class DockerSSHBox(Sandbox):
 
     def start_ssh_session(self):
         # start ssh session at the background
-        self.ssh = pxssh.pxssh(echo=False)
+        self.ssh = pxssh.pxssh()
         hostname = SSH_HOSTNAME
         if RUN_AS_DEVIN:
             username = 'opendevin'
         else:
             username = 'root'
         logger.info(
-            f"Connecting to {username}@{hostname} via ssh. If you encounter any issues, you can try `ssh -v -p {self._ssh_port} {username}@{hostname}` with the password '{self._ssh_password}' and report the issue on GitHub."
+            f"Connecting to {username}@{hostname} via ssh. "
+            f"If you encounter any issues, you can try `ssh -v -p {self._ssh_port} {username}@{hostname}` with the password '{self._ssh_password}' and report the issue on GitHub. "
+            f"If you started OpenDevin with `docker run`, you should try `ssh -v -p {self._ssh_port} {username}@localhost` with the password '{self._ssh_password}' on the host machine (where you started the container)."
         )
         self.ssh.login(hostname, username, self._ssh_password,
                        port=self._ssh_port)
@@ -211,14 +215,36 @@ class DockerSSHBox(Sandbox):
             # send a SIGINT to the process
             self.ssh.sendintr()
             self.ssh.prompt()
-            command_output = self.ssh.before.decode('utf-8').strip()
+            command_output = self.ssh.before.decode(
+                'utf-8').removeprefix(cmd).strip()  # drop only the echoed command
             return -1, f'Command: "{cmd}" timed out. Sending SIGINT to the process: {command_output}'
         command_output = self.ssh.before.decode('utf-8').strip()
 
+        # once out, make sure that we have *every* output, we while loop until we get an empty output
+        while True:
+            logger.debug('WAITING FOR .prompt()')
+            self.ssh.sendline('\n')
+            timeout_not_reached = self.ssh.prompt(timeout=1)
+            if not timeout_not_reached:
+                logger.debug('TIMEOUT REACHED')
+                break
+            logger.debug('WAITING FOR .before')
+            output = self.ssh.before.decode('utf-8').strip()
+            logger.debug(f'WAITING FOR END OF command output ({bool(output)}): {output}')
+            if output == '':
+                break
+            command_output += output
+        command_output = command_output.removeprefix(cmd).strip()  # lstrip(cmd) would strip a char set, not the echoed prefix
+
         # get the exit code
         self.ssh.sendline('echo $?')
-        self.ssh.prompt(timeout=10)
-        exit_code = int(self.ssh.before.decode('utf-8').strip())
+        self.ssh.prompt()
+        exit_code = self.ssh.before.decode('utf-8')
+        while not exit_code.startswith('echo $?'):
+            self.ssh.prompt()
+            exit_code = self.ssh.before.decode('utf-8')
+            logger.debug(f'WAITING FOR exit code: {exit_code}')
+        exit_code = int(exit_code.removeprefix('echo $?').strip())
         return exit_code, command_output
 
     def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False):
@@ -303,9 +329,10 @@ class DockerSSHBox(Sandbox):
             pass
 
     def get_working_directory(self):
-        self.ssh.sendline('pwd')
-        self.ssh.prompt(timeout=10)
-        return self.ssh.before.decode('utf-8').strip()
+        exit_code, result = self.execute('pwd')
+        if exit_code != 0:
+            raise Exception('Failed to get working directory')
+        return result.strip()
 
     def is_container_running(self):
         try:
@@ -349,7 +376,6 @@ class DockerSSHBox(Sandbox):
                 **network_kwargs,
                 working_dir=SANDBOX_WORKSPACE_DIR,
                 name=self.container_name,
-                hostname='opendevin_sandbox',
                 detach=True,
                 volumes={
                     mount_dir: {
diff --git a/opendevin/sandbox/e2b/sandbox.py b/opendevin/sandbox/e2b/sandbox.py
index ef7d53beb2..d95e1f6548 100644
--- a/opendevin/sandbox/e2b/sandbox.py
+++ b/opendevin/sandbox/e2b/sandbox.py
@@ -2,17 +2,18 @@ import os
 import tarfile
 from glob import glob
 from typing import Dict, Tuple
+
 from e2b import Sandbox as E2BSandbox
 from e2b.sandbox.exception import (
     TimeoutException,
 )
 
 from opendevin import config
-from opendevin.schema.config import ConfigType
 from opendevin.logger import opendevin_logger as logger
-from opendevin.sandbox.sandbox import Sandbox
 from opendevin.sandbox.e2b.process import E2BProcess
 from opendevin.sandbox.process import Process
+from opendevin.sandbox.sandbox import Sandbox
+from opendevin.schema.config import ConfigType
 
 
 class E2BBox(Sandbox):
diff --git a/opendevin/sandbox/plugins/__init__.py b/opendevin/sandbox/plugins/__init__.py
index 15053253a5..5ea90bd9ff 100644
--- a/opendevin/sandbox/plugins/__init__.py
+++ b/opendevin/sandbox/plugins/__init__.py
@@ -1,8 +1,7 @@
-from .mixin import PluginMixin
-from .requirement import PluginRequirement
-
 # Requirements
 from .jupyter import JupyterRequirement
+from .mixin import PluginMixin
+from .requirement import PluginRequirement
 from .swe_agent_commands import SWEAgentCommandsRequirement
 
 __all__ = ['PluginMixin', 'PluginRequirement', 'JupyterRequirement', 'SWEAgentCommandsRequirement']
diff --git a/opendevin/sandbox/plugins/jupyter/__init__.py b/opendevin/sandbox/plugins/jupyter/__init__.py
index a9c3214cf2..abedf467fe 100644
--- a/opendevin/sandbox/plugins/jupyter/__init__.py
+++ b/opendevin/sandbox/plugins/jupyter/__init__.py
@@ -1,5 +1,6 @@
 import os
 from dataclasses import dataclass
+
 from opendevin.sandbox.plugins.requirement import PluginRequirement
 
 
diff --git a/opendevin/sandbox/plugins/jupyter/execute_cli b/opendevin/sandbox/plugins/jupyter/execute_cli
index 3fca4c534f..0d697c1100 100755
--- a/opendevin/sandbox/plugins/jupyter/execute_cli
+++ b/opendevin/sandbox/plugins/jupyter/execute_cli
@@ -2,6 +2,7 @@
 import os
 import sys
 import time
+
 import requests
 
 # Read the Python code from STDIN
diff --git a/opendevin/sandbox/plugins/jupyter/execute_server b/opendevin/sandbox/plugins/jupyter/execute_server
index e5af086ea7..c7212e907a 100755
--- a/opendevin/sandbox/plugins/jupyter/execute_server
+++ b/opendevin/sandbox/plugins/jupyter/execute_server
@@ -1,17 +1,17 @@
 #!/usr/bin/env python3
 
+import asyncio
+import logging
 import os
 import re
-import asyncio
-import tornado
-import logging
-
-from tornado.escape import json_encode, json_decode, url_escape
-from tornado.websocket import websocket_connect
-from tornado.ioloop import PeriodicCallback
-from tornado.httpclient import AsyncHTTPClient, HTTPRequest
 from uuid import uuid4
 
+import tornado
+from tornado.escape import json_decode, json_encode, url_escape
+from tornado.httpclient import AsyncHTTPClient, HTTPRequest
+from tornado.ioloop import PeriodicCallback
+from tornado.websocket import websocket_connect
+
 logging.basicConfig(level=logging.INFO)
 
 
@@ -187,8 +187,7 @@ class JupyterKernel:
                     outputs.append(msg['content']['data']['text/plain'])
                     if 'image/png' in msg['content']['data']:
                         # use markdone to display image (in case of large image)
-                        # outputs.append(f"\n<img src=\'data:image/png;base64,{msg['content']['data']['image/png']}\'/>\n")
-                        outputs.append(f"![image](data:image/png;base64,{msg['content']['data']['image / png']})")
+                        outputs.append(f"\n![image](data:image/png;base64,{msg['content']['data']['image/png']})\n")
 
                 elif msg_type == 'execute_reply':
                     execution_done = True
diff --git a/opendevin/sandbox/plugins/mixin.py b/opendevin/sandbox/plugins/mixin.py
index af9f269727..9ca0b467df 100644
--- a/opendevin/sandbox/plugins/mixin.py
+++ b/opendevin/sandbox/plugins/mixin.py
@@ -1,5 +1,6 @@
 import os
 from typing import List, Protocol, Tuple
+
 from opendevin.logger import opendevin_logger as logger
 from opendevin.sandbox.plugins.requirement import PluginRequirement
 
diff --git a/opendevin/sandbox/plugins/swe_agent_commands/__init__.py b/opendevin/sandbox/plugins/swe_agent_commands/__init__.py
index 7bf4117384..465edf636a 100644
--- a/opendevin/sandbox/plugins/swe_agent_commands/__init__.py
+++ b/opendevin/sandbox/plugins/swe_agent_commands/__init__.py
@@ -1,8 +1,11 @@
 import os
-from typing import List
 from dataclasses import dataclass, field
+from typing import List
+
 from opendevin.sandbox.plugins.requirement import PluginRequirement
-from opendevin.sandbox.plugins.swe_agent_commands.parse_commands import parse_command_file
+from opendevin.sandbox.plugins.swe_agent_commands.parse_commands import (
+    parse_command_file,
+)
 
 
 def _resolve_to_cur_dir(filename):
diff --git a/opendevin/sandbox/sandbox.py b/opendevin/sandbox/sandbox.py
index fcbcfc5ab2..cf5e56432a 100644
--- a/opendevin/sandbox/sandbox.py
+++ b/opendevin/sandbox/sandbox.py
@@ -1,9 +1,8 @@
 from abc import ABC, abstractmethod
-from typing import Dict
-from typing import Tuple
+from typing import Dict, Tuple
 
-from opendevin.sandbox.process import Process
 from opendevin.sandbox.plugins.mixin import PluginMixin
+from opendevin.sandbox.process import Process
 
 
 class Sandbox(ABC, PluginMixin):
diff --git a/opendevin/schema/action.py b/opendevin/schema/action.py
index e8962fb6ce..51d21d4c83 100644
--- a/opendevin/schema/action.py
+++ b/opendevin/schema/action.py
@@ -10,8 +10,12 @@ class ActionTypeSchema(BaseModel):
     """Initializes the agent. Only sent by client.
     """
 
+    USER_MESSAGE: str = Field(default='user_message')
+    """Sends a message from the user. Only sent by the client.
+    """
+
     START: str = Field(default='start')
-    """Starts a new development task. Only sent by the client.
+    """Starts a new development task OR sends a chat message from the user. Only sent by the client.
     """
 
     READ: str = Field(default='read')
@@ -26,6 +30,10 @@ class ActionTypeSchema(BaseModel):
     """Runs a command.
     """
 
+    RUN_IPYTHON: str = Field(default='run_ipython')
+    """Runs an IPython cell.
+    """
+
     KILL: str = Field(default='kill')
     """Kills a background command.
     """
@@ -42,6 +50,10 @@ class ActionTypeSchema(BaseModel):
     """Allows the agent to make a plan, set a goal, or record thoughts
     """
 
+    TALK: str = Field(default='talk')
+    """Allows the agent to respond to the user.
+    """
+
     DELEGATE: str = Field(default='delegate')
     """Delegates a task to another agent.
     """
diff --git a/opendevin/schema/config.py b/opendevin/schema/config.py
index 713ea982b7..6570ec847a 100644
--- a/opendevin/schema/config.py
+++ b/opendevin/schema/config.py
@@ -15,6 +15,7 @@ class ConfigType(str, Enum):
     SANDBOX_CONTAINER_IMAGE = 'SANDBOX_CONTAINER_IMAGE'
     RUN_AS_DEVIN = 'RUN_AS_DEVIN'
     LLM_EMBEDDING_MODEL = 'LLM_EMBEDDING_MODEL'
+    LLM_EMBEDDING_BASE_URL = 'LLM_EMBEDDING_BASE_URL'
     LLM_EMBEDDING_DEPLOYMENT_NAME = 'LLM_EMBEDDING_DEPLOYMENT_NAME'
     LLM_API_VERSION = 'LLM_API_VERSION'
     LLM_NUM_RETRIES = 'LLM_NUM_RETRIES'
diff --git a/opendevin/schema/observation.py b/opendevin/schema/observation.py
index 6d4a1290fa..e800e3eb86 100644
--- a/opendevin/schema/observation.py
+++ b/opendevin/schema/observation.py
@@ -20,6 +20,10 @@ class ObservationTypeSchema(BaseModel):
     """The output of a command
     """
 
+    RUN_IPYTHON: str = Field(default='run_ipython')
+    """The output of an IPython cell.
+    """
+
     RECALL: str = Field(default='recall')
     """The result of a search
     """
diff --git a/opendevin/schema/task.py b/opendevin/schema/task.py
index 010af9b188..cde2eb7d04 100644
--- a/opendevin/schema/task.py
+++ b/opendevin/schema/task.py
@@ -10,6 +10,10 @@ class TaskState(str, Enum):
     """The task is running.
     """
 
+    AWAITING_USER_INPUT = 'awaiting_user_input'
+    """The task is awaiting user input.
+    """
+
     PAUSED = 'paused'
     """The task is paused.
     """
diff --git a/opendevin/server/agent/agent.py b/opendevin/server/agent/agent.py
index 2290fda407..ae384e5f70 100644
--- a/opendevin/server/agent/agent.py
+++ b/opendevin/server/agent/agent.py
@@ -94,6 +94,8 @@ class AgentUnit:
                 await self.create_controller(data)
             case ActionType.START:
                 await self.start_task(data)
+            case ActionType.USER_MESSAGE:
+                await self.send_user_message(data)
             case ActionType.CHANGE_TASK_STATE:
                 task_state_action = data.get('args', {}).get('task_state_action', None)
                 if task_state_action is None:
@@ -177,23 +179,27 @@ class AgentUnit:
         Args:
             start_event: The start event data.
         """
-        if 'task' not in start_event['args']:
-            await self.send_error('No task specified')
-            return
-        await self.send_message('Starting new task...')
         task = start_event['args']['task']
         if self.controller is None:
             await self.send_error('No agent started. Please wait a second...')
             return
         try:
-            if self.agent_task:
-                self.agent_task.cancel()
+            assert not self.agent_task, 'Agent task already running'
             self.agent_task = asyncio.create_task(
                 self.controller.start(task), name='agent start task loop'
             )
         except Exception as e:
             await self.send_error(f'Error during task loop: {e}')
 
+    async def send_user_message(self, data: dict):
+        """Forward the chat message in `data['args']['message']` to the controller; report an error if no agent task is running."""
+        if not self.agent_task or not self.controller:
+            await self.send_error('No agent started.')
+            return
+
+        user_message = UserMessageObservation(data['args']['message'])
+        await self.controller.add_user_message(user_message)
+
     async def set_task_state(self, new_state_action: TaskStateAction):
         """Sets the state of the agent task."""
         if self.controller is None:
diff --git a/opendevin/server/agent/manager.py b/opendevin/server/agent/manager.py
index 5584035587..5ae0fd6e21 100644
--- a/opendevin/server/agent/manager.py
+++ b/opendevin/server/agent/manager.py
@@ -1,7 +1,8 @@
 import atexit
 
-from opendevin.server.session import session_manager
 from opendevin.logger import opendevin_logger as logger
+from opendevin.server.session import session_manager
+
 from .agent import AgentUnit
 
 
diff --git a/opendevin/server/auth/auth.py b/opendevin/server/auth/auth.py
index 81b73cec1f..6429743be7 100644
--- a/opendevin/server/auth/auth.py
+++ b/opendevin/server/auth/auth.py
@@ -1,9 +1,11 @@
 import os
-import jwt
 from typing import Dict
-from opendevin.logger import opendevin_logger as logger
+
+import jwt
 from jwt.exceptions import InvalidTokenError
 
+from opendevin.logger import opendevin_logger as logger
+
 JWT_SECRET = os.getenv('JWT_SECRET', '5ecRe7')
 
 
diff --git a/opendevin/server/listen.py b/opendevin/server/listen.py
index 68d831138e..d6b9a87278 100644
--- a/opendevin/server/listen.py
+++ b/opendevin/server/listen.py
@@ -1,9 +1,10 @@
 import json
+import shutil
 import uuid
 from pathlib import Path
 
 import litellm
-from fastapi import Depends, FastAPI, Response, WebSocket, status
+from fastapi import Depends, FastAPI, Response, UploadFile, WebSocket, status
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, RedirectResponse
 from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
@@ -11,9 +12,9 @@ from fastapi.staticfiles import StaticFiles
 
 import agenthub  # noqa F401 (we import this to get the agents registered)
 from opendevin import config, files
-from opendevin.schema.config import ConfigType
 from opendevin.agent import Agent
 from opendevin.logger import opendevin_logger as logger
+from opendevin.schema.config import ConfigType
 from opendevin.server.agent import agent_manager
 from opendevin.server.auth import get_sid_from_token, sign_token
 from opendevin.server.session import message_stack, session_manager
@@ -137,6 +138,24 @@ def select_file(file: str):
     return {'code': content}
 
 
+@app.post('/api/upload-file')
+async def upload_file(file: UploadFile):
+    """Save an uploaded file inside the configured workspace base directory.
+    Returns the file name and location, or an HTTP 500 JSON error on any failure.
+    """
+    try:
+        workspace_root = config.get(ConfigType.WORKSPACE_BASE)
+        target_path = Path(workspace_root, file.filename)
+        # relative_to() raises if the resolved target is outside the workspace base.
+        target_path.resolve().relative_to(Path(workspace_root).resolve())
+        with open(target_path, 'wb') as out_file:
+            shutil.copyfileobj(file.file, out_file)
+    except Exception as e:
+        logger.error(f'Error saving file {file.filename}: {e}', exc_info=True)
+        return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={'error': f'Error saving file: {e}'})
+    return {'filename': file.filename, 'location': str(target_path)}
+
+
 @app.get('/api/plan')
 def get_plan(
     credentials: HTTPAuthorizationCredentials = Depends(security_scheme),
diff --git a/opendevin/server/session/manager.py b/opendevin/server/session/manager.py
index 1301cc0e94..09525edbd4 100644
--- a/opendevin/server/session/manager.py
+++ b/opendevin/server/session/manager.py
@@ -1,11 +1,12 @@
 import atexit
 import json
 import os
-from typing import Dict, Callable
+from typing import Callable, Dict
 
 from fastapi import WebSocket
 
 from opendevin.logger import opendevin_logger as logger
+
 from .msg_stack import message_stack
 from .session import Session
 
diff --git a/opendevin/server/session/msg_stack.py b/opendevin/server/session/msg_stack.py
index 22d2c6e45d..d362022a5e 100644
--- a/opendevin/server/session/msg_stack.py
+++ b/opendevin/server/session/msg_stack.py
@@ -1,12 +1,11 @@
-import os
-import json
 import atexit
+import json
+import os
 import uuid
 from typing import Dict, List
 
-from opendevin.schema.action import ActionType
 from opendevin.logger import opendevin_logger as logger
-
+from opendevin.schema.action import ActionType
 
 CACHE_DIR = os.getenv('CACHE_DIR', 'cache')
 MSG_CACHE_FILE = os.path.join(CACHE_DIR, 'messages.json')
diff --git a/opendevin/server/session/session.py b/opendevin/server/session/session.py
index 3f880dc063..19130e7815 100644
--- a/opendevin/server/session/session.py
+++ b/opendevin/server/session/session.py
@@ -1,9 +1,10 @@
 import time
-from typing import Dict, Callable
+from typing import Callable, Dict
 
 from fastapi import WebSocket, WebSocketDisconnect
 
 from opendevin.logger import opendevin_logger as logger
+
 from .msg_stack import message_stack
 
 DEL_DELT_SEC = 60 * 60 * 5
diff --git a/opendevin/state.py b/opendevin/state.py
index 7aa1e834cd..c3d02c9c72 100644
--- a/opendevin/state.py
+++ b/opendevin/state.py
@@ -1,15 +1,14 @@
 from dataclasses import dataclass, field
-from typing import List, Tuple, Dict
-
-from opendevin.plan import Plan
+from typing import Dict, List, Tuple
 
 from opendevin.action import (
     Action,
 )
 from opendevin.observation import (
-    Observation,
     CmdOutputObservation,
+    Observation,
 )
+from opendevin.plan import Plan
 
 
 @dataclass
diff --git a/package-lock.json b/package-lock.json
deleted file mode 100644
index d57e233291..0000000000
--- a/package-lock.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-  "name": "OpenDevin",
-  "lockfileVersion": 3,
-  "requires": true,
-  "packages": {}
-}
diff --git a/poetry.lock b/poetry.lock
index 4c75f826ee..c9076bd1b6 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4446,6 +4446,20 @@ files = [
 [package.extras]
 cli = ["click (>=5.0)"]
 
+[[package]]
+name = "python-multipart"
+version = "0.0.9"
+description = "A streaming multipart parser for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "python_multipart-0.0.9-py3-none-any.whl", hash = "sha256:97ca7b8ea7b05f977dc3849c3ba99d51689822fab725c3703af7c866a0c2b215"},
+    {file = "python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026"},
+]
+
+[package.extras]
+dev = ["atomicwrites (==1.4.1)", "attrs (==23.2.0)", "coverage (==7.4.1)", "hatch", "invoke (==2.2.0)", "more-itertools (==10.2.0)", "pbr (==6.0.0)", "pluggy (==1.4.0)", "py (==1.11.0)", "pytest (==8.0.0)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.2.0)", "pyyaml (==6.0.1)", "ruff (==0.2.1)"]
+
 [[package]]
 name = "pytz"
 version = "2024.1"
@@ -6329,4 +6343,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "62bc3f49304639795bc824fb0ceacbfaeb40c88ea9e5a90d13d4a3bf5219dbba"
+content-hash = "4e50c6a8427ab71919c0c5bcdfbb83afe71aa50966452b23bca62e0b69da4c30"
diff --git a/pyproject.toml b/pyproject.toml
index efc2a0ff3a..09a804286c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ playwright = "*"
 e2b = "^0.14.13"
 pexpect = "*"
 jinja2 = "^3.1.3"
+python-multipart = "*"
 
 [tool.poetry.group.llama-index.dependencies]
 llama-index = "*"
diff --git a/tests/integration/README.md b/tests/integration/README.md
index 3c13821fe0..21befd6925 100644
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -89,12 +89,15 @@ mkdir workspace
 poetry run python ./opendevin/main.py -i 10 -t "Write a shell script 'hello.sh' that prints 'hello'." -c "MonologueAgent" -d "./workspace"
 ```
 
+**NOTE**: If your agent decides to support user-agent interaction via natural language (e.g., you will be prompted to enter user responses when running the above `main.py` command), you should create a file named `tests/integration/mock/<AgentName>/<TestName>/user_responses.log` containing all the responses you provided to the agent, in order, delimited by newlines ('\n'). This will be used to mock the STDIN during testing.
+
 After running the above commands, you should be able to locate the real prompts
 and responses logged. The log folder follows `logs/llm/%y-%m-%d_%H-%M.log` format.
 
 Now, move all files under that folder to `tests/integration/mock/<AgentName>/<TestName>` folder. For example, moving all files from `logs/llm/24-04-23_21-55/` folder to
 `tests/integration/mock/MonologueAgent/test_write_simple_script` folder.
 
+
 That's it, you are good to go! When you launch an integration test, mock
 responses are loaded and used to replace a real LLM, so that we get
 deterministic and consistent behavior, and most importantly, without spending real
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 228b421e08..4e4946958d 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -1,5 +1,6 @@
-import re
+import io
 import os
+import re
 from functools import partial
 
 import pytest
@@ -53,6 +54,28 @@ def get_mock_response(test_name, messages):
                             return resp_file.read()
 
 
+def mock_user_response(*args, test_name, **kwargs):
+    """Return the canned user replies recorded for `test_name`.
+
+    The agent asks for user input with `input()` while `asyncio.run(main(task))`
+    is running. This helper loads
+    `<script_dir>/mock/<AGENT>/<test_name>/user_responses.log` so that the
+    caller can feed its content to the agent as STDIN.
+
+    Returns '' when that file does not exist; otherwise the file content with
+    trailing whitespace replaced by a single newline.
+    """
+    agent_name = os.environ.get('AGENT')
+    log_path = os.path.join(
+        script_dir, 'mock', agent_name, test_name, 'user_responses.log'
+    )
+    # No recorded log: report that there is nothing to feed to STDIN.
+    if not os.path.exists(log_path):
+        return ''
+    with open(log_path, 'r') as log_file:
+        return log_file.read().rstrip() + '\n'
+
+
 def mock_completion(*args, test_name, **kwargs):
     messages = kwargs['messages']
     message_str = ''
@@ -67,4 +90,11 @@ def mock_completion(*args, test_name, **kwargs):
 @pytest.fixture(autouse=True)
 def patch_completion(monkeypatch, request):
     test_name = request.node.name
+    # Mock LLM completion
     monkeypatch.setattr('opendevin.llm.llm.litellm_completion', partial(mock_completion, test_name=test_name))
+
+    # Mock user input (only for tests that have user_responses.log)
+    user_responses_str = mock_user_response(test_name=test_name)
+    if user_responses_str:
+        user_responses = io.StringIO(user_responses_str)
+        monkeypatch.setattr('sys.stdin', user_responses)
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_001.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_001.log
new file mode 100644
index 0000000000..1287212b6f
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_001.log
@@ -0,0 +1,215 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
+The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
+The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
+
+
+Apart from the standard bash commands, you can also use the following special commands in <execute_bash> environment:
+open <path> [<line_number>] - opens the file at the given path in the editor. If line_number is provided, the window will be move to include that line
+goto <line_number> - moves the window to show <line_number>
+scroll_down - moves the window down {WINDOW} lines
+scroll_up - moves the window down {WINDOW} lines
+create <filename> - creates and opens a new file with the given name
+search_dir <search_term> [<dir>] - searches for search_term in all files in dir. If dir is not provided, searches in the current directory
+search_file <search_term> [<file>] - searches for search_term in file. If file is not provided, searches in the current open file
+find_file <file_name> [<dir>] - finds all files with the given name in dir. If dir is not provided, searches in the current directory
+edit <start_line>:<end_line>
+<replacement_text>
+end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+The assistant's response should be concise, but do express their thoughts.
+Try to include one of <execute_ipython> or <execute_bash> in each of your responses, unless it is a direct answer to a question OR a message to the user.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+
+ASSISTANT:
+Sure! Let me write the Python code for starting a web server and save it to a file `app.py`:
+<execute_ipython>
+CODE='''
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+'''
+with open('app.py', 'w') as f:
+    f.write(CODE)
+</execute_ipython>
+
+USER:
+Observation:
+[Code executed successfully with no output]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+Observation:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+Observation:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_bash>
+open app.py
+</execute_bash>
+
+USER:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return str(numbers)
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. Let me do that for you:
+
+<execute_bash>
+edit 8:8 <<EOF
+    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+EOF
+</execute_bash>
+
+USER:
+Observation:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+
+ASSISTANT:
+The file has been updated. Let me run the Python file again with the new changes:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Free free to let me know if you have any further requests!
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Write a shell script 'hello.sh' that prints 'hello'.
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
new file mode 100644
index 0000000000..f4ad1e93d9
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
@@ -0,0 +1,229 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
+The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
+The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
+
+
+Apart from the standard bash commands, you can also use the following special commands in <execute_bash> environment:
+open <path> [<line_number>] - opens the file at the given path in the editor. If line_number is provided, the window will be move to include that line
+goto <line_number> - moves the window to show <line_number>
+scroll_down - moves the window down {WINDOW} lines
+scroll_up - moves the window down {WINDOW} lines
+create <filename> - creates and opens a new file with the given name
+search_dir <search_term> [<dir>] - searches for search_term in all files in dir. If dir is not provided, searches in the current directory
+search_file <search_term> [<file>] - searches for search_term in file. If file is not provided, searches in the current open file
+find_file <file_name> [<dir>] - finds all files with the given name in dir. If dir is not provided, searches in the current directory
+edit <start_line>:<end_line>
+<replacement_text>
+end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+The assistant's response should be concise, but do express their thoughts.
+Try to include one of <execute_ipython> or <execute_bash> in each of your responses, unless it is a direct answer to a question OR a message to the user.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+
+ASSISTANT:
+Sure! Let me write the Python code for starting a web server and save it to a file `app.py`:
+<execute_ipython>
+CODE='''
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+'''
+with open('app.py', 'w') as f:
+    f.write(CODE)
+</execute_ipython>
+
+USER:
+Observation:
+[Code executed successfully with no output]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+Observation:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+Observation:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_bash>
+open app.py
+</execute_bash>
+
+USER:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return str(numbers)
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. Let me do that for you:
+
+<execute_bash>
+edit 8:8 <<EOF
+    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+EOF
+</execute_bash>
+
+USER:
+Observation:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+
+ASSISTANT:
+The file has been updated. Let me run the Python file again with the new changes:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Free free to let me know if you have any further requests!
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Write a shell script 'hello.sh' that prints 'hello'.
+
+----------
+
+Let's start by creating the shell script `hello.sh` that prints "hello". I'll write the script and save it to a file.
+
+<execute_bash>
+echo "echo 'hello'" > hello.sh
+</execute_bash>
+
+----------
+
+OBSERVATION:
+
+[Command -1 finished with exit code 0]]
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_003.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_003.log
new file mode 100644
index 0000000000..eec72fab3c
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_003.log
@@ -0,0 +1,237 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
+The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
+The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
+
+
+Apart from the standard bash commands, you can also use the following special commands in <execute_bash> environment:
+open <path> [<line_number>] - opens the file at the given path in the editor. If line_number is provided, the window will be move to include that line
+goto <line_number> - moves the window to show <line_number>
+scroll_down - moves the window down {WINDOW} lines
+scroll_up - moves the window down {WINDOW} lines
+create <filename> - creates and opens a new file with the given name
+search_dir <search_term> [<dir>] - searches for search_term in all files in dir. If dir is not provided, searches in the current directory
+search_file <search_term> [<file>] - searches for search_term in file. If file is not provided, searches in the current open file
+find_file <file_name> [<dir>] - finds all files with the given name in dir. If dir is not provided, searches in the current directory
+edit <start_line>:<end_line>
+<replacement_text>
+end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+The assistant's response should be concise, but do express their thoughts.
+Try to include one of <execute_ipython> or <execute_bash> in each of your responses, unless it is a direct answer to a question OR a message to the user.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+
+ASSISTANT:
+Sure! Let me write the Python code for starting a web server and save it to a file `app.py`:
+<execute_ipython>
+CODE='''
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+'''
+with open('app.py', 'w') as f:
+    f.write(CODE)
+</execute_ipython>
+
+USER:
+Observation:
+[Code executed successfully with no output]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+Observation:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+Observation:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_bash>
+open app.py
+</execute_bash>
+
+USER:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return str(numbers)
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. Let me do that for you:
+
+<execute_bash>
+edit 8:8 <<EOF
+    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+EOF
+</execute_bash>
+
+USER:
+Observation:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+
+ASSISTANT:
+The file has been updated. Let me run the Python file again with the new changes:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Free free to let me know if you have any further requests!
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Write a shell script 'hello.sh' that prints 'hello'.
+
+----------
+
+Let's start by creating the shell script `hello.sh` that prints "hello". I'll write the script and save it to a file.
+
+<execute_bash>
+echo "echo 'hello'" > hello.sh
+</execute_bash>
+
+----------
+
+OBSERVATION:
+
+[Command -1 finished with exit code 0]]
+
+----------
+
+The shell script `hello.sh` has been successfully created. Would you like to run the script to see the output, or is there anything else you'd like to do with it?
+
+----------
+
+can you run it to make sure it is working?
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_004.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_004.log
new file mode 100644
index 0000000000..040ec399fb
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_004.log
@@ -0,0 +1,251 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
+The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
+
+
+Apart from the standard bash commands, you can also use the following special commands in <execute_bash> environment:
+open <path> [<line_number>] - opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line
+goto <line_number> - moves the window to show <line_number>
+scroll_down - moves the window down {WINDOW} lines
+scroll_up - moves the window up {WINDOW} lines
+create <filename> - creates and opens a new file with the given name
+search_dir <search_term> [<dir>] - searches for search_term in all files in dir. If dir is not provided, searches in the current directory
+search_file <search_term> [<file>] - searches for search_term in file. If file is not provided, searches in the current open file
+find_file <file_name> [<dir>] - finds all files with the given name in dir. If dir is not provided, searches in the current directory
+edit <start_line>:<end_line>
+<replacement_text>
+end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+The assistant's response should be concise, but do express their thoughts.
+Try to include one of <execute_ipython> or <execute_bash> in each of your responses, unless it is a direct answer to a question OR a message to the user.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+
+ASSISTANT:
+Sure! Let me write the Python code for starting a web server and save it to a file `app.py`:
+<execute_ipython>
+CODE='''
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+'''
+with open('app.py', 'w') as f:
+    f.write(CODE)
+</execute_ipython>
+
+USER:
+Observation:
+[Code executed successfully with no output]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+Observation:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+Observation:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_bash>
+open app.py
+</execute_bash>
+
+USER:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return str(numbers)
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. Let me do that for you:
+
+<execute_bash>
+edit 8:8 <<EOF
+    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+EOF
+</execute_bash>
+
+USER:
+Observation:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+
+ASSISTANT:
+The file has been updated. Let me run the Python file again with the new changes:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Feel free to let me know if you have any further requests!
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Write a shell script 'hello.sh' that prints 'hello'.
+
+----------
+
+Let's start by creating the shell script `hello.sh` that prints "hello". I'll write the script and save it to a file.
+
+<execute_bash>
+echo "echo 'hello'" > hello.sh
+</execute_bash>
+
+----------
+
+OBSERVATION:
+
+[Command -1 finished with exit code 0]]
+
+----------
+
+The shell script `hello.sh` has been successfully created. Would you like to run the script to see the output, or is there anything else you'd like to do with it?
+
+----------
+
+can you run it to make sure it is working?
+
+----------
+
+Sure, let's run the script `hello.sh` to ensure it's working correctly.
+
+<execute_bash>
+bash hello.sh
+</execute_bash>
+
+----------
+
+OBSERVATION:
+hello
+[Command -1 finished with exit code 0]]
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_005.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_005.log
new file mode 100644
index 0000000000..deed325508
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_005.log
@@ -0,0 +1,259 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
+For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
+The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
+
+
+Apart from the standard bash commands, you can also use the following special commands in <execute_bash> environment:
+open <path> [<line_number>] - opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line
+goto <line_number> - moves the window to show <line_number>
+scroll_down - moves the window down {WINDOW} lines
+scroll_up - moves the window up {WINDOW} lines
+create <filename> - creates and opens a new file with the given name
+search_dir <search_term> [<dir>] - searches for search_term in all files in dir. If dir is not provided, searches in the current directory
+search_file <search_term> [<file>] - searches for search_term in file. If file is not provided, searches in the current open file
+find_file <file_name> [<dir>] - finds all files with the given name in dir. If dir is not provided, searches in the current directory
+edit <start_line>:<end_line>
+<replacement_text>
+end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+The assistant's response should be concise, but do express their thoughts.
+Try to include one of <execute_ipython> or <execute_bash> in each of your responses, unless it is a direct answer to a question OR a message to the user.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
+
+ASSISTANT:
+Sure! Let me write the Python code for starting a web server and save it to a file `app.py`:
+<execute_ipython>
+CODE='''
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+'''
+with open('app.py', 'w') as f:
+    f.write(CODE)
+</execute_ipython>
+
+USER:
+Observation:
+[Code executed successfully with no output]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+Observation:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+Observation:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+Observation:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_bash>
+open app.py
+</execute_bash>
+
+USER:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return str(numbers)
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. Let me do that for you:
+
+<execute_bash>
+edit 8:8 <<EOF
+    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+EOF
+</execute_bash>
+
+USER:
+Observation:
+[File: /workspace/app.py (11 lines total)]
+1:
+2:from flask import Flask
+3:app = Flask(__name__)
+4:
+5:@app.route('/')
+6:def index():
+7:    numbers = list(range(1, 11))
+8:    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+9:
+10:if __name__ == '__main__':
+11:    app.run(port=5000)
+File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+
+ASSISTANT:
+The file has been updated. Let me run the Python file again with the new changes:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Feel free to let me know if you have any further requests!
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Write a shell script 'hello.sh' that prints 'hello'.
+
+----------
+
+Let's start by creating the shell script `hello.sh` that prints "hello". I'll write the script and save it to a file.
+
+<execute_bash>
+echo "echo 'hello'" > hello.sh
+</execute_bash>
+
+----------
+
+OBSERVATION:
+
+[Command -1 finished with exit code 0]]
+
+----------
+
+The shell script `hello.sh` has been successfully created. Would you like to run the script to see the output, or is there anything else you'd like to do with it?
+
+----------
+
+can you run it to make sure it is working?
+
+----------
+
+Sure, let's run the script `hello.sh` to ensure it's working correctly.
+
+<execute_bash>
+bash hello.sh
+</execute_bash>
+
+----------
+
+OBSERVATION:
+hello
+[Command -1 finished with exit code 0]]
+
+----------
+
+The script `hello.sh` is working correctly as it printed "hello". If you have any more tasks or need further modifications, feel free to let me know!
+
+----------
+
+can you run it to make sure it is working?
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/response_001.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_001.log
new file mode 100644
index 0000000000..9a36dce55f
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_001.log
@@ -0,0 +1,4 @@
+Let's start by creating the shell script `hello.sh` that prints "hello". I'll write the script and save it to a file.
+
+<execute_bash>
+echo "echo 'hello'" > hello.sh
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/response_002.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_002.log
new file mode 100644
index 0000000000..f7cb2b8edc
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_002.log
@@ -0,0 +1 @@
+The shell script `hello.sh` has been successfully created. Would you like to run the script to see the output, or is there anything else you'd like to do with it?
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/response_003.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_003.log
new file mode 100644
index 0000000000..c146b696e4
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_003.log
@@ -0,0 +1,4 @@
+Sure, let's run the script `hello.sh` to ensure it's working correctly.
+
+<execute_bash>
+bash hello.sh
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/response_004.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_004.log
new file mode 100644
index 0000000000..c113aa6238
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_004.log
@@ -0,0 +1 @@
+The script `hello.sh` is working correctly as it printed "hello". If you have any more tasks or need further modifications, feel free to let me know!
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/response_005.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_005.log
new file mode 100644
index 0000000000..e8ff14a4b9
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/response_005.log
@@ -0,0 +1 @@
+The script `hello.sh` was just run and it successfully printed "hello". If you need any further assistance or have another task in mind, please let me know!
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/user_responses.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/user_responses.log
new file mode 100644
index 0000000000..9e92a50a7d
--- /dev/null
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/user_responses.log
@@ -0,0 +1,2 @@
+can you run it to make sure it is working?
+/exit
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_001.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_001.log
index 14a75d24e9..00848e7077 100644
--- a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_001.log
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_001.log
@@ -92,7 +92,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "recall",
     "args": {
-      "query": "what it is I want to do"
+      "query": "what it is I want to do",
+      "thought": ""
     }
   },
   {
@@ -119,7 +120,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "echo \"hello world\"",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -144,7 +146,7 @@ This is your internal monologue, in JSON format:
       "content": "echo \"console.log('hello world')\"",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -163,7 +165,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "node test.js",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -193,7 +196,7 @@ This is your internal monologue, in JSON format:
       "path": "test.js",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -224,7 +227,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "browse",
     "args": {
-      "url": "google.com"
+      "url": "google.com",
+      "thought": ""
     }
   },
   {
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_002.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_002.log
index 1b29542bdb..b95b7275b1 100644
--- a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_002.log
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_002.log
@@ -92,7 +92,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "recall",
     "args": {
-      "query": "what it is I want to do"
+      "query": "what it is I want to do",
+      "thought": ""
     }
   },
   {
@@ -119,7 +120,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "echo \"hello world\"",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -144,7 +146,7 @@ This is your internal monologue, in JSON format:
       "content": "echo \"console.log('hello world')\"",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -163,7 +165,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "node test.js",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -193,7 +196,7 @@ This is your internal monologue, in JSON format:
       "path": "test.js",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -224,7 +227,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "browse",
     "args": {
-      "url": "google.com"
+      "url": "google.com",
+      "thought": ""
     }
   },
   {
@@ -317,7 +321,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "ls",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_003.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_003.log
index 3b2f94b6dd..1f75f642ad 100644
--- a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_003.log
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_003.log
@@ -92,7 +92,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "recall",
     "args": {
-      "query": "what it is I want to do"
+      "query": "what it is I want to do",
+      "thought": ""
     }
   },
   {
@@ -119,7 +120,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "echo \"hello world\"",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -144,7 +146,7 @@ This is your internal monologue, in JSON format:
       "content": "echo \"console.log('hello world')\"",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -163,7 +165,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "node test.js",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -193,7 +196,7 @@ This is your internal monologue, in JSON format:
       "path": "test.js",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -224,7 +227,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "browse",
     "args": {
-      "url": "google.com"
+      "url": "google.com",
+      "thought": ""
     }
   },
   {
@@ -317,7 +321,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "ls",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -332,7 +337,7 @@ This is your internal monologue, in JSON format:
   {
     "action": "think",
     "args": {
-      "thought": "Before I create the 'hello.sh' file, I need to think about the content. It should contain a shebang line to specify the shell interpreter and the command to print 'hello'."
+      "thought": "Since there are no existing files listed by the 'ls' command, I should create the 'hello.sh' script now."
     }
   },
   {
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_004.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_004.log
index 644be27820..b37d8b7b85 100644
--- a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_004.log
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_004.log
@@ -92,7 +92,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "recall",
     "args": {
-      "query": "what it is I want to do"
+      "query": "what it is I want to do",
+      "thought": ""
     }
   },
   {
@@ -119,7 +120,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "echo \"hello world\"",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -144,7 +146,7 @@ This is your internal monologue, in JSON format:
       "content": "echo \"console.log('hello world')\"",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -163,7 +165,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "node test.js",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -193,7 +196,7 @@ This is your internal monologue, in JSON format:
       "path": "test.js",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -224,7 +227,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "browse",
     "args": {
-      "url": "google.com"
+      "url": "google.com",
+      "thought": ""
     }
   },
   {
@@ -317,7 +321,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "ls",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -332,7 +337,7 @@ This is your internal monologue, in JSON format:
   {
     "action": "think",
     "args": {
-      "thought": "Before I create the 'hello.sh' file, I need to think about the content. It should contain a shebang line to specify the shell interpreter and the command to print 'hello'."
+      "thought": "Since there are no existing files listed by the 'ls' command, I should create the 'hello.sh' script now."
     }
   },
   {
@@ -347,7 +352,7 @@ This is your internal monologue, in JSON format:
       "content": "#!/bin/bash\necho 'hello'",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_005.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_005.log
index d3b2cb56e8..43f06a3d8a 100644
--- a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_005.log
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_005.log
@@ -92,7 +92,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "recall",
     "args": {
-      "query": "what it is I want to do"
+      "query": "what it is I want to do",
+      "thought": ""
     }
   },
   {
@@ -119,7 +120,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "echo \"hello world\"",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -144,7 +146,7 @@ This is your internal monologue, in JSON format:
       "content": "echo \"console.log('hello world')\"",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -163,7 +165,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "node test.js",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -193,7 +196,7 @@ This is your internal monologue, in JSON format:
       "path": "test.js",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -224,7 +227,8 @@ This is your internal monologue, in JSON format:
   {
     "action": "browse",
     "args": {
-      "url": "google.com"
+      "url": "google.com",
+      "thought": ""
     }
   },
   {
@@ -317,7 +321,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "ls",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
@@ -332,7 +337,7 @@ This is your internal monologue, in JSON format:
   {
     "action": "think",
     "args": {
-      "thought": "Before I create the 'hello.sh' file, I need to think about the content. It should contain a shebang line to specify the shell interpreter and the command to print 'hello'."
+      "thought": "Since there are no existing files listed by the 'ls' command, I should create the 'hello.sh' script now."
     }
   },
   {
@@ -347,7 +352,7 @@ This is your internal monologue, in JSON format:
       "content": "#!/bin/bash\necho 'hello'",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -361,7 +366,8 @@ This is your internal monologue, in JSON format:
     "action": "run",
     "args": {
       "command": "bash hello.sh",
-      "background": false
+      "background": false,
+      "thought": ""
     }
   },
   {
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_006.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_006.log
new file mode 100644
index 0000000000..d29d4c4358
--- /dev/null
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/prompt_006.log
@@ -0,0 +1,403 @@
+
+
+----------
+
+
+You're a thoughtful robot. Your main task is this:
+Write a shell script 'hello.sh' that prints 'hello'.
+
+Don't expand the scope of your task--just complete it as written.
+
+This is your internal monologue, in JSON format:
+
+[
+  {
+    "action": "think",
+    "args": {
+      "thought": "I exist!"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Hmm...looks like I can type in a command line prompt"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Looks like I have a web browser too!"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Here's what I want to do: Write a shell script 'hello.sh' that prints 'hello'."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "How am I going to get there though?"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "It seems like I have some kind of short term memory."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Each of my thoughts seems to be stored in a JSON array."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "It seems whatever I say next will be added as an object to the list."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "But no one has perfect short-term memory. My list of thoughts will be summarized and condensed over time, losing information in the process."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Fortunately I have long term memory!"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "I can just perform a recall action, followed by the thing I want to remember. And then related thoughts just spill out!"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Sometimes they're random thoughts that don't really have to do with what I wanted to remember. But usually they're exactly what I need!"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Let's try it out!"
+    }
+  },
+  {
+    "action": "recall",
+    "args": {
+      "query": "what it is I want to do",
+      "thought": ""
+    }
+  },
+  {
+    "observation": "recall",
+    "content": "Here's what I want to do: Write a shell script 'hello.sh' that prints 'hello'.",
+    "extras": {
+      "memories": [],
+      "role": "assistant"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "How am I going to get there though?"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Neat! And it looks like it's easy for me to use the command line too! I just have to perform a run action and include the command I want to run in the command argument. The command output just jumps into my head!"
+    }
+  },
+  {
+    "action": "run",
+    "args": {
+      "command": "echo \"hello world\"",
+      "background": false,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "run",
+    "content": "hello world",
+    "extras": {
+      "command_id": 0,
+      "command": "",
+      "exit_code": 0
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Cool! I bet I can write files too using the write action."
+    }
+  },
+  {
+    "action": "write",
+    "args": {
+      "path": "test.js",
+      "content": "echo \"console.log('hello world')\"",
+      "start": 0,
+      "end": -1,
+      "thought": ""
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": ""
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "I just created test.js. I'll try and run it now."
+    }
+  },
+  {
+    "action": "run",
+    "args": {
+      "command": "node test.js",
+      "background": false,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "run",
+    "content": "hello world",
+    "extras": {
+      "command_id": 0,
+      "command": "",
+      "exit_code": 0
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "It works!"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "I'm going to try reading it now using the read action."
+    }
+  },
+  {
+    "action": "read",
+    "args": {
+      "path": "test.js",
+      "start": 0,
+      "end": -1,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "read",
+    "content": "console.log('hello world')",
+    "extras": {
+      "path": ""
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Nice! I can read files too!"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "And if I want to use the browser, I just need to use the browse action and include the url I want to visit in the url argument"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Let's try that..."
+    }
+  },
+  {
+    "action": "browse",
+    "args": {
+      "url": "google.com",
+      "thought": ""
+    }
+  },
+  {
+    "observation": "browse",
+    "content": "<form><input type=\"text\"></input><button type=\"submit\"></button></form>",
+    "extras": {
+      "url": "",
+      "status_code": 200,
+      "error": false
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "I can browse the web too!"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "And once I have completed my task, I can use the finish action to stop working."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "But I should only use the finish action when I'm absolutely certain that I've completed my task and have tested my work."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Very cool. Now to accomplish my task."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "I'll need a strategy. And as I make progress, I'll need to keep refining that strategy. I'll need to set goals, and break them into sub-goals."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "In between actions, I must always take some time to think, strategize, and set new goals. I should never take two actions in a row."
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "OK so my task is to Write a shell script 'hello.sh' that prints 'hello'.. I haven't made any progress yet. Where should I start?"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "It seems like there might be an existing project here. I should probably start by running `ls` to see what's here."
+    }
+  },
+  {
+    "action": "run",
+    "args": {
+      "command": "ls",
+      "background": false,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "run",
+    "content": "",
+    "extras": {
+      "command_id": -1,
+      "command": "ls",
+      "exit_code": 0
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Since there are no existing files listed by the 'ls' command, I should create the 'hello.sh' script now."
+    }
+  },
+  {
+    "observation": "null",
+    "content": "",
+    "extras": {}
+  },
+  {
+    "action": "write",
+    "args": {
+      "path": "hello.sh",
+      "content": "#!/bin/bash\necho 'hello'",
+      "start": 0,
+      "end": -1,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "write",
+    "content": "",
+    "extras": {
+      "path": "hello.sh"
+    }
+  },
+  {
+    "action": "run",
+    "args": {
+      "command": "bash hello.sh",
+      "background": false,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "run",
+    "content": "hello",
+    "extras": {
+      "command_id": -1,
+      "command": "bash hello.sh",
+      "exit_code": 0
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "The script 'hello.sh' has been written and tested successfully. It prints 'hello' as required. I should now finalize my task."
+    }
+  },
+  {
+    "observation": "null",
+    "content": "",
+    "extras": {}
+  }
+]
+
+
+Your most recent thought is at the bottom of that monologue. Continue your train of thought.
+What is your next thought or action? Your response must be in JSON format.
+It must be an object, and it must contain two fields:
+* `action`, which is one of the actions below
+* `args`, which is a map of key-value pairs, specifying the arguments for that action
+
+Here are the possible actions:
+* `read` - reads the content of a file. Arguments:
+  * `path` - the path of the file to read
+* `write` - writes the content to a file. Arguments:
+  * `path` - the path of the file to write
+  * `content` - the content to write to the file
+* `run` - runs a command. Arguments:
+  * `command` - the command to run
+  * `background` - if true, run the command in the background, so that other commands can be run concurrently. Useful for e.g. starting a server. You won't be able to see the logs. You don't need to end the command with `&`, just set this to true.
+* `kill` - kills a background command
+  * `id` - the ID of the background command to kill
+* `browse` - opens a web page. Arguments:
+  * `url` - the URL to open
+* `recall` - recalls a past memory. Arguments:
+  * `query` - the query to search for
+* `think` - make a plan, set a goal, or record your thoughts. Arguments:
+  * `thought` - the thought to record
+* `finish` - if you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.
+
+
+
+You MUST take time to think in between read, write, run, browse, and recall actions.
+You should never act twice in a row without thinking. But if your last several
+actions are all "think" actions, you should consider taking a different action.
+
+Notes:
+* your environment is Debian Linux. You can install software with `apt`
+* your working directory will not change, even if you run `cd`. All commands will be run in the `/workspace` directory.
+* don't run interactive commands, or commands that don't return (e.g. `node server.js`). You may run commands in the background (e.g. `node server.js &`)
+
+What is your next thought or action? Again, you must reply with JSON, and only with JSON.
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/response_002.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/response_002.log
index bd27926940..791926155e 100644
--- a/tests/integration/mock/MonologueAgent/test_write_simple_script/response_002.log
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/response_002.log
@@ -1,6 +1,6 @@
 {
   "action": "think",
   "args": {
-    "thought": "Before I create the 'hello.sh' file, I need to think about the content. It should contain a shebang line to specify the shell interpreter and the command to print 'hello'."
+    "thought": "Since there are no existing files listed by the 'ls' command, I should create the 'hello.sh' script now."
   }
 }
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/response_004.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/response_004.log
index 7055344e27..9cc89f2a5b 100644
--- a/tests/integration/mock/MonologueAgent/test_write_simple_script/response_004.log
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/response_004.log
@@ -1,3 +1,4 @@
+```json
 {
   "action": "run",
   "args": {
@@ -5,3 +6,4 @@
     "background": false
   }
 }
+```
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/response_005.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/response_005.log
index c6819f9911..18d32603ec 100644
--- a/tests/integration/mock/MonologueAgent/test_write_simple_script/response_005.log
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/response_005.log
@@ -1,4 +1,8 @@
+```json
 {
-  "action": "finish",
-  "args": {}
+  "action": "think",
+  "args": {
+    "thought": "The script 'hello.sh' has been written and tested successfully. It prints 'hello' as required. I should now finalize my task."
+  }
 }
+```
diff --git a/tests/integration/mock/MonologueAgent/test_write_simple_script/response_006.log b/tests/integration/mock/MonologueAgent/test_write_simple_script/response_006.log
new file mode 100644
index 0000000000..c6819f9911
--- /dev/null
+++ b/tests/integration/mock/MonologueAgent/test_write_simple_script/response_006.log
@@ -0,0 +1,4 @@
+{
+  "action": "finish",
+  "args": {}
+}
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_002.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_002.log
index 717229bfc3..81f0b95cea 100644
--- a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_002.log
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_002.log
@@ -58,7 +58,8 @@ ten actions--more happened before that.
     "action": "modify_task",
     "args": {
       "id": "0",
-      "state": "in_progress"
+      "state": "in_progress",
+      "thought": ""
     }
   }
 ]
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_003.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_003.log
index 0f11b679c8..a6e7ef7520 100644
--- a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_003.log
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_003.log
@@ -58,24 +58,14 @@ ten actions--more happened before that.
     "action": "modify_task",
     "args": {
       "id": "0",
-      "state": "in_progress"
+      "state": "in_progress",
+      "thought": ""
     }
   },
   {
-    "action": "write",
+    "action": "think",
     "args": {
-      "path": "hello.sh",
-      "content": "#!/bin/bash\necho 'hello'",
-      "start": 0,
-      "end": -1,
-      "thoughts": ""
-    }
-  },
-  {
-    "observation": "write",
-    "content": "",
-    "extras": {
-      "path": "hello.sh"
+      "thought": "I need to write a shell script called 'hello.sh' that contains the necessary command to print 'hello'. This will likely involve using echo or printf to output the text to the console when the script is run. I should start by creating and writing the content to the file."
     }
   }
 ]
@@ -119,4 +109,4 @@ actions are all `think` actions, you should consider taking a different action.
 
 What is your next thought or action? Again, you must reply with JSON, and only with JSON.
 
-You just changed a file. You should think about how it affects your plan.
+Look at your last thought in the history above. What does it suggest? Don't think anymore--take action.
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_004.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_004.log
index 7f0d9c655c..e51d7afeca 100644
--- a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_004.log
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_004.log
@@ -58,7 +58,14 @@ ten actions--more happened before that.
     "action": "modify_task",
     "args": {
       "id": "0",
-      "state": "in_progress"
+      "state": "in_progress",
+      "thought": ""
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "I need to write a shell script called 'hello.sh' that contains the necessary command to print 'hello'. This will likely involve using echo or printf to output the text to the console when the script is run. I should start by creating and writing the content to the file."
     }
   },
   {
@@ -68,7 +75,7 @@ ten actions--more happened before that.
       "content": "#!/bin/bash\necho 'hello'",
       "start": 0,
       "end": -1,
-      "thoughts": ""
+      "thought": ""
     }
   },
   {
@@ -77,22 +84,6 @@ ten actions--more happened before that.
     "extras": {
       "path": "hello.sh"
     }
-  },
-  {
-    "action": "run",
-    "args": {
-      "command": "bash hello.sh",
-      "background": false
-    }
-  },
-  {
-    "observation": "run",
-    "content": "hello",
-    "extras": {
-      "command_id": -1,
-      "command": "bash hello.sh",
-      "exit_code": 0
-    }
   }
 ]
 
@@ -135,4 +126,4 @@ actions are all `think` actions, you should consider taking a different action.
 
 What is your next thought or action? Again, you must reply with JSON, and only with JSON.
 
-You should think about the command you just ran, what output it gave, and how that affects your plan.
+You just changed a file. You should think about how it affects your plan.
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_005.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_005.log
new file mode 100644
index 0000000000..2ce5998c09
--- /dev/null
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_005.log
@@ -0,0 +1,135 @@
+
+
+----------
+
+
+# Task
+You're a diligent software engineer AI. You can't see, draw, or interact with a
+browser, but you can read and write files, and you can run commands, and you can think.
+
+You've been given the following task:
+
+Write a shell script 'hello.sh' that prints 'hello'.
+
+## Plan
+As you complete this task, you're building a plan and keeping
+track of your progress. Here's a JSON representation of your plan:
+
+{
+  "id": "0",
+  "goal": "Write a shell script 'hello.sh' that prints 'hello'.",
+  "state": "in_progress",
+  "subtasks": []
+}
+
+
+You're currently working on this task:
+Write a shell script 'hello.sh' that prints 'hello'..
+If it's not achievable AND verifiable with a SINGLE action, you MUST break it down into subtasks NOW.
+
+You're responsible for managing this plan and the status of tasks in
+it, by using the `add_task` and `modify_task` actions described below.
+
+If the History below contradicts the state of any of these tasks, you
+MUST modify the task using the `modify_task` action described below.
+
+Be sure NOT to duplicate any tasks. Do NOT use the `add_task` action for
+a task that's already represented. Every task must be represented only once.
+
+Tasks that are sequential MUST be siblings. They must be added in order
+to their parent task.
+
+If you mark a task as 'completed', 'verified', or 'abandoned',
+all non-abandoned subtasks will be marked the same way.
+So before closing a task this way, you MUST not only be sure that it has
+been completed successfully--you must ALSO be sure that all its subtasks
+are ready to be marked the same way.
+
+If, and only if, ALL tasks have already been marked verified,
+you MUST respond with the `finish` action.
+
+## History
+Here is a recent history of actions you've taken in service of this plan,
+as well as observations you've made. This only includes the MOST RECENT
+ten actions--more happened before that.
+
+[
+  {
+    "action": "modify_task",
+    "args": {
+      "id": "0",
+      "state": "in_progress",
+      "thought": ""
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "I need to write a shell script called 'hello.sh' that contains the necessary command to print 'hello'. This will likely involve using echo or printf to output the text to the console when the script is run. I should start by creating and writing the content to the file."
+    }
+  },
+  {
+    "action": "write",
+    "args": {
+      "path": "hello.sh",
+      "content": "#!/bin/bash\necho 'hello'",
+      "start": 0,
+      "end": -1,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "write",
+    "content": "",
+    "extras": {
+      "path": "hello.sh"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Now that the script 'hello.sh' has been written and contains the appropriate command to print 'hello', the next logical step is to ensure it works as expected. I should proceed to run the script to verify its output."
+    }
+  }
+]
+
+
+Your most recent action is at the bottom of that history.
+
+## Action
+What is your next thought or action? Your response must be in JSON format.
+
+It must be an object, and it must contain two fields:
+* `action`, which is one of the actions below
+* `args`, which is a map of key-value pairs, specifying the arguments for that action
+
+* `read` - reads the content of a file. Arguments:
+  * `path` - the path of the file to read
+* `write` - writes the content to a file. Arguments:
+  * `path` - the path of the file to write
+  * `content` - the content to write to the file
+* `run` - runs a command on the command line in a Linux shell. Arguments:
+  * `command` - the command to run
+  * `background` - if true, run the command in the background, so that other commands can be run concurrently. Useful for e.g. starting a server. You won't be able to see the logs. You don't need to end the command with `&`, just set this to true.
+* `kill` - kills a background command
+  * `id` - the ID of the background command to kill
+* `browse` - opens a web page. Arguments:
+  * `url` - the URL to open
+* `think` - make a plan, set a goal, or record your thoughts. Arguments:
+  * `thought` - the thought to record
+* `add_task` - add a task to your plan. Arguments:
+  * `parent` - the ID of the parent task
+  * `goal` - the goal of the task
+  * `subtasks` - a list of subtasks, each of which is a map with a `goal` key.
+* `modify_task` - close a task. Arguments:
+  * `id` - the ID of the task to close
+  * `state` - set to 'in_progress' to start the task, 'completed' to finish it, 'verified' to assert that it was successful, 'abandoned' to give up on it permanently, or `open` to stop working on it for now.
+* `finish` - if ALL of your tasks and subtasks have been verified or abandoned, and you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.
+
+You MUST take time to think in between read, write, run, browse, and recall actions.
+You should never act twice in a row without thinking. But if your last several
+actions are all `think` actions, you should consider taking a different action.
+
+What is your next thought or action? Again, you must reply with JSON, and only with JSON.
+
+Look at your last thought in the history above. What does it suggest? Don't think anymore--take action.
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_006.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_006.log
new file mode 100644
index 0000000000..bbde0f0a3f
--- /dev/null
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_006.log
@@ -0,0 +1,152 @@
+
+
+----------
+
+
+# Task
+You're a diligent software engineer AI. You can't see, draw, or interact with a
+browser, but you can read and write files, and you can run commands, and you can think.
+
+You've been given the following task:
+
+Write a shell script 'hello.sh' that prints 'hello'.
+
+## Plan
+As you complete this task, you're building a plan and keeping
+track of your progress. Here's a JSON representation of your plan:
+
+{
+  "id": "0",
+  "goal": "Write a shell script 'hello.sh' that prints 'hello'.",
+  "state": "in_progress",
+  "subtasks": []
+}
+
+
+You're currently working on this task:
+Write a shell script 'hello.sh' that prints 'hello'..
+If it's not achievable AND verifiable with a SINGLE action, you MUST break it down into subtasks NOW.
+
+You're responsible for managing this plan and the status of tasks in
+it, by using the `add_task` and `modify_task` actions described below.
+
+If the History below contradicts the state of any of these tasks, you
+MUST modify the task using the `modify_task` action described below.
+
+Be sure NOT to duplicate any tasks. Do NOT use the `add_task` action for
+a task that's already represented. Every task must be represented only once.
+
+Tasks that are sequential MUST be siblings. They must be added in order
+to their parent task.
+
+If you mark a task as 'completed', 'verified', or 'abandoned',
+all non-abandoned subtasks will be marked the same way.
+So before closing a task this way, you MUST not only be sure that it has
+been completed successfully--you must ALSO be sure that all its subtasks
+are ready to be marked the same way.
+
+If, and only if, ALL tasks have already been marked verified,
+you MUST respond with the `finish` action.
+
+## History
+Here is a recent history of actions you've taken in service of this plan,
+as well as observations you've made. This only includes the MOST RECENT
+ten actions--more happened before that.
+
+[
+  {
+    "action": "modify_task",
+    "args": {
+      "id": "0",
+      "state": "in_progress",
+      "thought": ""
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "I need to write a shell script called 'hello.sh' that contains the necessary command to print 'hello'. This will likely involve using echo or printf to output the text to the console when the script is run. I should start by creating and writing the content to the file."
+    }
+  },
+  {
+    "action": "write",
+    "args": {
+      "path": "hello.sh",
+      "content": "#!/bin/bash\necho 'hello'",
+      "start": 0,
+      "end": -1,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "write",
+    "content": "",
+    "extras": {
+      "path": "hello.sh"
+    }
+  },
+  {
+    "action": "think",
+    "args": {
+      "thought": "Now that the script 'hello.sh' has been written and contains the appropriate command to print 'hello', the next logical step is to ensure it works as expected. I should proceed to run the script to verify its output."
+    }
+  },
+  {
+    "action": "run",
+    "args": {
+      "command": "bash hello.sh",
+      "background": false,
+      "thought": ""
+    }
+  },
+  {
+    "observation": "run",
+    "content": "hello",
+    "extras": {
+      "command_id": -1,
+      "command": "bash hello.sh",
+      "exit_code": 0
+    }
+  }
+]
+
+
+Your most recent action is at the bottom of that history.
+
+## Action
+What is your next thought or action? Your response must be in JSON format.
+
+It must be an object, and it must contain two fields:
+* `action`, which is one of the actions below
+* `args`, which is a map of key-value pairs, specifying the arguments for that action
+
+* `read` - reads the content of a file. Arguments:
+  * `path` - the path of the file to read
+* `write` - writes the content to a file. Arguments:
+  * `path` - the path of the file to write
+  * `content` - the content to write to the file
+* `run` - runs a command on the command line in a Linux shell. Arguments:
+  * `command` - the command to run
+  * `background` - if true, run the command in the background, so that other commands can be run concurrently. Useful for e.g. starting a server. You won't be able to see the logs. You don't need to end the command with `&`, just set this to true.
+* `kill` - kills a background command
+  * `id` - the ID of the background command to kill
+* `browse` - opens a web page. Arguments:
+  * `url` - the URL to open
+* `think` - make a plan, set a goal, or record your thoughts. Arguments:
+  * `thought` - the thought to record
+* `add_task` - add a task to your plan. Arguments:
+  * `parent` - the ID of the parent task
+  * `goal` - the goal of the task
+  * `subtasks` - a list of subtasks, each of which is a map with a `goal` key.
+* `modify_task` - close a task. Arguments:
+  * `id` - the ID of the task to close
+  * `state` - set to 'in_progress' to start the task, 'completed' to finish it, 'verified' to assert that it was successful, 'abandoned' to give up on it permanently, or `open` to stop working on it for now.
+* `finish` - if ALL of your tasks and subtasks have been verified or abandoned, and you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.
+
+You MUST take time to think in between read, write, run, browse, and recall actions.
+You should never act twice in a row without thinking. But if your last several
+actions are all `think` actions, you should consider taking a different action.
+
+What is your next thought or action? Again, you must reply with JSON, and only with JSON.
+
+You should think about the command you just ran, what output it gave, and how that affects your plan.
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_001.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_001.log
index f4f1a41bf1..9e31d23c8a 100644
--- a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_001.log
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_001.log
@@ -1,3 +1,4 @@
+```json
 {
   "action": "modify_task",
   "args": {
@@ -5,3 +6,4 @@
     "state": "in_progress"
   }
 }
+```
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_002.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_002.log
index 03bf823601..700cd1c18d 100644
--- a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_002.log
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_002.log
@@ -1,9 +1,8 @@
 ```json
 {
-  "action": "write",
+  "action": "think",
   "args": {
-    "path": "hello.sh",
-    "content": "#!/bin/bash\necho 'hello'"
+    "thought": "I need to write a shell script called 'hello.sh' that contains the necessary command to print 'hello'. This will likely involve using echo or printf to output the text to the console when the script is run. I should start by creating and writing the content to the file."
   }
 }
 ```
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_003.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_003.log
index 7055344e27..03bf823601 100644
--- a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_003.log
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_003.log
@@ -1,7 +1,9 @@
+```json
 {
-  "action": "run",
+  "action": "write",
   "args": {
-    "command": "bash hello.sh",
-    "background": false
+    "path": "hello.sh",
+    "content": "#!/bin/bash\necho 'hello'"
   }
 }
+```
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_004.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_004.log
index 6093c90df7..02ae256680 100644
--- a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_004.log
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_004.log
@@ -1,7 +1,6 @@
 {
-  "action": "modify_task",
+  "action": "think",
   "args": {
-    "id": "0",
-    "state": "verified"
+    "thought": "Now that the script 'hello.sh' has been written and contains the appropriate command to print 'hello', the next logical step is to ensure it works as expected. I should proceed to run the script to verify its output."
   }
 }
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_005.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_005.log
new file mode 100644
index 0000000000..9cc89f2a5b
--- /dev/null
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_005.log
@@ -0,0 +1,9 @@
+```json
+{
+  "action": "run",
+  "args": {
+    "command": "bash hello.sh",
+    "background": false
+  }
+}
+```
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/response_006.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_006.log
new file mode 100644
index 0000000000..90dec695c4
--- /dev/null
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/response_006.log
@@ -0,0 +1,10 @@
+```json
+{
+  "action": "modify_task",
+  "args": {
+    "id": "0",
+    "state": "verified",
+    "thought": "The shell script 'hello.sh' has been run and verified to successfully output 'hello'. This completes the primary task requirement."
+  }
+}
+```
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_002.log b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_002.log
index 0ddf35a113..89793bca08 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_002.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_002.log
@@ -66,8 +66,8 @@ args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  To begin solving the task, I need to create a new shell script file named 'hello.sh'. This file should contain a script that prints the message 'hello' when executed. I will start by writing the initial content of this script.
+  thought: Thoughts:
+  To solve the task of writing a shell script named 'hello.sh' that prints 'hello', I need to create a new file named 'hello.sh'. I will then write the necessary shell script code into this file.
 
   Action:
 
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_003.log b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_003.log
index b3d5b905e3..320adaa09e 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_003.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_003.log
@@ -66,8 +66,8 @@ args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  To begin solving the task, I need to create a new shell script file named 'hello.sh'. This file should contain a script that prints the message 'hello' when executed. I will start by writing the initial content of this script.
+  thought: Thoughts:
+  To solve the task of writing a shell script named 'hello.sh' that prints 'hello', I need to create a new file named 'hello.sh'. I will then write the necessary shell script code into this file.
 
   Action:
 
@@ -84,8 +84,8 @@ args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  Since the file 'hello.sh' has been created, the next step is to write the shell script code that prints "hello". This involves adding a shebang line to specify the shell interpreter and a print command. I will add these lines to the 'hello.sh' file.
+  thought: Thoughts:
+  The task is to write a shell script named 'hello.sh' that prints 'hello'. I have already created the file 'hello.sh' in a previous step. Now, I need to write the shell script code into this file. The script should include the shebang line to specify the shell interpreter and a command to print 'hello'.
 
   Action:
 
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_004.log b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_004.log
index 444383d84f..f93d78d595 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_004.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_004.log
@@ -66,8 +66,8 @@ args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  To begin solving the task, I need to create a new shell script file named 'hello.sh'. This file should contain a script that prints the message 'hello' when executed. I will start by writing the initial content of this script.
+  thought: Thoughts:
+  To solve the task of writing a shell script named 'hello.sh' that prints 'hello', I need to create a new file named 'hello.sh'. I will then write the necessary shell script code into this file.
 
   Action:
 
@@ -84,8 +84,8 @@ args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  Since the file 'hello.sh' has been created, the next step is to write the shell script code that prints "hello". This involves adding a shebang line to specify the shell interpreter and a print command. I will add these lines to the 'hello.sh' file.
+  thought: Thoughts:
+  The task is to write a shell script named 'hello.sh' that prints 'hello'. I have already created the file 'hello.sh' in a previous step. Now, I need to write the shell script code into this file. The script should include the shebang line to specify the shell interpreter and a command to print 'hello'.
 
   Action:
 
@@ -97,19 +97,21 @@ extras:
 
 Memory 2:
 Previous Action:
-action: write
+action: read
 args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  I have created the file 'hello.sh' but have not yet written the script content that prints "hello". The next step is to write the shell script code into 'hello.sh'. This will include a shebang line to specify the shell interpreter (bash in this case) and an echo command to print "hello".
+  thought: Thoughts:
+  I need to verify the contents of the 'hello.sh' script to ensure that it correctly contains the shell script code to print 'hello'. This involves checking if the shebang line is present and if the print command is correctly written. If the script is correct, I can then proceed to test it to confirm its functionality.
 
   Action:
 
 Output from Action:
-observation: write
-content:
+observation: read
+content: #!/bin/bash
+echo "hello"
+
 extras:
   path: hello.sh
 ======= End Actions =======
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_005.log b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_005.log
index a79e0c6a02..f7e8564066 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_005.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_005.log
@@ -66,8 +66,8 @@ args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  To begin solving the task, I need to create a new shell script file named 'hello.sh'. This file should contain a script that prints the message 'hello' when executed. I will start by writing the initial content of this script.
+  thought: Thoughts:
+  To solve the task of writing a shell script named 'hello.sh' that prints 'hello', I need to create a new file named 'hello.sh'. I will then write the necessary shell script code into this file.
 
   Action:
 
@@ -84,8 +84,8 @@ args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  Since the file 'hello.sh' has been created, the next step is to write the shell script code that prints "hello". This involves adding a shebang line to specify the shell interpreter and a print command. I will add these lines to the 'hello.sh' file.
+  thought: Thoughts:
+  The task is to write a shell script named 'hello.sh' that prints 'hello'. I have already created the file 'hello.sh' in a previous step. Now, I need to write the shell script code into this file. The script should include the shebang line to specify the shell interpreter and a command to print 'hello'.
 
   Action:
 
@@ -97,31 +97,13 @@ extras:
 
 Memory 2:
 Previous Action:
-action: write
-args:
-  path: hello.sh
-  start: 0
-  end: -1
-  thoughts: Thoughts:
-  I have created the file 'hello.sh' but have not yet written the script content that prints "hello". The next step is to write the shell script code into 'hello.sh'. This will include a shebang line to specify the shell interpreter (bash in this case) and an echo command to print "hello".
-
-  Action:
-
-Output from Action:
-observation: write
-content:
-extras:
-  path: hello.sh
-
-Memory 3:
-Previous Action:
 action: read
 args:
   path: hello.sh
   start: 0
   end: -1
-  thoughts: Thoughts:
-  I have created the file 'hello.sh' but have not yet confirmed the content of the script. The next step is to read the content of 'hello.sh' to ensure that it contains the correct shell script code that prints "hello". This will help verify that the previous write actions were successful and the file is ready for execution.
+  thought: Thoughts:
+  I need to verify the contents of the 'hello.sh' script to ensure that it correctly contains the shell script code to print 'hello'. This involves checking if the shebang line is present and if the print command is correctly written. If the script is correct, I can then proceed to test it to confirm its functionality.
 
   Action:
 
@@ -132,6 +114,22 @@ echo "hello"
 
 extras:
   path: hello.sh
+
+Memory 3:
+Previous Action:
+action: run
+args:
+  command: bash hello.sh
+  background: False
+  thought:
+
+Output from Action:
+observation: run
+content: hello
+extras:
+  command_id: -1
+  command: bash hello.sh
+  exit_code: 0
 ======= End Actions =======
 Use these memories to provide additional context to the problem you are solving.
 Remember that you have already completed these steps so you do not need to perform them again.
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_006.log b/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_006.log
deleted file mode 100644
index 4a68e46a11..0000000000
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/prompt_006.log
+++ /dev/null
@@ -1,198 +0,0 @@
-
-
-----------
-
-SYSTEM INFO:
-You am an autonomous coding agent, here to provide solutions for coding issues.
-You have been designed to assist you with a wide range of programming tasks, from code editing and debugging to testing and deployment.
-You have access to a variety of tools and commands that you can use to help you solve problems efficiently.
-
-INSTRUCTIONS:
-Now, you're going to solve this issue on your own. You can use any bash commands or custom commands you wish to complete your task. Edit all the files you need to and run any checks or tests that you want.
-Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
-When you're satisfied with all of the changes you've made, you can indicate that you are done by running the exit command.
-Note however that you cannot use any interactive session commands (e.g. python, vim, node) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
-
-NOTE ABOUT THE write COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
-
-IMPORTANT TIPS:
-1. Reproduce the bug: Always start by trying to replicate the bug that the issue discusses. If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment and run it to ensure you can reproduce the bug. Then, start trying to fix it. When you think you've fixed the bug, re-run the bug reproduction script to make sure that the issue has indeed been resolved.
-   If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file, so that you can be sure the script ran fine all the way through.
-2. Try different commands: If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it.
-3. Navigate large files: If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, you would use the 'read' command like this: 'read <file> 583'. This is a much faster way to read through the file.
-4. Handle input files: If the bug reproduction script requires inputting/reading a specific file, such as 'buggy-input.png', and you'd like to understand how to input that file, conduct a search in the existing repository code to see whether someone else has already done that. Do this by running the command: 'search_dir "buggy-input.png"'. If that doesn't work, use the Linux 'find' command.
-5. Understand your context: Always make sure to look at the currently open file and the current working directory. The currently open file might be in a different directory than the working directory.
-6. Verify your edits: When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
-7. Thoroughly test your solution: After making any changes to fix a bug, be sure to thoroughly test your solution to ensure the bug has been resolved. Re-run the bug reproduction script and verify that the issue has been addressed.
-
-
-DOCUMENTATION:
-It is recommend that you use the commands provided for interacting with files and your directory because they have been specially built for you.
-They will make it much easier for you to look at files and make changes. Using these commands will help you be better at your task.
-You can open an file by using either the read or write operations.
-- If a file already exists you should read it before making any changes. Use the `edit` command to make changes once you have read it.
-- If you are creating a new file use the write command. Use the `edit` command to make changes once you have created the new file.
-
-Commands:
-exit - Executed when task is complete
-read <file_name> [<start_line>] [<end_line>] - Shows a given file's contents starting from <start_line> up to <end_line>. Default: start_line = 0, end_line = -1. By default the whole file will be read.
-write <file> <changes> [<start_line>] [<end_line>] - Modifies a <file> by replacing the current lines between <start_line> and <end_line> with <changes>. Default start_line = 0 and end_line = -1. Calling this with no line args will replace the whole file.
-browse <url> - Returns the text version of any url, this can be useful to look up documentation or finding issues on github
-scroll_up - Takes no arguments. This will scroll up and show you the 100 lines above your current lines
-scroll_down - Takes no arguments. This will scroll down and show you the 100 lines below your current lines
-edit <start_line> <end_line> <changes> - This will modify lines in the currently open file. use start_line and end_line to designate which lines to change and then write the multiline changes
-goto <line_num> - This will take you directly to a line and show you the 100 lines below it.
-<bash_command> <args> - You can use any bash command you need (cd, ls, rm, grep, dir, mv, wget, git, zip, etc.) with their arguments included
-pip install <package> - You can use this to import python packages. Make sure you include the correct package name when using this command.
-ls - Use the ls command to view all the files in your current directory, this is a good starting point.
-NOT ALLOWED - You cannot use interactive commands like python or node
-Custom bash commands:
-None
-
-
-The following commands require an open file to be used: edit, scroll_up, scroll_down, goto
-To modify the current file use 'edit'. To move through the current file use 'goto' or 'scroll_up'/'scroll_down'
-when using write and edit do not surround the code with any "" just write the code.
-
-----------
-
-These are your past 4 actions:
-===== Previous Actions =====
-
-Memory 0:
-Previous Action:
-action: write
-args:
-  path: hello.sh
-  start: 0
-  end: -1
-  thoughts: Thoughts:
-  Since the file 'hello.sh' has been created, the next step is to write the shell script code that prints "hello". This involves adding a shebang line to specify the shell interpreter and a print command. I will add these lines to the 'hello.sh' file.
-
-  Action:
-
-Output from Action:
-observation: write
-content:
-extras:
-  path: hello.sh
-
-Memory 1:
-Previous Action:
-action: write
-args:
-  path: hello.sh
-  start: 0
-  end: -1
-  thoughts: Thoughts:
-  I have created the file 'hello.sh' but have not yet written the script content that prints "hello". The next step is to write the shell script code into 'hello.sh'. This will include a shebang line to specify the shell interpreter (bash in this case) and an echo command to print "hello".
-
-  Action:
-
-Output from Action:
-observation: write
-content:
-extras:
-  path: hello.sh
-
-Memory 2:
-Previous Action:
-action: read
-args:
-  path: hello.sh
-  start: 0
-  end: -1
-  thoughts: Thoughts:
-  I have created the file 'hello.sh' but have not yet confirmed the content of the script. The next step is to read the content of 'hello.sh' to ensure that it contains the correct shell script code that prints "hello". This will help verify that the previous write actions were successful and the file is ready for execution.
-
-  Action:
-
-Output from Action:
-observation: read
-content: #!/bin/bash
-echo "hello"
-
-extras:
-  path: hello.sh
-
-Memory 3:
-Previous Action:
-action: run
-args:
-  command: bash hello.sh
-  background: False
-
-Output from Action:
-observation: run
-content: hello
-extras:
-  command_id: -1
-  command: bash hello.sh
-  exit_code: 0
-======= End Actions =======
-Use these memories to provide additional context to the problem you are solving.
-Remember that you have already completed these steps so you do not need to perform them again.
-
-----------
-
-RESPONSE FORMAT:
-This is the format of the response you will make in order to solve the current issue.
-You will be given multiple iterations to complete this task so break it into steps and solve them one by one.
-
-Your output must contain the following:
-- First, thoughts about what your next action should be and plan it out.
-    - You will have a memory of your thoughts so you can use this to remember things for the next step.
-    - Use your thoughts to think about what you are currently doing, what you have done on prior steps and how that relates to solving the problem.
-- Second, create a piece of code that will execute your next action based on the thoughts you have.
-    - Remember that you can only have one action for each thought, do not include multiple actions.
-
-Your code MUST be surrounded in triple back ticks EXACTLY like this:
-```
-<code>
-```
-
-Notes:
-- Adhere to the format so that the program loop continues smoothly, it is very important to only give one command per output.
-- DO NOT give more than one command within the triple backticks. This will just throw an error and nothing will happen as a result.
-- Do not give multiple code blocks, if you do only the second one will be captured and run, this might give an error if the first one was necessary.
-- To execute multiple commands you should write them down in your thoughts section so you can remember it on the next step and execute them then.
-- The only commands you are not capable of executing are interactive commands like `python` or `node` by themselves.
-- If you think that you have completed the task that has been given to you based on your previous actions and outputs then use ``` exit ``` as the command to let the system know that you are done.
-- DO NOT make any copies of your previous memories those will be provided to you at each step, making copies just wastes time and energy. Think smarter not harder.
-- The write and edit commands requires proper indentation in the content section ex. `write hw.py def hello():
-    print('Hello World')` this is how you would have to format your write command.
-    - The white spaces matter as the code changes will be added to the code so they must have proper syntax.
-
-This is a template using the format described above
-Items in <> are suggestions for you, fill them out based on the context of the problem you are solving.
-
-[ FORMAT ]
-Thoughts:
-<Provide clear and concise thoughts on the next step to take, highlighting any important details or context that should be remembered.>
-<You can use multiple lines to express your thoughts>
-
-Action:
-```
-<command> <params>
-```
-[ END FORMAT ]
-
-Do not provide anything extra just your thought and action.
-
-You are currently trying to complete this task:
-Write a shell script 'hello.sh' that prints 'hello'.
-
-CURRENT WORKSPACE:
-    Open File: hello.sh on line 0
-    You can use these commands with the current file:
-    Navigation: `scroll_up`, `scroll_down`, and `goto <line>`
-    Modification: `edit <start_line> <end_line> <changes>`
-
-
-Keep all of the guidelines above in mind when you are thinking and making code.
-Please come up with a thought and action based on your current task and latest steps.
-Make sure that you do not repeat the same actions, there will not be any changes in result if you do not changes anything.
-Be very strict about the formatting that you use and make sure you follow the guidelines.
-NEVER output multiple commands. ONLY take ONE STEP at a time.
-When you have completed your task run the "exit" command.
-Begin with your thought about the next step and then come up with an action to perform your thought.
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/response_001.log b/tests/integration/mock/SWEAgent/test_write_simple_script/response_001.log
index 757ced6842..c434807f55 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/response_001.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/response_001.log
@@ -1,8 +1,8 @@
 Thoughts:
-To begin solving the task, I need to create a new shell script file named 'hello.sh'. This file should contain a script that prints the message 'hello' when executed. I will start by writing the initial content of this script.
+To solve the task of writing a shell script named 'hello.sh' that prints 'hello', I need to create a new file named 'hello.sh'. I will then write the necessary shell script code into this file.
 
 Action:
 ```
 write hello.sh #!/bin/bash
-echo 'hello'
+echo "hello"
 ```
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/response_002.log b/tests/integration/mock/SWEAgent/test_write_simple_script/response_002.log
index a9b34f581c..1cc79f854b 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/response_002.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/response_002.log
@@ -1,5 +1,5 @@
 Thoughts:
-Since the file 'hello.sh' has been created, the next step is to write the shell script code that prints "hello". This involves adding a shebang line to specify the shell interpreter and a print command. I will add these lines to the 'hello.sh' file.
+The task is to write a shell script named 'hello.sh' that prints 'hello'. I have already created the file 'hello.sh' in a previous step. Now, I need to write the shell script code into this file. The script should include the shebang line to specify the shell interpreter and a command to print 'hello'.
 
 Action:
 ```
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/response_003.log b/tests/integration/mock/SWEAgent/test_write_simple_script/response_003.log
index 0fa2854495..49592827e5 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/response_003.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/response_003.log
@@ -1,8 +1,7 @@
 Thoughts:
-I have created the file 'hello.sh' but have not yet written the script content that prints "hello". The next step is to write the shell script code into 'hello.sh'. This will include a shebang line to specify the shell interpreter (bash in this case) and an echo command to print "hello".
+I need to verify the contents of the 'hello.sh' script to ensure that it correctly contains the shell script code to print 'hello'. This involves checking if the shebang line is present and if the print command is correctly written. If the script is correct, I can then proceed to test it to confirm its functionality.
 
 Action:
 ```
-write hello.sh #!/bin/bash
-echo "hello"
+read hello.sh
 ```
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/response_004.log b/tests/integration/mock/SWEAgent/test_write_simple_script/response_004.log
index fa6d159a3a..2092717d50 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/response_004.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/response_004.log
@@ -1,7 +1,7 @@
 Thoughts:
-I have created the file 'hello.sh' but have not yet confirmed the content of the script. The next step is to read the content of 'hello.sh' to ensure that it contains the correct shell script code that prints "hello". This will help verify that the previous write actions were successful and the file is ready for execution.
+The shell script 'hello.sh' has been written and contains the necessary commands to print 'hello'. The next step is to test the script to ensure it functions as expected. This involves running the script and checking the output. If the script runs successfully and prints 'hello', then the task is completed.
 
 Action:
 ```
-read hello.sh
+bash hello.sh
 ```
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/response_005.log b/tests/integration/mock/SWEAgent/test_write_simple_script/response_005.log
index 5ff2634c95..7515501080 100644
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/response_005.log
+++ b/tests/integration/mock/SWEAgent/test_write_simple_script/response_005.log
@@ -1,7 +1,7 @@
 Thoughts:
-The shell script 'hello.sh' has been successfully created and contains the necessary code to print "hello". The next logical step is to test the script to ensure it functions as expected. This involves executing the script and observing the output. If the script runs correctly and prints "hello", then the task is completed successfully.
+The shell script 'hello.sh' has been successfully created and tested, and it correctly prints 'hello' as required. The script includes the shebang line and the echo command to print 'hello'. Since the script is functioning as expected and meets the task requirements, there are no further modifications or tests needed. It's time to conclude this task.
 
 Action:
 ```
-bash hello.sh
+exit
 ```
diff --git a/tests/integration/mock/SWEAgent/test_write_simple_script/response_006.log b/tests/integration/mock/SWEAgent/test_write_simple_script/response_006.log
deleted file mode 100644
index bc1cfb2181..0000000000
--- a/tests/integration/mock/SWEAgent/test_write_simple_script/response_006.log
+++ /dev/null
@@ -1,7 +0,0 @@
-Thoughts:
-The shell script 'hello.sh' has been successfully created and tested, and it correctly prints "hello" when executed. Since the task was to write a shell script that prints 'hello' and this has been achieved as evidenced by the output from the previous run command, there are no further modifications or actions needed on the script. Therefore, I should conclude this task.
-
-Action:
-```
-exit
-```
diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py
index 137425f18e..2a3522d658 100644
--- a/tests/integration/test_agent.py
+++ b/tests/integration/test_agent.py
@@ -1,5 +1,5 @@
-import os
 import asyncio
+import os
 import subprocess
 
 import pytest
@@ -7,7 +7,11 @@ import pytest
 from opendevin.main import main
 
 
-@pytest.mark.skipif(os.environ.get('AGENT') == 'CodeActAgent', reason='CodeActAgent requires task to be in a special format')
+# Skip CodeActAgent on the exec sandbox; the '' default keeps an unset SANDBOX_TYPE from raising at collection.
+@pytest.mark.skipif(
+    os.getenv('AGENT') == 'CodeActAgent' and os.getenv('SANDBOX_TYPE', '').lower() == 'exec',
+    reason='CodeActAgent does not support exec sandbox since exec sandbox is NOT stateful'
+)
 def test_write_simple_script():
     task = "Write a shell script 'hello.sh' that prints 'hello'."
     asyncio.run(main(task))
diff --git a/tests/test_fileops.py b/tests/test_fileops.py
index e657ac7dc8..3f22422a66 100644
--- a/tests/test_fileops.py
+++ b/tests/test_fileops.py
@@ -1,9 +1,10 @@
-from opendevin import config
-from opendevin.schema import ConfigType
-from opendevin.action import fileop
 from pathlib import Path
+
 import pytest
 
+from opendevin import config
+from opendevin.action import fileop
+from opendevin.schema import ConfigType
 
 
 def test_resolve_path():
diff --git a/tests/unit/test_action_github.py b/tests/unit/test_action_github.py
index 570613ec51..bb26f5d660 100644
--- a/tests/unit/test_action_github.py
+++ b/tests/unit/test_action_github.py
@@ -1,16 +1,17 @@
 
-from opendevin import config
+from unittest.mock import MagicMock, call, patch
+
+import pytest
+
 from agenthub.dummy_agent.agent import DummyAgent
+from opendevin import config
 from opendevin.action.github import GitHubPushAction, GitHubSendPRAction
 from opendevin.controller.agent_controller import AgentController
 from opendevin.llm.llm import LLM
 from opendevin.observation.error import AgentErrorObservation
 from opendevin.observation.message import AgentMessageObservation
 from opendevin.observation.run import CmdOutputObservation
-
 from opendevin.schema.config import ConfigType
-import pytest
-from unittest.mock import MagicMock, call, patch
 
 
 @pytest.fixture
diff --git a/tests/unit/test_action_serialization.py b/tests/unit/test_action_serialization.py
index 05d383a399..33632e2820 100644
--- a/tests/unit/test_action_serialization.py
+++ b/tests/unit/test_action_serialization.py
@@ -1,17 +1,17 @@
 from opendevin.action import (
-    action_from_dict,
     Action,
+    AddTaskAction,
+    AgentFinishAction,
+    AgentRecallAction,
     AgentThinkAction,
+    BrowseURLAction,
     CmdKillAction,
     CmdRunAction,
-    BrowseURLAction,
-    GitHubPushAction,
     FileReadAction,
     FileWriteAction,
-    AgentRecallAction,
-    AgentFinishAction,
-    AddTaskAction,
+    GitHubPushAction,
     ModifyTaskAction,
+    action_from_dict,
 )
 
 
@@ -39,7 +39,7 @@ def test_agent_think_action_serialization_deserialization():
 def test_agent_recall_action_serialization_deserialization():
     original_action_dict = {
         'action': 'recall',
-        'args': {'query': 'Test query.'}
+        'args': {'query': 'Test query.', 'thought': ''}
     }
     serialization_deserialization(original_action_dict, AgentRecallAction)
 
@@ -47,7 +47,7 @@ def test_agent_recall_action_serialization_deserialization():
 def test_agent_finish_action_serialization_deserialization():
     original_action_dict = {
         'action': 'finish',
-        'args': {'outputs': {}},
+        'args': {'outputs': {}, 'thought': ''}
     }
     serialization_deserialization(original_action_dict, AgentFinishAction)
 
@@ -55,7 +55,7 @@ def test_agent_finish_action_serialization_deserialization():
 def test_cmd_kill_action_serialization_deserialization():
     original_action_dict = {
         'action': 'kill',
-        'args': {'id': '1337'}
+        'args': {'id': '1337', 'thought': ''}
     }
     serialization_deserialization(original_action_dict, CmdKillAction)
 
@@ -63,7 +63,7 @@ def test_cmd_kill_action_serialization_deserialization():
 def test_cmd_run_action_serialization_deserialization():
     original_action_dict = {
         'action': 'run',
-        'args': {'command': 'echo "Hello world"', 'background': True}
+        'args': {'command': 'echo "Hello world"', 'background': True, 'thought': ''}
     }
     serialization_deserialization(original_action_dict, CmdRunAction)
 
@@ -71,7 +71,7 @@ def test_cmd_run_action_serialization_deserialization():
 def test_browse_url_action_serialization_deserialization():
     original_action_dict = {
         'action': 'browse',
-        'args': {'url': 'https://www.example.com'}
+        'args': {'thought': '', 'url': 'https://www.example.com'}
     }
     serialization_deserialization(original_action_dict, BrowseURLAction)
 
@@ -87,7 +87,7 @@ def test_github_push_action_serialization_deserialization():
 def test_file_read_action_serialization_deserialization():
     original_action_dict = {
         'action': 'read',
-        'args': {'path': '/path/to/file.txt', 'start': 0, 'end': -1, 'thoughts': 'None'}
+        'args': {'path': '/path/to/file.txt', 'start': 0, 'end': -1, 'thought': 'None'}
     }
     serialization_deserialization(original_action_dict, FileReadAction)
 
@@ -95,7 +95,7 @@ def test_file_read_action_serialization_deserialization():
 def test_file_write_action_serialization_deserialization():
     original_action_dict = {
         'action': 'write',
-        'args': {'path': '/path/to/file.txt', 'content': 'Hello world', 'start': 0, 'end': 1, 'thoughts': 'None'}
+        'args': {'path': '/path/to/file.txt', 'content': 'Hello world', 'start': 0, 'end': 1, 'thought': 'None'}
     }
     serialization_deserialization(original_action_dict, FileWriteAction)
 
@@ -103,7 +103,7 @@ def test_file_write_action_serialization_deserialization():
 def test_add_task_action_serialization_deserialization():
     original_action_dict = {
         'action': 'add_task',
-        'args': {'parent': 'Test parent', 'goal': 'Test goal', 'subtasks': []}
+        'args': {'parent': 'Test parent', 'goal': 'Test goal', 'subtasks': [], 'thought': ''}
     }
     serialization_deserialization(original_action_dict, AddTaskAction)
 
@@ -111,6 +111,6 @@ def test_add_task_action_serialization_deserialization():
 def test_modify_task_action_serialization_deserialization():
     original_action_dict = {
         'action': 'modify_task',
-        'args': {'id': 1, 'state': 'Test state.'}
+        'args': {'id': 1, 'state': 'Test state.', 'thought': ''}
     }
     serialization_deserialization(original_action_dict, ModifyTaskAction)
diff --git a/tests/unit/test_arg_parser.py b/tests/unit/test_arg_parser.py
index 6c140f795a..73e8e811b6 100644
--- a/tests/unit/test_arg_parser.py
+++ b/tests/unit/test_arg_parser.py
@@ -1,7 +1,7 @@
-from opendevin.config import get_parser
-
 import pytest
 
+from opendevin.config import get_parser
+
 
 def test_help_message(capsys):
     parser = get_parser()
diff --git a/tests/unit/test_micro_agents.py b/tests/unit/test_micro_agents.py
new file mode 100644
index 0000000000..d08dfbc850
--- /dev/null
+++ b/tests/unit/test_micro_agents.py
@@ -0,0 +1,74 @@
+import json
+import os
+from unittest.mock import MagicMock
+
+import yaml
+
+from agenthub.micro.registry import all_microagents
+from opendevin.agent import Agent
+from opendevin.plan import Plan
+from opendevin.state import State
+
+
+def test_all_agents_are_loaded():
+    """Names declared in agent.yaml files under agenthub/micro must equal the registry keys."""
+    micro_root = os.path.join('agenthub', 'micro')
+    declared_names = set()
+    for dirpath, _, filenames in os.walk(micro_root):
+        if 'agent.yaml' not in filenames:
+            continue
+        with open(os.path.join(dirpath, 'agent.yaml'), 'r') as definition_file:
+            definition = yaml.safe_load(definition_file)
+        declared_names.add(definition['name'])
+    assert declared_names == set(all_microagents.keys())
+
+
+def test_coder_agent_with_summary():
+    """
+    A summary passed via the state inputs must appear in the first message sent to the LLM.
+    """
+    task = 'This is a dummy task'
+    summary = 'This is a dummy summary about this repo'
+    summary_header = "Here's a summary of the codebase, as it relates to this task"
+
+    # The stubbed LLM always answers with a serialized 'finish' action.
+    llm_stub = MagicMock()
+    finish_reply = json.dumps({'action': 'finish', 'args': {}})
+    llm_stub.completion.return_value = {
+        'choices': [{'message': {'content': finish_reply}}]
+    }
+
+    agent = Agent.get_cls('CoderAgent')(llm=llm_stub)
+    assert agent is not None
+    agent.step(State(Plan(task), inputs={'summary': summary}))
+
+    llm_stub.completion.assert_called_once()
+    rendered_prompt = llm_stub.completion.call_args[1]['messages'][0]['content']
+    assert task in rendered_prompt
+    assert summary_header in rendered_prompt
+    assert summary in rendered_prompt
+
+
+def test_coder_agent_without_summary():
+    """
+    Without a summary in the state inputs, the first message sent to the LLM
+    must not contain the codebase-summary header.
+    """
+    task = 'This is a dummy task'
+    summary_header = "Here's a summary of the codebase, as it relates to this task"
+
+    # The stubbed LLM always answers with a serialized 'finish' action.
+    llm_stub = MagicMock()
+    finish_reply = json.dumps({'action': 'finish', 'args': {}})
+    llm_stub.completion.return_value = {
+        'choices': [{'message': {'content': finish_reply}}]
+    }
+
+    agent = Agent.get_cls('CoderAgent')(llm=llm_stub)
+    assert agent is not None
+    agent.step(State(Plan(task)))
+
+    llm_stub.completion.assert_called_once()
+    rendered_prompt = llm_stub.completion.call_args[1]['messages'][0]['content']
+    assert task in rendered_prompt
+    assert summary_header not in rendered_prompt
diff --git a/tests/unit/test_observation_serialization.py b/tests/unit/test_observation_serialization.py
index e75efc0a14..e7870f88bf 100644
--- a/tests/unit/test_observation_serialization.py
+++ b/tests/unit/test_observation_serialization.py
@@ -1,4 +1,8 @@
-from opendevin.observation import observation_from_dict, Observation, CmdOutputObservation
+from opendevin.observation import (
+    CmdOutputObservation,
+    Observation,
+    observation_from_dict,
+)
 
 
 def test_observation_serialization_deserialization():
diff --git a/tests/unit/test_sandbox.py b/tests/unit/test_sandbox.py
new file mode 100644
index 0000000000..e815033c9a
--- /dev/null
+++ b/tests/unit/test_sandbox.py
@@ -0,0 +1,115 @@
+import pathlib
+import tempfile
+from unittest.mock import patch
+
+import pytest
+
+from opendevin import config
+from opendevin.sandbox.docker.ssh_box import DockerSSHBox
+
+
+@pytest.fixture
+def temp_dir():
+    # Yield the path of a fresh temporary directory; the context manager removes it afterwards.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        pathlib.Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        yield temp_dir
+
+
+def test_ssh_box_run_as_devin(temp_dir):
+    # Swap the contents of config.config for just these three entries (clear=True) inside the block.
+    with patch.dict(
+        config.config,
+        {
+            config.ConfigType.WORKSPACE_BASE: temp_dir,
+            config.ConfigType.RUN_AS_DEVIN: 'true',
+            config.ConfigType.SANDBOX_TYPE: 'ssh',
+        },
+        clear=True
+    ):
+        ssh_box = DockerSSHBox()
+
+        # Each step builds on the previous one: empty listing, mkdir, listing, touch, listing.
+        exit_code, output = ssh_box.execute('ls -l')
+        assert exit_code == 0, 'The exit code should be 0.'
+        assert output.strip() == 'total 0'
+
+        exit_code, output = ssh_box.execute('mkdir test')
+        assert exit_code == 0, 'The exit code should be 0.'
+        assert output.strip() == ''
+
+        exit_code, output = ssh_box.execute('ls -l')
+        assert exit_code == 0, 'The exit code should be 0.'
+        assert 'opendevin' in output, "The output should contain username 'opendevin'"
+        assert 'test' in output, 'The output should contain the test directory'
+
+        exit_code, output = ssh_box.execute('touch test/foo.txt')
+        assert exit_code == 0, 'The exit code should be 0.'
+        assert output.strip() == ''
+
+        exit_code, output = ssh_box.execute('ls -l test')
+        assert exit_code == 0, 'The exit code should be 0.'
+        assert 'foo.txt' in output, 'The output should contain the foo.txt file'
+
+
+def test_ssh_box_multi_line_cmd_run_as_devin(temp_dir):
+    # Swap the contents of config.config for just these three entries (clear=True) inside the block.
+    with patch.dict(
+        config.config,
+        {
+            config.ConfigType.WORKSPACE_BASE: temp_dir,
+            config.ConfigType.RUN_AS_DEVIN: 'true',
+            config.ConfigType.SANDBOX_TYPE: 'ssh',
+        },
+        clear=True
+    ):
+        ssh_box = DockerSSHBox()
+
+        # Two newline-separated commands in one execute() call.
+        exit_code, output = ssh_box.execute('pwd\nls -l')
+        assert exit_code == 0, 'The exit code should be 0.'
+        expected_lines = ['/workspacels -l', 'total 0']  # NOTE(review): pwd output appears fused with 'ls -l' - confirm intended
+        assert output.strip().splitlines() == expected_lines
+
+def test_ssh_box_stateful_cmd_run_as_devin(temp_dir):
+    # Swap the contents of config.config for just these three entries (clear=True) inside the block.
+    with patch.dict(
+        config.config,
+        {
+            config.ConfigType.WORKSPACE_BASE: temp_dir,
+            config.ConfigType.RUN_AS_DEVIN: 'true',
+            config.ConfigType.SANDBOX_TYPE: 'ssh',
+        },
+        clear=True
+    ):
+        ssh_box = DockerSSHBox()
+
+        # A 'cd' issued in one execute() call must still be in effect for the following 'pwd' call.
+        exit_code, output = ssh_box.execute('mkdir test')
+        assert exit_code == 0, 'The exit code should be 0.'
+        assert output.strip() == ''
+
+        exit_code, output = ssh_box.execute('cd test')
+        assert exit_code == 0, 'The exit code should be 0.'
+        assert output.strip() == ''
+
+        exit_code, output = ssh_box.execute('pwd')
+        assert exit_code == 0, 'The exit code should be 0.'
+        assert output.strip() == '/workspace/test'
+
+def test_ssh_box_failed_cmd_run_as_devin(temp_dir):
+    # Swap the contents of config.config for just these three entries (clear=True) inside the block.
+    with patch.dict(
+        config.config,
+        {
+            config.ConfigType.WORKSPACE_BASE: temp_dir,
+            config.ConfigType.RUN_AS_DEVIN: 'true',
+            config.ConfigType.SANDBOX_TYPE: 'ssh',
+        },
+        clear=True
+    ):
+        ssh_box = DockerSSHBox()
+
+        # An unknown command must be reported through a non-zero exit code; the output is not checked.
+        exit_code, output = ssh_box.execute('non_existing_command')
+        assert exit_code != 0, 'The exit code should not be 0 for a failed command.'