From 8bfae8413e74e1b6095d5380a7ea93b6b73a4537 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Sun, 12 May 2024 15:57:33 +0800
Subject: [PATCH] Support passing sandbox as argument and iteration reminder
 (#1730)

* support custom sandbox;
add iteration_reminder

* Enable iteration reminder in CodeActAgent integration test

* Don't remove numbers when comparing prompts

* Update tests/integration/README.md

---------

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
---
 .github/workflows/run-integration-tests.yml   |  1 +
 opendevin/controller/agent_controller.py      | 38 ++++++++++++++--
 opendevin/core/config.py                      |  1 +
 opendevin/core/schema/config.py               |  1 +
 opendevin/runtime/runtime.py                  |  7 ++-
 tests/integration/README.md                   |  9 ++--
 tests/integration/conftest.py                 |  6 +--
 .../CodeActAgent/test_edits/prompt_002.log    |  9 ++--
 .../CodeActAgent/test_edits/prompt_003.log    | 44 ++++++++++---------
 .../test_write_simple_script/prompt_002.log   |  1 +
 tests/integration/regenerate.sh               | 15 +++++--
 tests/integration/test_agent.py               |  1 -
 12 files changed, 89 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/run-integration-tests.yml b/.github/workflows/run-integration-tests.yml
index 82b4a9355a..76c6fa4620 100644
--- a/.github/workflows/run-integration-tests.yml
+++ b/.github/workflows/run-integration-tests.yml
@@ -57,6 +57,7 @@ jobs:
           AGENT: ${{ matrix.agent }}
           MAX_ITERATIONS: 10
           LLM_EMBEDDING_MODEL: ${{ matrix.embedding-model }}
+          REMIND_ITERATIONS: ${{ (matrix.agent == 'CodeActAgent') && 'true' || 'false' }}
         run: |
           rm -rf workspace
           mkdir workspace
diff --git a/opendevin/controller/agent_controller.py b/opendevin/controller/agent_controller.py
index 43ea966a81..d4bfb6513d 100644
--- a/opendevin/controller/agent_controller.py
+++ b/opendevin/controller/agent_controller.py
@@ -33,7 +33,7 @@ from opendevin.events.observation import (
     Observation,
 )
 from opendevin.events.stream import EventSource, EventStream, EventStreamSubscriber
-from opendevin.runtime import DockerSSHBox
+from opendevin.runtime import DockerSSHBox, Sandbox
 from opendevin.runtime.runtime import Runtime
 from opendevin.runtime.server.runtime import ServerRuntime
 
@@ -60,6 +60,8 @@ class AgentController:
         sid: str = 'default',
         max_iterations: int = MAX_ITERATIONS,
         max_chars: int = MAX_CHARS,
+        sandbox: Optional[Sandbox] = None,
+        remind_iterations: bool = config.remind_iterations,
     ):
         """Initializes a new instance of the AgentController class.
 
@@ -68,6 +70,8 @@ class AgentController:
             sid: The session ID of the agent.
             max_iterations: The maximum number of iterations the agent can run.
             max_chars: The maximum number of characters the agent can output.
+            sandbox: An optional initialized sandbox to run the agent in. If not provided, a default sandbox will be created based on config.
+            remind_iterations: A boolean value indicating whether to remind the agent its remaining budget of interaction.
         """
         self.id = sid
         self.agent = agent
@@ -76,8 +80,15 @@ class AgentController:
             EventStreamSubscriber.AGENT_CONTROLLER, self.on_event
         )
         self.max_iterations = max_iterations
-        self.runtime = ServerRuntime(self.id)
+
+        self.remind_iterations = remind_iterations
+        if self.remind_iterations:
+            logger.info(
+                'Iteration reminder is ENABLED: agent will be reminded of remaining turns.'
+            )
+        self.runtime = ServerRuntime(sandbox=sandbox, sid=self.id)
         self.max_chars = max_chars
+
         # Initialize agent-required plugins for sandbox (if any)
         self.runtime.init_sandbox_plugins(agent.sandbox_plugins)
 
@@ -187,7 +198,9 @@ class AgentController:
         self.agent.reset()
 
     async def set_agent_state_to(self, new_state: AgentState):
-        logger.info(f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}')
+        logger.info(
+            f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}'
+        )
         if new_state == self._agent_state:
             return
 
@@ -201,7 +214,11 @@ class AgentController:
             self._cur_step += 1
             if self.agent_task is not None:
                 self.agent_task.cancel()
-        elif new_state == AgentState.STOPPED or new_state == AgentState.ERROR or new_state == AgentState.FINISHED:
+        elif (
+            new_state == AgentState.STOPPED
+            or new_state == AgentState.ERROR
+            or new_state == AgentState.FINISHED
+        ):
             await self.reset_task()
 
         await self.event_stream.add_event(
@@ -225,6 +242,17 @@ class AgentController:
         task = action.inputs.get('task') or ''
         await self.delegate.setup_task(task, action.inputs)
 
+    def add_iteration_reminder_when_needed(self, i: int, obs: Observation):
+        """Add iteration reminder to the observation if needed.
+
+        Args:
+            i: The current iteration number (0-indexed).
+            obs: The observation to add the reminder to.
+        """
+        if self.remind_iterations:
+            obs.content += f'\n\nENVIRONMENT REMINDER: You have {self.max_iterations - i - 1} turns left to complete the task.'
+        return obs
+
     async def step(self, i: int) -> bool:
         if self.state is None:
             raise ValueError('No task to run')
@@ -265,6 +293,7 @@ class AgentController:
         if isinstance(action, AgentFinishAction):
             self.state.outputs = action.outputs  # type: ignore[attr-defined]
             logger.info(action, extra={'msg_type': 'INFO'})
+            await self.add_history(action, NullObservation(''))
             return True
         elif isinstance(action, MessageAction) and action.wait_for_response:
             # FIXME: remove this once history is managed outside the agent controller
@@ -280,6 +309,7 @@ class AgentController:
         elif not isinstance(observation, ErrorObservation):
             observation = await self.runtime.run_action(action)
 
+        observation = self.add_iteration_reminder_when_needed(i, observation)
         if not isinstance(observation, NullObservation):
             logger.info(observation, extra={'msg_type': 'OBSERVATION'})
         await self.add_history(action, observation)
diff --git a/opendevin/core/config.py b/opendevin/core/config.py
index f453ce5cd7..030b054ce8 100644
--- a/opendevin/core/config.py
+++ b/opendevin/core/config.py
@@ -82,6 +82,7 @@ class AppConfig(metaclass=Singleton):
     )
     run_as_devin: bool = True
     max_iterations: int = 100
+    remind_iterations: bool = False
     e2b_api_key: str = ''
     sandbox_type: str = 'ssh'  # Can be 'ssh', 'exec', or 'e2b'
     use_host_network: bool = False
diff --git a/opendevin/core/schema/config.py b/opendevin/core/schema/config.py
index f409b9940a..a58ef8ade4 100644
--- a/opendevin/core/schema/config.py
+++ b/opendevin/core/schema/config.py
@@ -31,6 +31,7 @@ class ConfigType(str, Enum):
     AGENT_MEMORY_MAX_THREADS = 'AGENT_MEMORY_MAX_THREADS'
     AGENT_MEMORY_ENABLED = 'AGENT_MEMORY_ENABLED'
     MAX_ITERATIONS = 'MAX_ITERATIONS'
+    REMIND_ITERATIONS = 'REMIND_ITERATIONS'
     MAX_CHARS = 'MAX_CHARS'
     AGENT = 'AGENT'
     E2B_API_KEY = 'E2B_API_KEY'
diff --git a/opendevin/runtime/runtime.py b/opendevin/runtime/runtime.py
index 2c5735334f..25088adfa9 100644
--- a/opendevin/runtime/runtime.py
+++ b/opendevin/runtime/runtime.py
@@ -51,14 +51,17 @@ class Runtime:
     """
 
     sid: str
-    sandbox: Sandbox
 
     def __init__(
         self,
+        sandbox: Sandbox | None = None,
         sid: str = 'default',
     ):
         self.sid = sid
-        self.sandbox = create_sandbox(sid, config.sandbox_type)
+        if sandbox is None:
+            self.sandbox = create_sandbox(sid, config.sandbox_type)
+        else:
+            self.sandbox = sandbox
         self.browser = BrowserEnv()
 
     def init_sandbox_plugins(self, plugins: list[PluginRequirement]) -> None:
diff --git a/tests/integration/README.md b/tests/integration/README.md
index d4d8c8dd1a..dd82d88024 100644
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -20,11 +20,10 @@ not possible with benchmarks.
 
 Known limitations:
 1. To avoid the potential impact of non-determinism, we remove all special
-characters and numbers (often used as PIDs) when doing the comparison. If two
-prompts for the same task only differ in non-alpha characters, a wrong mock
-response might be picked up.
-2. It is required that the agent itself doesn't do anything non-deterministic,
-including but not limited to using randomly generated numbers.
+characters when doing the comparison. If two prompts for the same task only
+differ in non-alphanumeric characters, a wrong mock response might be picked up.
+2. It is required that everything has to be deterministic. For example, agent
+must not use randomly generated numbers.
 
 The folder is organised as follows:
 
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 346cad6a11..5fba9302e0 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -11,7 +11,7 @@ workspace_path = os.getenv('WORKSPACE_BASE')
 
 
 def filter_out_symbols(input):
-    return ' '.join([char for char in input if char.isalpha()])
+    return ' '.join([char for char in input if char.isalnum()])
 
 
 def get_log_id(prompt_log_name):
@@ -100,7 +100,7 @@ def patch_completion(monkeypatch, request):
         monkeypatch.setattr('sys.stdin', user_responses)
 
 
-def clean_up():
+def set_up():
     assert workspace_path is not None
     if os.path.exists(workspace_path):
         for file in os.listdir(workspace_path):
@@ -109,7 +109,7 @@ def clean_up():
 
 @pytest.fixture(autouse=True)
 def resource_setup():
-    clean_up()
+    set_up()
     if not os.path.exists(workspace_path):
         os.makedirs(workspace_path)
     # Yield to test execution
diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
index 74d51f5ebb..59bb6b5e81 100644
--- a/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
+++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
@@ -229,9 +229,10 @@ open bad.txt
 ----------
 
 OBSERVATION:
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typoo.
-2:Really?
-3:No mor typos!
+[File: /workspace/bad.txt (4 lines total)]
+1:This is a stupid typoo.
+2:Really?
+3:No mor typos!
 4:Enjoy!
+ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
 [Command -1 finished with exit code 0]]
diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log
index 2f7d232698..cd7320e838 100644
--- a/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log
+++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log
@@ -229,11 +229,12 @@ open bad.txt
 ----------
 
 OBSERVATION:
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typoo.
-2:Really?
-3:No mor typos!
+[File: /workspace/bad.txt (4 lines total)]
+1:This is a stupid typoo.
+2:Really?
+3:No mor typos!
 4:Enjoy!
+ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
 [Command -1 finished with exit code 0]]
 
 ----------
@@ -257,22 +258,23 @@ end_of_edit
 ----------
 
 OBSERVATION:
-> This is a stupid typo.
-> EOF
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typo.
-2:Really?
-3:No mor typos!
-4:Enjoy!
-File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 <<EOF
-> No more typos!
-> EOF
-
-
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typo.
-2:Really?
-3:No more typos!
-4:Enjoy!
+> This is a stupid typo.
+> EOF
+[File: /workspace/bad.txt (4 lines total)]
+1:This is a stupid typo.
+2:Really?
+3:No mor typos!
+4:Enjoy!
+File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 <<EOF
+> No more typos!
+> EOF
+
+
+[File: /workspace/bad.txt (4 lines total)]
+1:This is a stupid typo.
+2:Really?
+3:No more typos!
+4:Enjoy!
 File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+ENVIRONMENT REMINDER: You have 8 turns left to complete the task.
 [Command -1 finished with exit code 0]]
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
index d28f72a101..b66209b26e 100644
--- a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
@@ -230,4 +230,5 @@ echo "echo hello" > hello.sh
 
 OBSERVATION:
 
+ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
 [Command -1 finished with exit code 0]]
diff --git a/tests/integration/regenerate.sh b/tests/integration/regenerate.sh
index f17b5d98bb..d333bf347a 100755
--- a/tests/integration/regenerate.sh
+++ b/tests/integration/regenerate.sh
@@ -4,20 +4,26 @@ set -eo pipefail
 WORKSPACE_MOUNT_PATH=$(pwd)/_test_workspace
 WORKSPACE_BASE=$(pwd)/_test_workspace
 SANDBOX_TYPE="ssh"
+MAX_ITERATIONS=10
 
 # FIXME: SWEAgent hangs, so it goes last
 agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent")
+# only enable iteration reminder for CodeActAgent in tests
+remind_iterations_config=(false true false false)
 tasks=("Fix typos in bad.txt." "Write a shell script 'hello.sh' that prints 'hello'.")
 test_names=("test_edits" "test_write_simple_script")
 
 num_of_tests=${#tasks[@]}
+num_of_agents=${#agents[@]}
 
 rm -rf logs
 rm -rf _test_workspace
 for ((i = 0; i < num_of_tests; i++)); do
   task=${tasks[i]}
   test_name=${test_names[i]}
-  for agent in "${agents[@]}"; do
+  for ((j = 0; j < num_of_agents; j++)); do
+    agent=${agents[j]}
+    remind_iterations=${remind_iterations_config[j]}
     echo -e "\n\n\n\n========Running $test_name for $agent========\n\n\n\n"
     rm -rf $WORKSPACE_BASE
     mkdir $WORKSPACE_BASE
@@ -28,7 +34,8 @@ for ((i = 0; i < num_of_tests; i++)); do
       # Temporarily disable 'exit on error'
       set +e
     fi
-    SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE MAX_ITERATIONS=10 \
+    SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \
+      MAX_ITERATIONS=$MAX_ITERATIONS REMIND_ITERATIONS=$remind_iterations \
       WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
       poetry run pytest -s ./tests/integration/test_agent.py::$test_name
     TEST_STATUS=$?
@@ -43,10 +50,10 @@ for ((i = 0; i < num_of_tests; i++)); do
       rm -rf logs
       rm -rf tests/integration/mock/$agent/$test_name/*
       echo -e "/exit\n" | SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \
-        DEBUG=true \
+        DEBUG=true REMIND_ITERATIONS=$remind_iterations \
         WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
         poetry run python ./opendevin/core/main.py \
-        -i 10 \
+        -i $MAX_ITERATIONS \
         -t "$task Do not ask me for confirmation at any point." \
         -c $agent
 
diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py
index 3b6d474744..f99dab1220 100644
--- a/tests/integration/test_agent.py
+++ b/tests/integration/test_agent.py
@@ -54,7 +54,6 @@ def test_edits():
         dest_file = os.path.join(workspace_base, file)
         if os.path.exists(dest_file):
             os.remove(dest_file)
-        print('source = ', os.path.join(source_dir, file), ' dest = ', dest_file)
         shutil.copy(os.path.join(source_dir, file), dest_file)
 
     # Execute the task