Support passing sandbox as argument and iteration reminder (#1730)

* support custom sandbox; add iteration_reminder * Enable iteration reminder in CodeActAgent integration test * Don't remove numbers when comparing prompts * Update tests/integration/README.md --------- Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
2025-12-26 05:48:36 +08:00 · 2024-05-12 15:57:33 +08:00 · 2024-05-12 15:57:33 +08:00 · 8bfae8413e
commit 8bfae8413e
parent 1d58917bc8
12 changed files with 89 additions and 44 deletions
--- a/.github/workflows/run-integration-tests.yml
+++ b/.github/workflows/run-integration-tests.yml
@ -57,6 +57,7 @@ jobs:
          AGENT: ${{ matrix.agent }}
          MAX_ITERATIONS: 10
          LLM_EMBEDDING_MODEL: ${{ matrix.embedding-model }}
+          REMIND_ITERATIONS: ${{ (matrix.agent == 'CodeActAgent') && 'true' || 'false' }}
        run: |
          rm -rf workspace
          mkdir workspace
--- a/opendevin/controller/agent_controller.py
+++ b/opendevin/controller/agent_controller.py
@ -33,7 +33,7 @@ from opendevin.events.observation import (
    Observation,
 )
 from opendevin.events.stream import EventSource, EventStream, EventStreamSubscriber
-from opendevin.runtime import DockerSSHBox
+from opendevin.runtime import DockerSSHBox, Sandbox
 from opendevin.runtime.runtime import Runtime
 from opendevin.runtime.server.runtime import ServerRuntime

@ -60,6 +60,8 @@ class AgentController:
        sid: str = 'default',
        max_iterations: int = MAX_ITERATIONS,
        max_chars: int = MAX_CHARS,
+        sandbox: Optional[Sandbox] = None,
+        remind_iterations: bool = config.remind_iterations,
    ):
        """Initializes a new instance of the AgentController class.

@ -68,6 +70,8 @@ class AgentController:
            sid: The session ID of the agent.
            max_iterations: The maximum number of iterations the agent can run.
            max_chars: The maximum number of characters the agent can output.
+            sandbox: An optional initialized sandbox to run the agent in. If not provided, a default sandbox will be created based on config.
+            remind_iterations: A boolean value indicating whether to remind the agent of its remaining budget of interactions.
        """
        self.id = sid
        self.agent = agent
@ -76,8 +80,15 @@ class AgentController:
            EventStreamSubscriber.AGENT_CONTROLLER, self.on_event
        )
        self.max_iterations = max_iterations
-        self.runtime = ServerRuntime(self.id)
+
+        self.remind_iterations = remind_iterations
+        if self.remind_iterations:
+            logger.info(
+                'Iteration reminder is ENABLED: agent will be reminded of remaining turns.'
+            )
+        self.runtime = ServerRuntime(sandbox=sandbox, sid=self.id)
        self.max_chars = max_chars
+
        # Initialize agent-required plugins for sandbox (if any)
        self.runtime.init_sandbox_plugins(agent.sandbox_plugins)

@ -187,7 +198,9 @@ class AgentController:
        self.agent.reset()

    async def set_agent_state_to(self, new_state: AgentState):
-        logger.info(f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}')
+        logger.info(
+            f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}'
+        )
        if new_state == self._agent_state:
            return

@ -201,7 +214,11 @@ class AgentController:
            self._cur_step += 1
            if self.agent_task is not None:
                self.agent_task.cancel()
-        elif new_state == AgentState.STOPPED or new_state == AgentState.ERROR or new_state == AgentState.FINISHED:
+        elif (
+            new_state == AgentState.STOPPED
+            or new_state == AgentState.ERROR
+            or new_state == AgentState.FINISHED
+        ):
            await self.reset_task()

        await self.event_stream.add_event(
@ -225,6 +242,17 @@ class AgentController:
        task = action.inputs.get('task') or ''
        await self.delegate.setup_task(task, action.inputs)

+    def add_iteration_reminder_when_needed(self, i: int, obs: Observation):
+        """Add iteration reminder to the observation if needed.
+
+        Args:
+            i: The current iteration number (0-indexed).
+            obs: The observation to add the reminder to.
+        """
+        if self.remind_iterations:
+            obs.content += f'\n\nENVIRONMENT REMINDER: You have {self.max_iterations - i - 1} turns left to complete the task.'
+        return obs
+
    async def step(self, i: int) -> bool:
        if self.state is None:
            raise ValueError('No task to run')
@ -265,6 +293,7 @@ class AgentController:
        if isinstance(action, AgentFinishAction):
            self.state.outputs = action.outputs  # type: ignore[attr-defined]
            logger.info(action, extra={'msg_type': 'INFO'})
+            await self.add_history(action, NullObservation(''))
            return True
        elif isinstance(action, MessageAction) and action.wait_for_response:
            # FIXME: remove this once history is managed outside the agent controller
@ -280,6 +309,7 @@ class AgentController:
        elif not isinstance(observation, ErrorObservation):
            observation = await self.runtime.run_action(action)

+        observation = self.add_iteration_reminder_when_needed(i, observation)
        if not isinstance(observation, NullObservation):
            logger.info(observation, extra={'msg_type': 'OBSERVATION'})
        await self.add_history(action, observation)
--- a/opendevin/core/config.py
+++ b/opendevin/core/config.py
@ -82,6 +82,7 @@ class AppConfig(metaclass=Singleton):
    )
    run_as_devin: bool = True
    max_iterations: int = 100
+    remind_iterations: bool = False
    e2b_api_key: str = ''
    sandbox_type: str = 'ssh'  # Can be 'ssh', 'exec', or 'e2b'
    use_host_network: bool = False
--- a/opendevin/core/schema/config.py
+++ b/opendevin/core/schema/config.py
@ -31,6 +31,7 @@ class ConfigType(str, Enum):
    AGENT_MEMORY_MAX_THREADS = 'AGENT_MEMORY_MAX_THREADS'
    AGENT_MEMORY_ENABLED = 'AGENT_MEMORY_ENABLED'
    MAX_ITERATIONS = 'MAX_ITERATIONS'
+    REMIND_ITERATIONS = 'REMIND_ITERATIONS'
    MAX_CHARS = 'MAX_CHARS'
    AGENT = 'AGENT'
    E2B_API_KEY = 'E2B_API_KEY'
--- a/opendevin/runtime/runtime.py
+++ b/opendevin/runtime/runtime.py
@ -51,14 +51,17 @@ class Runtime:
    """

    sid: str
-    sandbox: Sandbox

    def __init__(
        self,
+        sandbox: Sandbox | None = None,
        sid: str = 'default',
    ):
        self.sid = sid
-        self.sandbox = create_sandbox(sid, config.sandbox_type)
+        if sandbox is None:
+            self.sandbox = create_sandbox(sid, config.sandbox_type)
+        else:
+            self.sandbox = sandbox
        self.browser = BrowserEnv()

    def init_sandbox_plugins(self, plugins: list[PluginRequirement]) -> None:
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@ -20,11 +20,10 @@ not possible with benchmarks.

 Known limitations:
 1. To avoid the potential impact of non-determinism, we remove all special
-characters and numbers (often used as PIDs) when doing the comparison. If two
-prompts for the same task only differ in non-alpha characters, a wrong mock
-response might be picked up.
-2. It is required that the agent itself doesn't do anything non-deterministic,
-including but not limited to using randomly generated numbers.
+characters when doing the comparison. If two prompts for the same task only
+differ in non-alphanumeric characters, a wrong mock response might be picked up.
+2. Everything must be deterministic. For example, the agent must not use
+randomly generated numbers.

 The folder is organised as follows:

--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@ -11,7 +11,7 @@ workspace_path = os.getenv('WORKSPACE_BASE')


 def filter_out_symbols(input):
-    return ' '.join([char for char in input if char.isalpha()])
+    return ' '.join([char for char in input if char.isalnum()])


 def get_log_id(prompt_log_name):
@ -100,7 +100,7 @@ def patch_completion(monkeypatch, request):
        monkeypatch.setattr('sys.stdin', user_responses)


-def clean_up():
+def set_up():
    assert workspace_path is not None
    if os.path.exists(workspace_path):
        for file in os.listdir(workspace_path):
@ -109,7 +109,7 @@ def clean_up():

@pytest.fixture(autouse=True)
 def resource_setup():
-    clean_up()
+    set_up()
    if not os.path.exists(workspace_path):
        os.makedirs(workspace_path)
    # Yield to test execution
--- a/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
+++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
@ -229,9 +229,10 @@ open bad.txt
 ----------

 OBSERVATION:
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typoo.
-2:Really?
-3:No mor typos!
+[File: /workspace/bad.txt (4 lines total)]
+1:This is a stupid typoo.
+2:Really?
+3:No mor typos!
 4:Enjoy!
+ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
 [Command -1 finished with exit code 0]]
--- a/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log
+++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log
@ -229,11 +229,12 @@ open bad.txt
 ----------

 OBSERVATION:
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typoo.
-2:Really?
-3:No mor typos!
+[File: /workspace/bad.txt (4 lines total)]
+1:This is a stupid typoo.
+2:Really?
+3:No mor typos!
 4:Enjoy!
+ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
 [Command -1 finished with exit code 0]]

 ----------
@ -257,22 +258,23 @@ end_of_edit
 ----------

 OBSERVATION:
-> This is a stupid typo.
-> EOF
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typo.
-2:Really?
-3:No mor typos!
-4:Enjoy!
-File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 <<EOF
-> No more typos!
-> EOF
-
-
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typo.
-2:Really?
-3:No more typos!
-4:Enjoy!
+> This is a stupid typo.
+> EOF
+[File: /workspace/bad.txt (4 lines total)]
+1:This is a stupid typo.
+2:Really?
+3:No mor typos!
+4:Enjoy!
+File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 <<EOF
+> No more typos!
+> EOF
+
+
+[File: /workspace/bad.txt (4 lines total)]
+1:This is a stupid typo.
+2:Really?
+3:No more typos!
+4:Enjoy!
 File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+ENVIRONMENT REMINDER: You have 8 turns left to complete the task.
 [Command -1 finished with exit code 0]]
--- a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
@ -230,4 +230,5 @@ echo "echo hello" > hello.sh

 OBSERVATION:

+ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
 [Command -1 finished with exit code 0]]
--- a/tests/integration/regenerate.sh
+++ b/tests/integration/regenerate.sh
@ -4,20 +4,26 @@ set -eo pipefail
 WORKSPACE_MOUNT_PATH=$(pwd)/_test_workspace
 WORKSPACE_BASE=$(pwd)/_test_workspace
 SANDBOX_TYPE="ssh"
+MAX_ITERATIONS=10

 # FIXME: SWEAgent hangs, so it goes last
 agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent")
+# only enable iteration reminder for CodeActAgent in tests
+remind_iterations_config=(false true false false)
 tasks=("Fix typos in bad.txt." "Write a shell script 'hello.sh' that prints 'hello'.")
 test_names=("test_edits" "test_write_simple_script")

 num_of_tests=${#tasks[@]}
+num_of_agents=${#agents[@]}

 rm -rf logs
 rm -rf _test_workspace
 for ((i = 0; i < num_of_tests; i++)); do
  task=${tasks[i]}
  test_name=${test_names[i]}
-  for agent in "${agents[@]}"; do
+  for ((j = 0; j < num_of_agents; j++)); do
+    agent=${agents[j]}
+    remind_iterations=${remind_iterations_config[j]}
    echo -e "\n\n\n\n========Running $test_name for $agent========\n\n\n\n"
    rm -rf $WORKSPACE_BASE
    mkdir $WORKSPACE_BASE
@ -28,7 +34,8 @@ for ((i = 0; i < num_of_tests; i++)); do
      # Temporarily disable 'exit on error'
      set +e
    fi
-    SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE MAX_ITERATIONS=10 \
+    SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \
+      MAX_ITERATIONS=$MAX_ITERATIONS REMIND_ITERATIONS=$remind_iterations \
      WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
      poetry run pytest -s ./tests/integration/test_agent.py::$test_name
    TEST_STATUS=$?
@ -43,10 +50,10 @@ for ((i = 0; i < num_of_tests; i++)); do
      rm -rf logs
      rm -rf tests/integration/mock/$agent/$test_name/*
      echo -e "/exit\n" | SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \
-        DEBUG=true \
+        DEBUG=true REMIND_ITERATIONS=$remind_iterations \
        WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
        poetry run python ./opendevin/core/main.py \
-        -i 10 \
+        -i $MAX_ITERATIONS \
        -t "$task Do not ask me for confirmation at any point." \
        -c $agent

--- a/tests/integration/test_agent.py
+++ b/tests/integration/test_agent.py
@ -54,7 +54,6 @@ def test_edits():
        dest_file = os.path.join(workspace_base, file)
        if os.path.exists(dest_file):
            os.remove(dest_file)
-        print('source = ', os.path.join(source_dir, file), ' dest = ', dest_file)
        shutil.copy(os.path.join(source_dir, file), dest_file)

    # Execute the task