From 8bfae8413e74e1b6095d5380a7ea93b6b73a4537 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 12 May 2024 15:57:33 +0800 Subject: [PATCH] Support passing sandbox as argument and iteration reminder (#1730) * support custom sandbox; add iteration_reminder * Enable iteration reminder in CodeActAgent integration test * Don't remove numbers when comparing prompts * Update tests/integration/README.md --------- Co-authored-by: Boxuan Li --- .github/workflows/run-integration-tests.yml | 1 + opendevin/controller/agent_controller.py | 38 ++++++++++++++-- opendevin/core/config.py | 1 + opendevin/core/schema/config.py | 1 + opendevin/runtime/runtime.py | 7 ++- tests/integration/README.md | 9 ++-- tests/integration/conftest.py | 6 +-- .../CodeActAgent/test_edits/prompt_002.log | 9 ++-- .../CodeActAgent/test_edits/prompt_003.log | 44 ++++++++++--------- .../test_write_simple_script/prompt_002.log | 1 + tests/integration/regenerate.sh | 15 +++++-- tests/integration/test_agent.py | 1 - 12 files changed, 89 insertions(+), 44 deletions(-) diff --git a/.github/workflows/run-integration-tests.yml b/.github/workflows/run-integration-tests.yml index 82b4a9355a..76c6fa4620 100644 --- a/.github/workflows/run-integration-tests.yml +++ b/.github/workflows/run-integration-tests.yml @@ -57,6 +57,7 @@ jobs: AGENT: ${{ matrix.agent }} MAX_ITERATIONS: 10 LLM_EMBEDDING_MODEL: ${{ matrix.embedding-model }} + REMIND_ITERATIONS: ${{ (matrix.agent == 'CodeActAgent') && 'true' || 'false' }} run: | rm -rf workspace mkdir workspace diff --git a/opendevin/controller/agent_controller.py b/opendevin/controller/agent_controller.py index 43ea966a81..d4bfb6513d 100644 --- a/opendevin/controller/agent_controller.py +++ b/opendevin/controller/agent_controller.py @@ -33,7 +33,7 @@ from opendevin.events.observation import ( Observation, ) from opendevin.events.stream import EventSource, EventStream, EventStreamSubscriber -from opendevin.runtime import DockerSSHBox +from opendevin.runtime import DockerSSHBox, Sandbox from opendevin.runtime.runtime import Runtime from opendevin.runtime.server.runtime import ServerRuntime @@ -60,6 +60,8 @@ class AgentController: sid: str = 'default', max_iterations: int = MAX_ITERATIONS, max_chars: int = MAX_CHARS, + sandbox: Optional[Sandbox] = None, + remind_iterations: bool = config.remind_iterations, ): """Initializes a new instance of the AgentController class. @@ -68,6 +70,8 @@ class AgentController: sid: The session ID of the agent. max_iterations: The maximum number of iterations the agent can run. max_chars: The maximum number of characters the agent can output. + sandbox: An optional initialized sandbox to run the agent in. If not provided, a default sandbox will be created based on config. + remind_iterations: A boolean value indicating whether to remind the agent its remaining budget of interaction. """ self.id = sid self.agent = agent @@ -76,8 +80,15 @@ class AgentController: EventStreamSubscriber.AGENT_CONTROLLER, self.on_event ) self.max_iterations = max_iterations - self.runtime = ServerRuntime(self.id) + + self.remind_iterations = remind_iterations + if self.remind_iterations: + logger.info( + 'Iteration reminder is ENABLED: agent will be reminded of remaining turns.' + ) + self.runtime = ServerRuntime(sandbox=sandbox, sid=self.id) self.max_chars = max_chars + # Initialize agent-required plugins for sandbox (if any) self.runtime.init_sandbox_plugins(agent.sandbox_plugins) @@ -187,7 +198,9 @@ class AgentController: self.agent.reset() async def set_agent_state_to(self, new_state: AgentState): - logger.info(f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}') + logger.info( + f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}' + ) if new_state == self._agent_state: return @@ -201,7 +214,11 @@ class AgentController: self._cur_step += 1 if self.agent_task is not None: self.agent_task.cancel() - elif new_state == AgentState.STOPPED or new_state == AgentState.ERROR or new_state == AgentState.FINISHED: + elif ( + new_state == AgentState.STOPPED + or new_state == AgentState.ERROR + or new_state == AgentState.FINISHED + ): await self.reset_task() await self.event_stream.add_event( @@ -225,6 +242,17 @@ class AgentController: task = action.inputs.get('task') or '' await self.delegate.setup_task(task, action.inputs) + def add_iteration_reminder_when_needed(self, i: int, obs: Observation): + """Add iteration reminder to the observation if needed. + + Args: + i: The current iteration number (0-indexed). + obs: The observation to add the reminder to. + """ + if self.remind_iterations: + obs.content += f'\n\nENVIRONMENT REMINDER: You have {self.max_iterations - i - 1} turns left to complete the task.' + return obs + async def step(self, i: int) -> bool: if self.state is None: raise ValueError('No task to run') @@ -265,6 +293,7 @@ class AgentController: if isinstance(action, AgentFinishAction): self.state.outputs = action.outputs # type: ignore[attr-defined] logger.info(action, extra={'msg_type': 'INFO'}) + await self.add_history(action, NullObservation('')) return True elif isinstance(action, MessageAction) and action.wait_for_response: # FIXME: remove this once history is managed outside the agent controller @@ -280,6 +309,7 @@ class AgentController: elif not isinstance(observation, ErrorObservation): observation = await self.runtime.run_action(action) + observation = self.add_iteration_reminder_when_needed(i, observation) if not isinstance(observation, NullObservation): logger.info(observation, extra={'msg_type': 'OBSERVATION'}) await self.add_history(action, observation) diff --git a/opendevin/core/config.py b/opendevin/core/config.py index f453ce5cd7..030b054ce8 100644 --- a/opendevin/core/config.py +++ b/opendevin/core/config.py @@ -82,6 +82,7 @@ class AppConfig(metaclass=Singleton): ) run_as_devin: bool = True max_iterations: int = 100 + remind_iterations: bool = False e2b_api_key: str = '' sandbox_type: str = 'ssh' # Can be 'ssh', 'exec', or 'e2b' use_host_network: bool = False diff --git a/opendevin/core/schema/config.py b/opendevin/core/schema/config.py index f409b9940a..a58ef8ade4 100644 --- a/opendevin/core/schema/config.py +++ b/opendevin/core/schema/config.py @@ -31,6 +31,7 @@ class ConfigType(str, Enum): AGENT_MEMORY_MAX_THREADS = 'AGENT_MEMORY_MAX_THREADS' AGENT_MEMORY_ENABLED = 'AGENT_MEMORY_ENABLED' MAX_ITERATIONS = 'MAX_ITERATIONS' + REMIND_ITERATIONS = 'REMIND_ITERATIONS' MAX_CHARS = 'MAX_CHARS' AGENT = 'AGENT' E2B_API_KEY = 'E2B_API_KEY' diff --git a/opendevin/runtime/runtime.py b/opendevin/runtime/runtime.py index 2c5735334f..25088adfa9 100644 --- a/opendevin/runtime/runtime.py +++ b/opendevin/runtime/runtime.py @@ -51,14 +51,17 @@ class Runtime: """ sid: str - sandbox: Sandbox def __init__( self, + sandbox: Sandbox | None = None, sid: str = 'default', ): self.sid = sid - self.sandbox = create_sandbox(sid, config.sandbox_type) + if sandbox is None: + self.sandbox = create_sandbox(sid, config.sandbox_type) + else: + self.sandbox = sandbox self.browser = BrowserEnv() def init_sandbox_plugins(self, plugins: list[PluginRequirement]) -> None: diff --git a/tests/integration/README.md b/tests/integration/README.md index d4d8c8dd1a..dd82d88024 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -20,11 +20,10 @@ not possible with benchmarks. Known limitations: 1. To avoid the potential impact of non-determinism, we remove all special -characters and numbers (often used as PIDs) when doing the comparison. If two -prompts for the same task only differ in non-alpha characters, a wrong mock -response might be picked up. -2. It is required that the agent itself doesn't do anything non-deterministic, -including but not limited to using randomly generated numbers. +characters when doing the comparison. If two prompts for the same task only +differ in non-alphanumeric characters, a wrong mock response might be picked up. +2. It is required that everything has to be deterministic. For example, agent +must not use randomly generated numbers. The folder is organised as follows: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 346cad6a11..5fba9302e0 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -11,7 +11,7 @@ workspace_path = os.getenv('WORKSPACE_BASE') def filter_out_symbols(input): - return ' '.join([char for char in input if char.isalpha()]) + return ' '.join([char for char in input if char.isalnum()]) def get_log_id(prompt_log_name): @@ -100,7 +100,7 @@ def patch_completion(monkeypatch, request): monkeypatch.setattr('sys.stdin', user_responses) -def clean_up(): +def set_up(): assert workspace_path is not None if os.path.exists(workspace_path): for file in os.listdir(workspace_path): @@ -109,7 +109,7 @@ def clean_up(): @pytest.fixture(autouse=True) def resource_setup(): - clean_up() + set_up() if not os.path.exists(workspace_path): os.makedirs(workspace_path) # Yield to test execution diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log index 74d51f5ebb..59bb6b5e81 100644 --- a/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log +++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log @@ -229,9 +229,10 @@ open bad.txt ---------- OBSERVATION: -[File: /workspace/bad.txt (4 lines total)] -1:This is a stupid typoo. -2:Really? -3:No mor typos! +[File: /workspace/bad.txt (4 lines total)] +1:This is a stupid typoo. +2:Really? +3:No mor typos! 4:Enjoy! +ENVIRONMENT REMINDER: You have 9 turns left to complete the task. [Command -1 finished with exit code 0]] diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log index 2f7d232698..cd7320e838 100644 --- a/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log +++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log @@ -229,11 +229,12 @@ open bad.txt ---------- OBSERVATION: -[File: /workspace/bad.txt (4 lines total)] -1:This is a stupid typoo. -2:Really? -3:No mor typos! +[File: /workspace/bad.txt (4 lines total)] +1:This is a stupid typoo. +2:Really? +3:No mor typos! 4:Enjoy! +ENVIRONMENT REMINDER: You have 9 turns left to complete the task. [Command -1 finished with exit code 0]] ---------- @@ -257,22 +258,23 @@ end_of_edit ---------- OBSERVATION: -> This is a stupid typo. -> EOF -[File: /workspace/bad.txt (4 lines total)] -1:This is a stupid typo. -2:Really? -3:No mor typos! -4:Enjoy! -File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 < No more typos! -> EOF - - -[File: /workspace/bad.txt (4 lines total)] -1:This is a stupid typo. -2:Really? -3:No more typos! -4:Enjoy! +> This is a stupid typo. +> EOF +[File: /workspace/bad.txt (4 lines total)] +1:This is a stupid typo. +2:Really? +3:No mor typos! +4:Enjoy! +File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 < No more typos! +> EOF + + +[File: /workspace/bad.txt (4 lines total)] +1:This is a stupid typo. +2:Really? +3:No more typos! +4:Enjoy! File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary. +ENVIRONMENT REMINDER: You have 8 turns left to complete the task. [Command -1 finished with exit code 0]] diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log index d28f72a101..b66209b26e 100644 --- a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log +++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log @@ -230,4 +230,5 @@ echo "echo hello" > hello.sh OBSERVATION: +ENVIRONMENT REMINDER: You have 9 turns left to complete the task. [Command -1 finished with exit code 0]] diff --git a/tests/integration/regenerate.sh b/tests/integration/regenerate.sh index f17b5d98bb..d333bf347a 100755 --- a/tests/integration/regenerate.sh +++ b/tests/integration/regenerate.sh @@ -4,20 +4,26 @@ set -eo pipefail WORKSPACE_MOUNT_PATH=$(pwd)/_test_workspace WORKSPACE_BASE=$(pwd)/_test_workspace SANDBOX_TYPE="ssh" +MAX_ITERATIONS=10 # FIXME: SWEAgent hangs, so it goes last agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent") +# only enable iteration reminder for CodeActAgent in tests +remind_iterations_config=(false true false false) tasks=("Fix typos in bad.txt." "Write a shell script 'hello.sh' that prints 'hello'.") test_names=("test_edits" "test_write_simple_script") num_of_tests=${#tasks[@]} +num_of_agents=${#agents[@]} rm -rf logs rm -rf _test_workspace for ((i = 0; i < num_of_tests; i++)); do task=${tasks[i]} test_name=${test_names[i]} - for agent in "${agents[@]}"; do + for ((j = 0; j < num_of_agents; j++)); do + agent=${agents[j]} + remind_iterations=${remind_iterations_config[j]} echo -e "\n\n\n\n========Running $test_name for $agent========\n\n\n\n" rm -rf $WORKSPACE_BASE mkdir $WORKSPACE_BASE @@ -28,7 +34,8 @@ for ((i = 0; i < num_of_tests; i++)); do # Temporarily disable 'exit on error' set +e fi - SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE MAX_ITERATIONS=10 \ + SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \ + MAX_ITERATIONS=$MAX_ITERATIONS REMIND_ITERATIONS=$remind_iterations \ WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \ poetry run pytest -s ./tests/integration/test_agent.py::$test_name TEST_STATUS=$? @@ -43,10 +50,10 @@ for ((i = 0; i < num_of_tests; i++)); do rm -rf logs rm -rf tests/integration/mock/$agent/$test_name/* echo -e "/exit\n" | SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \ - DEBUG=true \ + DEBUG=true REMIND_ITERATIONS=$remind_iterations \ WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \ poetry run python ./opendevin/core/main.py \ - -i 10 \ + -i $MAX_ITERATIONS \ -t "$task Do not ask me for confirmation at any point." \ -c $agent diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py index 3b6d474744..f99dab1220 100644 --- a/tests/integration/test_agent.py +++ b/tests/integration/test_agent.py @@ -54,7 +54,6 @@ def test_edits(): dest_file = os.path.join(workspace_base, file) if os.path.exists(dest_file): os.remove(dest_file) - print('source = ', os.path.join(source_dir, file), ' dest = ', dest_file) shutil.copy(os.path.join(source_dir, file), dest_file) # Execute the task