Support passing sandbox as argument and iteration reminder (#1730)

* support custom sandbox;
add iteration_reminder

* Enable iteration reminder in CodeActAgent integration test

* Don't remove numbers when comparing prompts

* Update tests/integration/README.md

---------

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
This commit is contained in:
Xingyao Wang 2024-05-12 15:57:33 +08:00 committed by GitHub
parent 1d58917bc8
commit 8bfae8413e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 89 additions and 44 deletions

View File

@ -57,6 +57,7 @@ jobs:
AGENT: ${{ matrix.agent }}
MAX_ITERATIONS: 10
LLM_EMBEDDING_MODEL: ${{ matrix.embedding-model }}
REMIND_ITERATIONS: ${{ (matrix.agent == 'CodeActAgent') && 'true' || 'false' }}
run: |
rm -rf workspace
mkdir workspace

View File

@ -33,7 +33,7 @@ from opendevin.events.observation import (
Observation,
)
from opendevin.events.stream import EventSource, EventStream, EventStreamSubscriber
from opendevin.runtime import DockerSSHBox
from opendevin.runtime import DockerSSHBox, Sandbox
from opendevin.runtime.runtime import Runtime
from opendevin.runtime.server.runtime import ServerRuntime
@ -60,6 +60,8 @@ class AgentController:
sid: str = 'default',
max_iterations: int = MAX_ITERATIONS,
max_chars: int = MAX_CHARS,
sandbox: Optional[Sandbox] = None,
remind_iterations: bool = config.remind_iterations,
):
"""Initializes a new instance of the AgentController class.
@ -68,6 +70,8 @@ class AgentController:
sid: The session ID of the agent.
max_iterations: The maximum number of iterations the agent can run.
max_chars: The maximum number of characters the agent can output.
sandbox: An optional initialized sandbox to run the agent in. If not provided, a default sandbox will be created based on config.
remind_iterations: A boolean value indicating whether to remind the agent of its remaining budget of interactions.
"""
self.id = sid
self.agent = agent
@ -76,8 +80,15 @@ class AgentController:
EventStreamSubscriber.AGENT_CONTROLLER, self.on_event
)
self.max_iterations = max_iterations
self.runtime = ServerRuntime(self.id)
self.remind_iterations = remind_iterations
if self.remind_iterations:
logger.info(
'Iteration reminder is ENABLED: agent will be reminded of remaining turns.'
)
self.runtime = ServerRuntime(sandbox=sandbox, sid=self.id)
self.max_chars = max_chars
# Initialize agent-required plugins for sandbox (if any)
self.runtime.init_sandbox_plugins(agent.sandbox_plugins)
@ -187,7 +198,9 @@ class AgentController:
self.agent.reset()
async def set_agent_state_to(self, new_state: AgentState):
logger.info(f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}')
logger.info(
f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}'
)
if new_state == self._agent_state:
return
@ -201,7 +214,11 @@ class AgentController:
self._cur_step += 1
if self.agent_task is not None:
self.agent_task.cancel()
elif new_state == AgentState.STOPPED or new_state == AgentState.ERROR or new_state == AgentState.FINISHED:
elif (
new_state == AgentState.STOPPED
or new_state == AgentState.ERROR
or new_state == AgentState.FINISHED
):
await self.reset_task()
await self.event_stream.add_event(
@ -225,6 +242,17 @@ class AgentController:
task = action.inputs.get('task') or ''
await self.delegate.setup_task(task, action.inputs)
def add_iteration_reminder_when_needed(self, i: int, obs: Observation):
    """Append a remaining-turns notice to an observation when reminders are on.

    Args:
        i: The current iteration number (0-indexed).
        obs: The observation whose ``content`` receives the notice.

    Returns:
        The same ``obs`` object that was passed in. Its ``content`` is
        extended in place only when ``self.remind_iterations`` is truthy.
    """
    if not self.remind_iterations:
        return obs

    # ``i`` is 0-indexed, so the turn that just ran counts as used.
    turns_left = self.max_iterations - i - 1
    obs.content += f'\n\nENVIRONMENT REMINDER: You have {turns_left} turns left to complete the task.'
    return obs
async def step(self, i: int) -> bool:
if self.state is None:
raise ValueError('No task to run')
@ -265,6 +293,7 @@ class AgentController:
if isinstance(action, AgentFinishAction):
self.state.outputs = action.outputs # type: ignore[attr-defined]
logger.info(action, extra={'msg_type': 'INFO'})
await self.add_history(action, NullObservation(''))
return True
elif isinstance(action, MessageAction) and action.wait_for_response:
# FIXME: remove this once history is managed outside the agent controller
@ -280,6 +309,7 @@ class AgentController:
elif not isinstance(observation, ErrorObservation):
observation = await self.runtime.run_action(action)
observation = self.add_iteration_reminder_when_needed(i, observation)
if not isinstance(observation, NullObservation):
logger.info(observation, extra={'msg_type': 'OBSERVATION'})
await self.add_history(action, observation)

View File

@ -82,6 +82,7 @@ class AppConfig(metaclass=Singleton):
)
run_as_devin: bool = True
max_iterations: int = 100
remind_iterations: bool = False
e2b_api_key: str = ''
sandbox_type: str = 'ssh' # Can be 'ssh', 'exec', or 'e2b'
use_host_network: bool = False

View File

@ -31,6 +31,7 @@ class ConfigType(str, Enum):
AGENT_MEMORY_MAX_THREADS = 'AGENT_MEMORY_MAX_THREADS'
AGENT_MEMORY_ENABLED = 'AGENT_MEMORY_ENABLED'
MAX_ITERATIONS = 'MAX_ITERATIONS'
REMIND_ITERATIONS = 'REMIND_ITERATIONS'
MAX_CHARS = 'MAX_CHARS'
AGENT = 'AGENT'
E2B_API_KEY = 'E2B_API_KEY'

View File

@ -51,14 +51,17 @@ class Runtime:
"""
sid: str
sandbox: Sandbox
def __init__(
    self,
    sandbox: Sandbox | None = None,
    sid: str = 'default',
):
    """Set up the runtime for one session.

    Args:
        sandbox: An already-initialized sandbox to use. When None, a
            sandbox is created with ``create_sandbox(sid, config.sandbox_type)``.
        sid: The session ID; stored on the runtime and passed to
            ``create_sandbox`` when a sandbox has to be created.
    """
    self.sid = sid
    # Create a sandbox only when the caller did not supply one, so a
    # provided sandbox is never preceded by a throwaway instance.
    if sandbox is None:
        self.sandbox = create_sandbox(sid, config.sandbox_type)
    else:
        self.sandbox = sandbox
    self.browser = BrowserEnv()
def init_sandbox_plugins(self, plugins: list[PluginRequirement]) -> None:

View File

@ -20,11 +20,10 @@ not possible with benchmarks.
Known limitations:
1. To avoid the potential impact of non-determinism, we remove all special
characters and numbers (often used as PIDs) when doing the comparison. If two
prompts for the same task only differ in non-alpha characters, a wrong mock
response might be picked up.
2. It is required that the agent itself doesn't do anything non-deterministic,
including but not limited to using randomly generated numbers.
characters when doing the comparison. If two prompts for the same task only
differ in non-alphanumeric characters, a wrong mock response might be picked up.
2. Everything must be deterministic. For example, the agent
must not use randomly generated numbers.
The folder is organised as follows:

View File

@ -11,7 +11,7 @@ workspace_path = os.getenv('WORKSPACE_BASE')
def filter_out_symbols(input):
    """Reduce text to its alphanumeric characters for prompt comparison.

    Every character for which ``str.isalnum()`` is false (punctuation,
    whitespace, other symbols) is dropped. Digits are kept, so prompts
    that differ only in numbers do not compare as equal.

    Args:
        input: The text to normalise. The name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        The alphanumeric characters of ``input``, in their original order,
        joined by single spaces.
    """
    return ' '.join(char for char in input if char.isalnum())
def get_log_id(prompt_log_name):
@ -100,7 +100,7 @@ def patch_completion(monkeypatch, request):
monkeypatch.setattr('sys.stdin', user_responses)
def clean_up():
def set_up():
assert workspace_path is not None
if os.path.exists(workspace_path):
for file in os.listdir(workspace_path):
@ -109,7 +109,7 @@ def clean_up():
@pytest.fixture(autouse=True)
def resource_setup():
clean_up()
set_up()
if not os.path.exists(workspace_path):
os.makedirs(workspace_path)
# Yield to test execution

View File

@ -229,9 +229,10 @@ open bad.txt
----------
OBSERVATION:
[File: /workspace/bad.txt (4 lines total)]
1:This is a stupid typoo.
2:Really?
3:No mor typos!
[File: /workspace/bad.txt (4 lines total)]
1:This is a stupid typoo.
2:Really?
3:No mor typos!
4:Enjoy!
ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
[Command -1 finished with exit code 0]]

View File

@ -229,11 +229,12 @@ open bad.txt
----------
OBSERVATION:
[File: /workspace/bad.txt (4 lines total)]
1:This is a stupid typoo.
2:Really?
3:No mor typos!
[File: /workspace/bad.txt (4 lines total)]
1:This is a stupid typoo.
2:Really?
3:No mor typos!
4:Enjoy!
ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
[Command -1 finished with exit code 0]]
----------
@ -257,22 +258,23 @@ end_of_edit
----------
OBSERVATION:
> This is a stupid typo.
> EOF
[File: /workspace/bad.txt (4 lines total)]
1:This is a stupid typo.
2:Really?
3:No mor typos!
4:Enjoy!
File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 <<EOF
> No more typos!
> EOF
[File: /workspace/bad.txt (4 lines total)]
1:This is a stupid typo.
2:Really?
3:No more typos!
4:Enjoy!
> This is a stupid typo.
> EOF
[File: /workspace/bad.txt (4 lines total)]
1:This is a stupid typo.
2:Really?
3:No mor typos!
4:Enjoy!
File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 <<EOF
> No more typos!
> EOF
[File: /workspace/bad.txt (4 lines total)]
1:This is a stupid typo.
2:Really?
3:No more typos!
4:Enjoy!
File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
ENVIRONMENT REMINDER: You have 8 turns left to complete the task.
[Command -1 finished with exit code 0]]

View File

@ -230,4 +230,5 @@ echo "echo hello" > hello.sh
OBSERVATION:
ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
[Command -1 finished with exit code 0]]

View File

@ -4,20 +4,26 @@ set -eo pipefail
WORKSPACE_MOUNT_PATH=$(pwd)/_test_workspace
WORKSPACE_BASE=$(pwd)/_test_workspace
SANDBOX_TYPE="ssh"
MAX_ITERATIONS=10
# FIXME: SWEAgent hangs, so it goes last
agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent")
# only enable iteration reminder for CodeActAgent in tests
remind_iterations_config=(false true false false)
tasks=("Fix typos in bad.txt." "Write a shell script 'hello.sh' that prints 'hello'.")
test_names=("test_edits" "test_write_simple_script")
num_of_tests=${#tasks[@]}
num_of_agents=${#agents[@]}
rm -rf logs
rm -rf _test_workspace
for ((i = 0; i < num_of_tests; i++)); do
task=${tasks[i]}
test_name=${test_names[i]}
for agent in "${agents[@]}"; do
for ((j = 0; j < num_of_agents; j++)); do
agent=${agents[j]}
remind_iterations=${remind_iterations_config[j]}
echo -e "\n\n\n\n========Running $test_name for $agent========\n\n\n\n"
rm -rf $WORKSPACE_BASE
mkdir $WORKSPACE_BASE
@ -28,7 +34,8 @@ for ((i = 0; i < num_of_tests; i++)); do
# Temporarily disable 'exit on error'
set +e
fi
SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE MAX_ITERATIONS=10 \
SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \
MAX_ITERATIONS=$MAX_ITERATIONS REMIND_ITERATIONS=$remind_iterations \
WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
poetry run pytest -s ./tests/integration/test_agent.py::$test_name
TEST_STATUS=$?
@ -43,10 +50,10 @@ for ((i = 0; i < num_of_tests; i++)); do
rm -rf logs
rm -rf tests/integration/mock/$agent/$test_name/*
echo -e "/exit\n" | SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \
DEBUG=true \
DEBUG=true REMIND_ITERATIONS=$remind_iterations \
WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
poetry run python ./opendevin/core/main.py \
-i 10 \
-i $MAX_ITERATIONS \
-t "$task Do not ask me for confirmation at any point." \
-c $agent

View File

@ -54,7 +54,6 @@ def test_edits():
dest_file = os.path.join(workspace_base, file)
if os.path.exists(dest_file):
os.remove(dest_file)
print('source = ', os.path.join(source_dir, file), ' dest = ', dest_file)
shutil.copy(os.path.join(source_dir, file), dest_file)
# Execute the task