mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Support passing sandbox as argument and iteration reminder (#1730)
* support custom sandbox; add iteration_reminder * Enable iteration reminder in CodeActAgent integration test * Don't remove numbers when comparing prompts * Update tests/integration/README.md --------- Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
This commit is contained in:
parent
1d58917bc8
commit
8bfae8413e
1
.github/workflows/run-integration-tests.yml
vendored
1
.github/workflows/run-integration-tests.yml
vendored
@ -57,6 +57,7 @@ jobs:
|
||||
AGENT: ${{ matrix.agent }}
|
||||
MAX_ITERATIONS: 10
|
||||
LLM_EMBEDDING_MODEL: ${{ matrix.embedding-model }}
|
||||
REMIND_ITERATIONS: ${{ (matrix.agent == 'CodeActAgent') && 'true' || 'false' }}
|
||||
run: |
|
||||
rm -rf workspace
|
||||
mkdir workspace
|
||||
|
||||
@ -33,7 +33,7 @@ from opendevin.events.observation import (
|
||||
Observation,
|
||||
)
|
||||
from opendevin.events.stream import EventSource, EventStream, EventStreamSubscriber
|
||||
from opendevin.runtime import DockerSSHBox
|
||||
from opendevin.runtime import DockerSSHBox, Sandbox
|
||||
from opendevin.runtime.runtime import Runtime
|
||||
from opendevin.runtime.server.runtime import ServerRuntime
|
||||
|
||||
@ -60,6 +60,8 @@ class AgentController:
|
||||
sid: str = 'default',
|
||||
max_iterations: int = MAX_ITERATIONS,
|
||||
max_chars: int = MAX_CHARS,
|
||||
sandbox: Optional[Sandbox] = None,
|
||||
remind_iterations: bool = config.remind_iterations,
|
||||
):
|
||||
"""Initializes a new instance of the AgentController class.
|
||||
|
||||
@ -68,6 +70,8 @@ class AgentController:
|
||||
sid: The session ID of the agent.
|
||||
max_iterations: The maximum number of iterations the agent can run.
|
||||
max_chars: The maximum number of characters the agent can output.
|
||||
sandbox: An optional initialized sandbox to run the agent in. If not provided, a default sandbox will be created based on config.
|
||||
remind_iterations: Whether to remind the agent of its remaining number of turns after each observation.
|
||||
"""
|
||||
self.id = sid
|
||||
self.agent = agent
|
||||
@ -76,8 +80,15 @@ class AgentController:
|
||||
EventStreamSubscriber.AGENT_CONTROLLER, self.on_event
|
||||
)
|
||||
self.max_iterations = max_iterations
|
||||
self.runtime = ServerRuntime(self.id)
|
||||
|
||||
self.remind_iterations = remind_iterations
|
||||
if self.remind_iterations:
|
||||
logger.info(
|
||||
'Iteration reminder is ENABLED: agent will be reminded of remaining turns.'
|
||||
)
|
||||
self.runtime = ServerRuntime(sandbox=sandbox, sid=self.id)
|
||||
self.max_chars = max_chars
|
||||
|
||||
# Initialize agent-required plugins for sandbox (if any)
|
||||
self.runtime.init_sandbox_plugins(agent.sandbox_plugins)
|
||||
|
||||
@ -187,7 +198,9 @@ class AgentController:
|
||||
self.agent.reset()
|
||||
|
||||
async def set_agent_state_to(self, new_state: AgentState):
|
||||
logger.info(f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}')
|
||||
logger.info(
|
||||
f'Setting agent({type(self.agent).__name__}) state from {self._agent_state} to {new_state}'
|
||||
)
|
||||
if new_state == self._agent_state:
|
||||
return
|
||||
|
||||
@ -201,7 +214,11 @@ class AgentController:
|
||||
self._cur_step += 1
|
||||
if self.agent_task is not None:
|
||||
self.agent_task.cancel()
|
||||
elif new_state == AgentState.STOPPED or new_state == AgentState.ERROR or new_state == AgentState.FINISHED:
|
||||
elif (
|
||||
new_state == AgentState.STOPPED
|
||||
or new_state == AgentState.ERROR
|
||||
or new_state == AgentState.FINISHED
|
||||
):
|
||||
await self.reset_task()
|
||||
|
||||
await self.event_stream.add_event(
|
||||
@ -225,6 +242,17 @@ class AgentController:
|
||||
task = action.inputs.get('task') or ''
|
||||
await self.delegate.setup_task(task, action.inputs)
|
||||
|
||||
def add_iteration_reminder_when_needed(self, i: int, obs: Observation):
|
||||
"""Add iteration reminder to the observation if needed.
|
||||
|
||||
Args:
|
||||
i: The current iteration number (0-indexed).
|
||||
obs: The observation to add the reminder to.
|
||||
"""
|
||||
if self.remind_iterations:
|
||||
obs.content += f'\n\nENVIRONMENT REMINDER: You have {self.max_iterations - i - 1} turns left to complete the task.'
|
||||
return obs
|
||||
|
||||
async def step(self, i: int) -> bool:
|
||||
if self.state is None:
|
||||
raise ValueError('No task to run')
|
||||
@ -265,6 +293,7 @@ class AgentController:
|
||||
if isinstance(action, AgentFinishAction):
|
||||
self.state.outputs = action.outputs # type: ignore[attr-defined]
|
||||
logger.info(action, extra={'msg_type': 'INFO'})
|
||||
await self.add_history(action, NullObservation(''))
|
||||
return True
|
||||
elif isinstance(action, MessageAction) and action.wait_for_response:
|
||||
# FIXME: remove this once history is managed outside the agent controller
|
||||
@ -280,6 +309,7 @@ class AgentController:
|
||||
elif not isinstance(observation, ErrorObservation):
|
||||
observation = await self.runtime.run_action(action)
|
||||
|
||||
observation = self.add_iteration_reminder_when_needed(i, observation)
|
||||
if not isinstance(observation, NullObservation):
|
||||
logger.info(observation, extra={'msg_type': 'OBSERVATION'})
|
||||
await self.add_history(action, observation)
|
||||
|
||||
@ -82,6 +82,7 @@ class AppConfig(metaclass=Singleton):
|
||||
)
|
||||
run_as_devin: bool = True
|
||||
max_iterations: int = 100
|
||||
remind_iterations: bool = False
|
||||
e2b_api_key: str = ''
|
||||
sandbox_type: str = 'ssh' # Can be 'ssh', 'exec', or 'e2b'
|
||||
use_host_network: bool = False
|
||||
|
||||
@ -31,6 +31,7 @@ class ConfigType(str, Enum):
|
||||
AGENT_MEMORY_MAX_THREADS = 'AGENT_MEMORY_MAX_THREADS'
|
||||
AGENT_MEMORY_ENABLED = 'AGENT_MEMORY_ENABLED'
|
||||
MAX_ITERATIONS = 'MAX_ITERATIONS'
|
||||
REMIND_ITERATIONS = 'REMIND_ITERATIONS'
|
||||
MAX_CHARS = 'MAX_CHARS'
|
||||
AGENT = 'AGENT'
|
||||
E2B_API_KEY = 'E2B_API_KEY'
|
||||
|
||||
@ -51,14 +51,17 @@ class Runtime:
|
||||
"""
|
||||
|
||||
sid: str
|
||||
sandbox: Sandbox
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
sandbox: Sandbox | None = None,
|
||||
sid: str = 'default',
|
||||
):
|
||||
self.sid = sid
|
||||
self.sandbox = create_sandbox(sid, config.sandbox_type)
|
||||
if sandbox is None:
|
||||
self.sandbox = create_sandbox(sid, config.sandbox_type)
|
||||
else:
|
||||
self.sandbox = sandbox
|
||||
self.browser = BrowserEnv()
|
||||
|
||||
def init_sandbox_plugins(self, plugins: list[PluginRequirement]) -> None:
|
||||
|
||||
@ -20,11 +20,10 @@ not possible with benchmarks.
|
||||
|
||||
Known limitations:
|
||||
1. To avoid the potential impact of non-determinism, we remove all special
|
||||
characters and numbers (often used as PIDs) when doing the comparison. If two
|
||||
prompts for the same task only differ in non-alpha characters, a wrong mock
|
||||
response might be picked up.
|
||||
2. It is required that the agent itself doesn't do anything non-deterministic,
|
||||
including but not limited to using randomly generated numbers.
|
||||
characters when doing the comparison. If two prompts for the same task only
|
||||
differ in non-alphanumeric characters, a wrong mock response might be picked up.
|
||||
2. Everything must be deterministic. For example, the agent
|
||||
must not use randomly generated numbers.
|
||||
|
||||
The folder is organised as follows:
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@ workspace_path = os.getenv('WORKSPACE_BASE')
|
||||
|
||||
|
||||
def filter_out_symbols(input):
|
||||
return ' '.join([char for char in input if char.isalpha()])
|
||||
return ' '.join([char for char in input if char.isalnum()])
|
||||
|
||||
|
||||
def get_log_id(prompt_log_name):
|
||||
@ -100,7 +100,7 @@ def patch_completion(monkeypatch, request):
|
||||
monkeypatch.setattr('sys.stdin', user_responses)
|
||||
|
||||
|
||||
def clean_up():
|
||||
def set_up():
|
||||
assert workspace_path is not None
|
||||
if os.path.exists(workspace_path):
|
||||
for file in os.listdir(workspace_path):
|
||||
@ -109,7 +109,7 @@ def clean_up():
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def resource_setup():
|
||||
clean_up()
|
||||
set_up()
|
||||
if not os.path.exists(workspace_path):
|
||||
os.makedirs(workspace_path)
|
||||
# Yield to test execution
|
||||
|
||||
@ -229,9 +229,10 @@ open bad.txt
|
||||
----------
|
||||
|
||||
OBSERVATION:
|
||||
[File: /workspace/bad.txt (4 lines total)]
|
||||
1:This is a stupid typoo.
|
||||
2:Really?
|
||||
3:No mor typos!
|
||||
[File: /workspace/bad.txt (4 lines total)]
|
||||
1:This is a stupid typoo.
|
||||
2:Really?
|
||||
3:No mor typos!
|
||||
4:Enjoy!
|
||||
ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
|
||||
[Command -1 finished with exit code 0]]
|
||||
|
||||
@ -229,11 +229,12 @@ open bad.txt
|
||||
----------
|
||||
|
||||
OBSERVATION:
|
||||
[File: /workspace/bad.txt (4 lines total)]
|
||||
1:This is a stupid typoo.
|
||||
2:Really?
|
||||
3:No mor typos!
|
||||
[File: /workspace/bad.txt (4 lines total)]
|
||||
1:This is a stupid typoo.
|
||||
2:Really?
|
||||
3:No mor typos!
|
||||
4:Enjoy!
|
||||
ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
|
||||
[Command -1 finished with exit code 0]]
|
||||
|
||||
----------
|
||||
@ -257,22 +258,23 @@ end_of_edit
|
||||
----------
|
||||
|
||||
OBSERVATION:
|
||||
> This is a stupid typo.
|
||||
> EOF
|
||||
[File: /workspace/bad.txt (4 lines total)]
|
||||
1:This is a stupid typo.
|
||||
2:Really?
|
||||
3:No mor typos!
|
||||
4:Enjoy!
|
||||
File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 <<EOF
|
||||
> No more typos!
|
||||
> EOF
|
||||
|
||||
|
||||
[File: /workspace/bad.txt (4 lines total)]
|
||||
1:This is a stupid typo.
|
||||
2:Really?
|
||||
3:No more typos!
|
||||
4:Enjoy!
|
||||
> This is a stupid typo.
|
||||
> EOF
|
||||
[File: /workspace/bad.txt (4 lines total)]
|
||||
1:This is a stupid typo.
|
||||
2:Really?
|
||||
3:No mor typos!
|
||||
4:Enjoy!
|
||||
File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.edit 3:3 <<EOF
|
||||
> No more typos!
|
||||
> EOF
|
||||
|
||||
|
||||
[File: /workspace/bad.txt (4 lines total)]
|
||||
1:This is a stupid typo.
|
||||
2:Really?
|
||||
3:No more typos!
|
||||
4:Enjoy!
|
||||
File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
|
||||
ENVIRONMENT REMINDER: You have 8 turns left to complete the task.
|
||||
[Command -1 finished with exit code 0]]
|
||||
|
||||
@ -230,4 +230,5 @@ echo "echo hello" > hello.sh
|
||||
|
||||
OBSERVATION:
|
||||
|
||||
ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
|
||||
[Command -1 finished with exit code 0]]
|
||||
|
||||
@ -4,20 +4,26 @@ set -eo pipefail
|
||||
WORKSPACE_MOUNT_PATH=$(pwd)/_test_workspace
|
||||
WORKSPACE_BASE=$(pwd)/_test_workspace
|
||||
SANDBOX_TYPE="ssh"
|
||||
MAX_ITERATIONS=10
|
||||
|
||||
# FIXME: SWEAgent hangs, so it goes last
|
||||
agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent")
|
||||
# only enable iteration reminder for CodeActAgent in tests
|
||||
remind_iterations_config=(false true false false)
|
||||
tasks=("Fix typos in bad.txt." "Write a shell script 'hello.sh' that prints 'hello'.")
|
||||
test_names=("test_edits" "test_write_simple_script")
|
||||
|
||||
num_of_tests=${#tasks[@]}
|
||||
num_of_agents=${#agents[@]}
|
||||
|
||||
rm -rf logs
|
||||
rm -rf _test_workspace
|
||||
for ((i = 0; i < num_of_tests; i++)); do
|
||||
task=${tasks[i]}
|
||||
test_name=${test_names[i]}
|
||||
for agent in "${agents[@]}"; do
|
||||
for ((j = 0; j < num_of_agents; j++)); do
|
||||
agent=${agents[j]}
|
||||
remind_iterations=${remind_iterations_config[j]}
|
||||
echo -e "\n\n\n\n========Running $test_name for $agent========\n\n\n\n"
|
||||
rm -rf $WORKSPACE_BASE
|
||||
mkdir $WORKSPACE_BASE
|
||||
@ -28,7 +34,8 @@ for ((i = 0; i < num_of_tests; i++)); do
|
||||
# Temporarily disable 'exit on error'
|
||||
set +e
|
||||
fi
|
||||
SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE MAX_ITERATIONS=10 \
|
||||
SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \
|
||||
MAX_ITERATIONS=$MAX_ITERATIONS REMIND_ITERATIONS=$remind_iterations \
|
||||
WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
|
||||
poetry run pytest -s ./tests/integration/test_agent.py::$test_name
|
||||
TEST_STATUS=$?
|
||||
@ -43,10 +50,10 @@ for ((i = 0; i < num_of_tests; i++)); do
|
||||
rm -rf logs
|
||||
rm -rf tests/integration/mock/$agent/$test_name/*
|
||||
echo -e "/exit\n" | SANDBOX_TYPE=$SANDBOX_TYPE WORKSPACE_BASE=$WORKSPACE_BASE \
|
||||
DEBUG=true \
|
||||
DEBUG=true REMIND_ITERATIONS=$remind_iterations \
|
||||
WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
|
||||
poetry run python ./opendevin/core/main.py \
|
||||
-i 10 \
|
||||
-i $MAX_ITERATIONS \
|
||||
-t "$task Do not ask me for confirmation at any point." \
|
||||
-c $agent
|
||||
|
||||
|
||||
@ -54,7 +54,6 @@ def test_edits():
|
||||
dest_file = os.path.join(workspace_base, file)
|
||||
if os.path.exists(dest_file):
|
||||
os.remove(dest_file)
|
||||
print('source = ', os.path.join(source_dir, file), ' dest = ', dest_file)
|
||||
shutil.copy(os.path.join(source_dir, file), dest_file)
|
||||
|
||||
# Execute the task
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user