mirror of
https://github.com/OpenHands/OpenHands.git
synced 2026-03-22 13:47:19 +08:00
Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
151 lines
5.0 KiB
Python
151 lines
5.0 KiB
Python
"""Replay tests"""
|
|
|
|
import asyncio
|
|
|
|
from conftest import _close_test_runtime, _load_runtime
|
|
|
|
from openhands.controller.state.state import State
|
|
from openhands.core.config.app_config import AppConfig
|
|
from openhands.core.config.config_utils import OH_DEFAULT_AGENT
|
|
from openhands.core.main import run_controller
|
|
from openhands.core.schema.agent import AgentState
|
|
from openhands.events.action.empty import NullAction
|
|
from openhands.events.action.message import MessageAction
|
|
from openhands.events.event import EventSource
|
|
from openhands.events.observation.commands import CmdOutputObservation
|
|
|
|
|
|
def _get_config(trajectory_name: str, agent: str = OH_DEFAULT_AGENT):
    """Build an AppConfig that replays a trajectory stored under tests/runtime/trajs.

    Args:
        trajectory_name: File stem of the trajectory JSON (without '.json').
        agent: Agent name assigned to the config's ``default_agent``.

    Returns:
        An AppConfig with workspace settings set to None and
        ``replay_trajectory_path`` pointing at the requested trajectory.
    """
    # The path is relative, so it resolves against the current working directory.
    replay_path = f'./tests/runtime/trajs/{trajectory_name}.json'
    settings = dict(
        default_agent=agent,
        run_as_openhands=False,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
        replay_trajectory_path=replay_path,
    )
    return AppConfig(**settings)
|
|
|
|
|
|
def test_simple_replay(temp_dir, runtime_cls, run_as_openhands):
    """
    A simple replay test that involves simple terminal operations and edits
    (creating a simple 2048 game), using the default agent
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config.replay_trajectory_path = './tests/runtime/trajs/basic.json'

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # Fail with a clear assertion instead of an AttributeError on None.
        assert state is not None
        assert state.agent_state == AgentState.FINISHED
    finally:
        # Release the runtime even when the replay raises or an assertion
        # fails, so a failing test does not leak it.
        _close_test_runtime(runtime)
|
|
|
|
|
|
def test_simple_gui_replay(temp_dir, runtime_cls, run_as_openhands):
    """
    A simple replay test that involves simple terminal operations and edits
    (writing a Vue.js App), using the default agent

    Note:
    1. This trajectory is exported from GUI mode, meaning it has extra
    environmental actions that don't appear in headless mode's trajectories
    2. In GUI mode, agents typically don't finish; rather, they wait for the next
    task from the user, so this exported trajectory ends with awaiting_user_input
    """
    # The config returned alongside the runtime is not used here: the
    # controller is driven by a fresh replay config built below.
    runtime, _ = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config = _get_config('basic_gui_mode')

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
                # exit on message, otherwise this would be stuck on waiting for user input
                exit_on_message=True,
            )
        )

        # Fail with a clear assertion instead of an AttributeError on None.
        assert state is not None
        assert state.agent_state == AgentState.FINISHED
    finally:
        # Release the runtime even when the replay raises or an assertion
        # fails, so a failing test does not leak it.
        _close_test_runtime(runtime)
|
|
|
|
|
|
def test_replay_wrong_initial_state(temp_dir, runtime_cls, run_as_openhands):
    """
    Replay requires a consistent initial state to start with, otherwise it might
    be producing garbage. The trajectory used in this test assumes existence of
    a file named 'game_2048.py', which doesn't exist when we replay the trajectory
    (so called inconsistent initial states). This test demonstrates how this would
    look like: the following events would still be replayed even though they are
    meaningless.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config.replay_trajectory_path = (
            './tests/runtime/trajs/wrong_initial_state.json'
        )

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # Fail with a clear assertion instead of an AttributeError on None.
        assert state is not None
        assert state.agent_state == AgentState.FINISHED

        # At least one replayed command must have exited with a non-zero code.
        has_error_in_action = any(
            isinstance(event, CmdOutputObservation) and event.exit_code != 0
            for event in state.history
        )
        assert has_error_in_action
    finally:
        # Release the runtime even when the replay raises or an assertion
        # fails, so a failing test does not leak it.
        _close_test_runtime(runtime)
|
|
|
|
|
|
def test_replay_basic_interactions(temp_dir, runtime_cls, run_as_openhands):
    """
    Replay a trajectory that involves interactions, i.e. with user messages
    in the middle. This tests two things:
    1) The controller should be able to replay all actions without human
    interference (no asking for user input).
    2) The user messages in the trajectory should appear in the history.
    """
    # The config returned alongside the runtime is not used here: the
    # controller is driven by a fresh replay config built below.
    runtime, _ = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config = _get_config('basic_interactions')

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # Fail with a clear assertion instead of an AttributeError on None.
        assert state is not None
        assert state.agent_state == AgentState.FINISHED

        # all user messages appear in the history, so that after a replay (assuming
        # the trajectory doesn't end with `finish` action), LLM knows about all the
        # context and can continue
        expected_user_messages = [
            "what's 1+1?",
            "No, I mean by Goldbach's conjecture!",
            'Finish please',
        ]
        replayed_user_messages = [
            event.message
            for event in state.history
            if isinstance(event, MessageAction) and event._source == EventSource.USER
        ]
        # Comparing whole lists checks content, order and count at once, and
        # reports a diff rather than an IndexError when extra user messages
        # show up in the history.
        assert replayed_user_messages == expected_user_messages
    finally:
        # Release the runtime even when the replay raises or an assertion
        # fails, so a failing test does not leak it.
        _close_test_runtime(runtime)
|