OpenHands/tests/runtime/test_replay.py

"""Replay tests"""

import asyncio

from conftest import _close_test_runtime, _load_runtime

from openhands.controller.state.state import State
from openhands.core.config.app_config import AppConfig
from openhands.core.config.config_utils import OH_DEFAULT_AGENT
from openhands.core.main import run_controller
from openhands.core.schema.agent import AgentState
from openhands.events.action.empty import NullAction
from openhands.events.action.message import MessageAction
from openhands.events.event import EventSource
from openhands.events.observation.commands import CmdOutputObservation


def _get_config(trajectory_name: str, agent: str = OH_DEFAULT_AGENT):
    """Build an AppConfig that replays a trajectory from ./tests/runtime/trajs.

    Args:
        trajectory_name: Base name (without the ``.json`` suffix) of the
            trajectory file under ``./tests/runtime/trajs/``.
        agent: Value passed as ``default_agent``; defaults to OH_DEFAULT_AGENT.

    Returns:
        An AppConfig with ``run_as_openhands`` disabled, both workspace
        settings set to None, and ``replay_trajectory_path`` pointing at the
        requested trajectory file.
    """
    replay_path = f'./tests/runtime/trajs/{trajectory_name}.json'
    settings = {
        'default_agent': agent,
        'run_as_openhands': False,
        # do not mount workspace
        'workspace_base': None,
        'workspace_mount_path': None,
        'replay_trajectory_path': replay_path,
    }
    return AppConfig(**settings)


def test_simple_replay(temp_dir, runtime_cls, run_as_openhands):
    """
    A simple replay test that involves simple terminal operations and edits
    (creating a simple 2048 game), using the default agent

    The runtime is closed in a ``finally`` block so that it is released even
    when the replay raises or an assertion fails.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config.replay_trajectory_path = './tests/runtime/trajs/basic.json'

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # run_controller is annotated as possibly returning None; fail with a
        # clear assertion rather than an AttributeError on the next line.
        assert state is not None
        assert state.agent_state == AgentState.FINISHED
    finally:
        _close_test_runtime(runtime)


def test_simple_gui_replay(temp_dir, runtime_cls, run_as_openhands):
    """
    A simple replay test that involves simple terminal operations and edits
    (writing a Vue.js App), using the default agent

    Note:
    1. This trajectory is exported from GUI mode, meaning it has extra
    environmental actions that don't appear in headless mode's trajectories
    2. In GUI mode, agents typically don't finish; rather, they wait for the next
    task from the user, so this exported trajectory ends with awaiting_user_input

    The runtime is closed in a ``finally`` block so that it is released even
    when the replay raises or an assertion fails.
    """
    # Only the runtime is used; the config passed to the controller is built
    # from scratch by _get_config below.
    # NOTE(review): confirm that discarding the config returned by
    # _load_runtime is intended.
    runtime, _ = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config = _get_config('basic_gui_mode')

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
                # exit on message, otherwise this would be stuck on waiting for
                # user input
                exit_on_message=True,
            )
        )

        # run_controller is annotated as possibly returning None; fail with a
        # clear assertion rather than an AttributeError on the next line.
        assert state is not None
        assert state.agent_state == AgentState.FINISHED
    finally:
        _close_test_runtime(runtime)


def test_replay_wrong_initial_state(temp_dir, runtime_cls, run_as_openhands):
    """
    Replay requires a consistent initial state to start with, otherwise it might
    produce garbage. The trajectory used in this test assumes the existence of
    a file named 'game_2048.py', which doesn't exist when we replay the trajectory
    (a so-called inconsistent initial state). This test demonstrates what this
    looks like: the following events are still replayed even though they are
    meaningless.

    The runtime is closed in a ``finally`` block so that it is released even
    when the replay raises or an assertion fails.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config.replay_trajectory_path = (
            './tests/runtime/trajs/wrong_initial_state.json'
        )

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # run_controller is annotated as possibly returning None; fail with a
        # clear assertion rather than an AttributeError on the next line.
        assert state is not None
        assert state.agent_state == AgentState.FINISHED

        # At least one replayed command must have exited with a non-zero code.
        has_error_in_action = any(
            isinstance(event, CmdOutputObservation) and event.exit_code != 0
            for event in state.history
        )
        assert has_error_in_action
    finally:
        _close_test_runtime(runtime)


def test_replay_basic_interactions(temp_dir, runtime_cls, run_as_openhands):
    """
    Replay a trajectory that involves interactions, i.e. with user messages
    in the middle. This tests two things:
    1) The controller should be able to replay all actions without human
    interference (no asking for user input).
    2) The user messages in the trajectory should appear in the history.

    The runtime is closed in a ``finally`` block so that it is released even
    when the replay raises or an assertion fails.
    """
    # Only the runtime is used; the config passed to the controller is built
    # from scratch by _get_config below.
    # NOTE(review): confirm that discarding the config returned by
    # _load_runtime is intended.
    runtime, _ = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config = _get_config('basic_interactions')

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # run_controller is annotated as possibly returning None; fail with a
        # clear assertion rather than an AttributeError on the next line.
        assert state is not None
        assert state.agent_state == AgentState.FINISHED

        # all user messages appear in the history, so that after a replay
        # (assuming the trajectory doesn't end with `finish` action), LLM knows
        # about all the context and can continue
        user_messages = [
            "what's 1+1?",
            "No, I mean by Goldbach's conjecture!",
            'Finish please',
        ]
        # Compare whole lists so that missing, extra, or reordered user
        # messages all surface as an assertion failure showing both sequences
        # (an index-based walk would raise IndexError on extra messages).
        replayed_user_messages = [
            event.message
            for event in state.history
            if isinstance(event, MessageAction) and event._source == EventSource.USER
        ]
        assert replayed_user_messages == user_messages
    finally:
        _close_test_runtime(runtime)