Add tests for trajectory replay (#6513)

2026-03-22 05:37:20 +08:00 · 2025-01-30 05:56:24 -08:00
parent 99d2d01e1a
commit c9f16248d0
4 changed files with 742 additions and 3 deletions
--- a/openhands/controller/replay.py
+++ b/openhands/controller/replay.py
@@ -7,9 +7,12 @@ class ReplayManager:
    """ReplayManager manages the lifecycle of a replay session of a given trajectory.

    Replay manager keeps track of a list of events, replays actions, and ignore
-    messages and observations. It could lead to unexpected or even errorneous
-    results if any action is non-deterministic, or if the initial state before
-    the replay session is different from the initial state of the trajectory.
+    messages and observations.
+
+    Note that unexpected or even errorneous results could happen if
+    1) any action is non-deterministic, OR
+    2) if the initial state before the replay session is different from the
+    initial state of the trajectory.
    """

    def __init__(self, replay_events: list[Event] | None):
--- a/tests/runtime/test_replay.py
+++ b/tests/runtime/test_replay.py
@@ -0,0 +1,80 @@
+"""Replay tests"""
+
+import asyncio
+
+from conftest import _close_test_runtime, _load_runtime
+
+from openhands.controller.state.state import State
+from openhands.core.config.app_config import AppConfig
+from openhands.core.config.config_utils import OH_DEFAULT_AGENT
+from openhands.core.main import run_controller
+from openhands.core.schema.agent import AgentState
+from openhands.events.action.empty import NullAction
+from openhands.events.observation.commands import CmdOutputObservation
+
+
+def _get_config(trajectory_name: str, agent: str = OH_DEFAULT_AGENT):
+    return AppConfig(
+        default_agent=agent,
+        run_as_openhands=False,
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+        replay_trajectory_path=f'./tests/runtime/trajs/{trajectory_name}.json',
+    )
+
+
+def test_simple_replay(temp_dir, runtime_cls, run_as_openhands):
+    """
+    A simple replay test that involves simple terminal operations and edits
+    (creating a simple 2048 game), using the default agent
+    """
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+
+    config = _get_config('basic')
+
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=NullAction(),
+            runtime=runtime,
+        )
+    )
+
+    assert state.agent_state == AgentState.FINISHED
+
+    _close_test_runtime(runtime)
+
+
+def test_replay_wrong_initial_state(temp_dir, runtime_cls, run_as_openhands):
+    """
+    Replay requires a consistent initial state to start with, otherwise it might
+    be producing garbage. The trajectory used in this test assumes existence of
+    a file named 'game_2048.py', which doesn't exist when we replay the trajectory
+    (so called inconsistent initial states). This test demonstrates how this would
+    look like: the following events would still be replayed even though they are
+    meaningless.
+    """
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+
+    config = _get_config('wrong_initial_state')
+
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=NullAction(),
+            runtime=runtime,
+        )
+    )
+
+    assert state.agent_state == AgentState.FINISHED
+
+    has_error_in_action = False
+    for event in state.history:
+        if isinstance(event, CmdOutputObservation) and event.exit_code != 0:
+            has_error_in_action = True
+            break
+
+    assert has_error_in_action
+
+    _close_test_runtime(runtime)
--- a/tests/runtime/trajs/basic.json
+++ b/tests/runtime/trajs/basic.json
@@ -0,0 +1,202 @@
+[
+  {
+    "id": 0,
+    "timestamp": "2025-01-27T22:44:56.900854",
+    "source": "user",
+    "message": "Please create a simple 2048 game. No need to run it. Please finish the interaction after your finish coding.",
+    "action": "message",
+    "args": {
+      "content": "Please create a simple 2048 game. No need to run it. Please finish the interaction after your finish coding.",
+      "image_urls": null,
+      "wait_for_response": false
+    },
+    "timeout": 120
+  },
+  {
+    "id": 3,
+    "timestamp": "2025-01-27T22:45:13.764487",
+    "source": "agent",
+    "message": "",
+    "action": "edit",
+    "tool_call_metadata": {
+      "function_name": "str_replace_editor",
+      "tool_call_id": "toolu_017nBp7UfiGqyrA7j77HgtFP",
+      "model_response": {
+        "id": "chatcmpl-6814daca-5c62-431d-8c88-9dafead244f1",
+        "created": 1738046713,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "I'll help you create a simple 2048 game in Python. I'll use the str_replace_editor to create the file.",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{\"command\": \"create\", \"path\": \"/workspace/game_2048.py\", \"file_text\": \"import random\\nimport keyboard\\n\\nclass Game2048:\\n    def __init__(self):\\n        self.board = [[0] * 4 for _ in range(4)]\\n        self.score = 0\\n        self.spawn_number()\\n        self.spawn_number()\\n\\n    def spawn_number(self):\\n        empty_cells = [(i, j) for i in range(4) for j in range(4) if self.board[i][j] == 0]\\n        if empty_cells:\\n            i, j = random.choice(empty_cells)\\n            self.board[i][j] = 2 if random.random() < 0.9 else 4\\n\\n    def move(self, direction):\\n        merged = [[False] * 4 for _ in range(4)]\\n        moved = False\\n\\n        if direction in ['left', 'right']:\\n            for i in range(4):\\n                row = self.board[i][:]\\n                new_row = self.merge_line(row, direction == 'right')\\n                if new_row != row:\\n                    moved = True\\n                    self.board[i] = new_row\\n\\n        elif direction in ['up', 'down']:\\n            for j in range(4):\\n                col = [self.board[i][j] for i in range(4)]\\n                new_col = self.merge_line(col, direction == 'down')\\n                if new_col != col:\\n                    moved = True\\n                    for i in range(4):\\n                        self.board[i][j] = new_col[i]\\n\\n        if moved:\\n            self.spawn_number()\\n\\n    def merge_line(self, line, reverse=False):\\n        if reverse:\\n            line = line[::-1]\\n\\n        new_line = [0] * 4\\n        pos = 0\\n        \\n        # Remove zeros and merge equal numbers\\n        numbers = [x for x in line if x != 0]\\n        i = 0\\n        while i < len(numbers):\\n            if i + 1 < len(numbers) and numbers[i] == numbers[i + 1]:\\n                new_line[pos] = numbers[i] * 2\\n                self.score += numbers[i] * 2\\n                i += 2\\n            else:\\n                new_line[pos] = numbers[i]\\n                i += 1\\n            pos += 1\\n\\n        if reverse:\\n            new_line = new_line[::-1]\\n        \\n        return new_line\\n\\n    def is_game_over(self):\\n        # Check if there are any empty cells\\n        if any(0 in row for row in self.board):\\n            return False\\n\\n        # Check if there are any possible merges\\n        for i in range(4):\\n            for j in range(4):\\n                current = self.board[i][j]\\n                # Check right neighbor\\n                if j < 3 and current == self.board[i][j + 1]:\\n                    return False\\n                # Check bottom neighbor\\n                if i < 3 and current == self.board[i + 1][j]:\\n                    return False\\n        return True\\n\\n    def display(self):\\n        print(f\\\"\\\\nScore: {self.score}\\\\n\\\")\\n        for row in self.board:\\n            print(\\\" \\\".join(f\\\"{num:4}\\\" for num in row))\\n        print(\\\"\\\\n\\\")\\n\\ndef main():\\n    game = Game2048()\\n    \\n    print(\\\"Use arrow keys to move. Press 'q' to quit.\\\")\\n    \\n    while True:\\n        game.display()\\n        \\n        if game.is_game_over():\\n            print(\\\"Game Over!\\\")\\n            break\\n            \\n        event = keyboard.read_event(suppress=True)\\n        if event.event_type != 'down':\\n            continue\\n            \\n        if event.name == 'q':\\n            break\\n        elif event.name == 'left':\\n            game.move('left')\\n        elif event.name == 'right':\\n            game.move('right')\\n        elif event.name == 'up':\\n            game.move('up')\\n        elif event.name == 'down':\\n            game.move('down')\\n\\nif __name__ == \\\"__main__\\\":\\n    main()\"}",
+                    "name": "str_replace_editor"
+                  },
+                  "id": "toolu_017nBp7UfiGqyrA7j77HgtFP",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 1135,
+          "prompt_tokens": 3486,
+          "total_tokens": 4621,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 0
+          },
+          "cache_creation_input_tokens": 3482,
+          "cache_read_input_tokens": 0
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "args": {
+      "path": "/workspace/game_2048.py",
+      "content": "",
+      "start": 1,
+      "end": -1,
+      "thought": "I'll help you create a simple 2048 game in Python. I'll use the str_replace_editor to create the file.",
+      "impl_source": "oh_aci",
+      "translated_ipython_code": "print(file_editor(**{'command': 'create', 'path': '/workspace/game_2048.py', 'file_text': 'import random\\nimport keyboard\\n\\nclass Game2048:\\n    def __init__(self):\\n        self.board = [[0] * 4 for _ in range(4)]\\n        self.score = 0\\n        self.spawn_number()\\n        self.spawn_number()\\n\\n    def spawn_number(self):\\n        empty_cells = [(i, j) for i in range(4) for j in range(4) if self.board[i][j] == 0]\\n        if empty_cells:\\n            i, j = random.choice(empty_cells)\\n            self.board[i][j] = 2 if random.random() < 0.9 else 4\\n\\n    def move(self, direction):\\n        merged = [[False] * 4 for _ in range(4)]\\n        moved = False\\n\\n        if direction in [\\'left\\', \\'right\\']:\\n            for i in range(4):\\n                row = self.board[i][:]\\n                new_row = self.merge_line(row, direction == \\'right\\')\\n                if new_row != row:\\n                    moved = True\\n                    self.board[i] = new_row\\n\\n        elif direction in [\\'up\\', \\'down\\']:\\n            for j in range(4):\\n                col = [self.board[i][j] for i in range(4)]\\n                new_col = self.merge_line(col, direction == \\'down\\')\\n                if new_col != col:\\n                    moved = True\\n                    for i in range(4):\\n                        self.board[i][j] = new_col[i]\\n\\n        if moved:\\n            self.spawn_number()\\n\\n    def merge_line(self, line, reverse=False):\\n        if reverse:\\n            line = line[::-1]\\n\\n        new_line = [0] * 4\\n        pos = 0\\n        \\n        # Remove zeros and merge equal numbers\\n        numbers = [x for x in line if x != 0]\\n        i = 0\\n        while i < len(numbers):\\n            if i + 1 < len(numbers) and numbers[i] == numbers[i + 1]:\\n                new_line[pos] = numbers[i] * 2\\n                self.score += numbers[i] * 2\\n                i += 2\\n            else:\\n                new_line[pos] = numbers[i]\\n                i += 1\\n            pos += 1\\n\\n        if reverse:\\n            new_line = new_line[::-1]\\n        \\n        return new_line\\n\\n    def is_game_over(self):\\n        # Check if there are any empty cells\\n        if any(0 in row for row in self.board):\\n            return False\\n\\n        # Check if there are any possible merges\\n        for i in range(4):\\n            for j in range(4):\\n                current = self.board[i][j]\\n                # Check right neighbor\\n                if j < 3 and current == self.board[i][j + 1]:\\n                    return False\\n                # Check bottom neighbor\\n                if i < 3 and current == self.board[i + 1][j]:\\n                    return False\\n        return True\\n\\n    def display(self):\\n        print(f\"\\\\nScore: {self.score}\\\\n\")\\n        for row in self.board:\\n            print(\" \".join(f\"{num:4}\" for num in row))\\n        print(\"\\\\n\")\\n\\ndef main():\\n    game = Game2048()\\n    \\n    print(\"Use arrow keys to move. Press \\'q\\' to quit.\")\\n    \\n    while True:\\n        game.display()\\n        \\n        if game.is_game_over():\\n            print(\"Game Over!\")\\n            break\\n            \\n        event = keyboard.read_event(suppress=True)\\n        if event.event_type != \\'down\\':\\n            continue\\n            \\n        if event.name == \\'q\\':\\n            break\\n        elif event.name == \\'left\\':\\n            game.move(\\'left\\')\\n        elif event.name == \\'right\\':\\n            game.move(\\'right\\')\\n        elif event.name == \\'up\\':\\n            game.move(\\'up\\')\\n        elif event.name == \\'down\\':\\n            game.move(\\'down\\')\\n\\nif __name__ == \"__main__\":\\n    main()'}))"
+    },
+    "timeout": 120
+  },
+  {
+    "id": 4,
+    "timestamp": "2025-01-27T22:45:13.801384",
+    "source": "agent",
+    "message": "I edited the file /workspace/game_2048.py.",
+    "cause": 3,
+    "observation": "edit",
+    "tool_call_metadata": {
+      "function_name": "str_replace_editor",
+      "tool_call_id": "toolu_017nBp7UfiGqyrA7j77HgtFP",
+      "model_response": {
+        "id": "chatcmpl-6814daca-5c62-431d-8c88-9dafead244f1",
+        "created": 1738046713,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "I'll help you create a simple 2048 game in Python. I'll use the str_replace_editor to create the file.",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{\"command\": \"create\", \"path\": \"/workspace/game_2048.py\", \"file_text\": \"import random\\nimport keyboard\\n\\nclass Game2048:\\n    def __init__(self):\\n        self.board = [[0] * 4 for _ in range(4)]\\n        self.score = 0\\n        self.spawn_number()\\n        self.spawn_number()\\n\\n    def spawn_number(self):\\n        empty_cells = [(i, j) for i in range(4) for j in range(4) if self.board[i][j] == 0]\\n        if empty_cells:\\n            i, j = random.choice(empty_cells)\\n            self.board[i][j] = 2 if random.random() < 0.9 else 4\\n\\n    def move(self, direction):\\n        merged = [[False] * 4 for _ in range(4)]\\n        moved = False\\n\\n        if direction in ['left', 'right']:\\n            for i in range(4):\\n                row = self.board[i][:]\\n                new_row = self.merge_line(row, direction == 'right')\\n                if new_row != row:\\n                    moved = True\\n                    self.board[i] = new_row\\n\\n        elif direction in ['up', 'down']:\\n            for j in range(4):\\n                col = [self.board[i][j] for i in range(4)]\\n                new_col = self.merge_line(col, direction == 'down')\\n                if new_col != col:\\n                    moved = True\\n                    for i in range(4):\\n                        self.board[i][j] = new_col[i]\\n\\n        if moved:\\n            self.spawn_number()\\n\\n    def merge_line(self, line, reverse=False):\\n        if reverse:\\n            line = line[::-1]\\n\\n        new_line = [0] * 4\\n        pos = 0\\n        \\n        # Remove zeros and merge equal numbers\\n        numbers = [x for x in line if x != 0]\\n        i = 0\\n        while i < len(numbers):\\n            if i + 1 < len(numbers) and numbers[i] == numbers[i + 1]:\\n                new_line[pos] = numbers[i] * 2\\n                self.score += numbers[i] * 2\\n                i += 2\\n            else:\\n                new_line[pos] = numbers[i]\\n                i += 1\\n            pos += 1\\n\\n        if reverse:\\n            new_line = new_line[::-1]\\n        \\n        return new_line\\n\\n    def is_game_over(self):\\n        # Check if there are any empty cells\\n        if any(0 in row for row in self.board):\\n            return False\\n\\n        # Check if there are any possible merges\\n        for i in range(4):\\n            for j in range(4):\\n                current = self.board[i][j]\\n                # Check right neighbor\\n                if j < 3 and current == self.board[i][j + 1]:\\n                    return False\\n                # Check bottom neighbor\\n                if i < 3 and current == self.board[i + 1][j]:\\n                    return False\\n        return True\\n\\n    def display(self):\\n        print(f\\\"\\\\nScore: {self.score}\\\\n\\\")\\n        for row in self.board:\\n            print(\\\" \\\".join(f\\\"{num:4}\\\" for num in row))\\n        print(\\\"\\\\n\\\")\\n\\ndef main():\\n    game = Game2048()\\n    \\n    print(\\\"Use arrow keys to move. Press 'q' to quit.\\\")\\n    \\n    while True:\\n        game.display()\\n        \\n        if game.is_game_over():\\n            print(\\\"Game Over!\\\")\\n            break\\n            \\n        event = keyboard.read_event(suppress=True)\\n        if event.event_type != 'down':\\n            continue\\n            \\n        if event.name == 'q':\\n            break\\n        elif event.name == 'left':\\n            game.move('left')\\n        elif event.name == 'right':\\n            game.move('right')\\n        elif event.name == 'up':\\n            game.move('up')\\n        elif event.name == 'down':\\n            game.move('down')\\n\\nif __name__ == \\\"__main__\\\":\\n    main()\"}",
+                    "name": "str_replace_editor"
+                  },
+                  "id": "toolu_017nBp7UfiGqyrA7j77HgtFP",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 1135,
+          "prompt_tokens": 3486,
+          "total_tokens": 4621,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 0
+          },
+          "cache_creation_input_tokens": 3482,
+          "cache_read_input_tokens": 0
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "content": "--- /workspace/game_2048.py\n+++ /workspace/game_2048.py\n@@ -0,0 +1,2 @@\n+import random\n+import keyboard\n@@ -1,0 +4,114 @@\n+class Game2048:\n+    def __init__(self):\n+        self.board = [[0] * 4 for _ in range(4)]\n+        self.score = 0\n+        self.spawn_number()\n+        self.spawn_number()\n+\n+    def spawn_number(self):\n+        empty_cells = [(i, j) for i in range(4) for j in range(4) if self.board[i][j] == 0]\n+        if empty_cells:\n+            i, j = random.choice(empty_cells)\n+            self.board[i][j] = 2 if random.random() < 0.9 else 4\n+\n+    def move(self, direction):\n+        merged = [[False] * 4 for _ in range(4)]\n+        moved = False\n+\n+        if direction in ['left', 'right']:\n+            for i in range(4):\n+                row = self.board[i][:]\n+                new_row = self.merge_line(row, direction == 'right')\n+                if new_row != row:\n+                    moved = True\n+                    self.board[i] = new_row\n+\n+        elif direction in ['up', 'down']:\n+            for j in range(4):\n+                col = [self.board[i][j] for i in range(4)]\n+                new_col = self.merge_line(col, direction == 'down')\n+                if new_col != col:\n+                    moved = True\n+                    for i in range(4):\n+                        self.board[i][j] = new_col[i]\n+\n+        if moved:\n+            self.spawn_number()\n+\n+    def merge_line(self, line, reverse=False):\n+        if reverse:\n+            line = line[::-1]\n+\n+        new_line = [0] * 4\n+        pos = 0\n+\n+        # Remove zeros and merge equal numbers\n+        numbers = [x for x in line if x != 0]\n+        i = 0\n+        while i < len(numbers):\n+            if i + 1 < len(numbers) and numbers[i] == numbers[i + 1]:\n+                new_line[pos] = numbers[i] * 2\n+                self.score += numbers[i] * 2\n+                i += 2\n+            else:\n+                new_line[pos] = numbers[i]\n+                i += 1\n+            pos += 1\n+\n+        if reverse:\n+            new_line = new_line[::-1]\n+\n+        return new_line\n+\n+    def is_game_over(self):\n+        # Check if there are any empty cells\n+        if any(0 in row for row in self.board):\n+            return False\n+\n+        # Check if there are any possible merges\n+        for i in range(4):\n+            for j in range(4):\n+                current = self.board[i][j]\n+                # Check right neighbor\n+                if j < 3 and current == self.board[i][j + 1]:\n+                    return False\n+                # Check bottom neighbor\n+                if i < 3 and current == self.board[i + 1][j]:\n+                    return False\n+        return True\n+\n+    def display(self):\n+        print(f\"\\nScore: {self.score}\\n\")\n+        for row in self.board:\n+            print(\" \".join(f\"{num:4}\" for num in row))\n+        print(\"\\n\")\n+\n+def main():\n+    game = Game2048()\n+\n+    print(\"Use arrow keys to move. Press 'q' to quit.\")\n+\n+    while True:\n+        game.display()\n+\n+        if game.is_game_over():\n+            print(\"Game Over!\")\n+            break\n+\n+        event = keyboard.read_event(suppress=True)\n+        if event.event_type != 'down':\n+            continue\n+\n+        if event.name == 'q':\n+            break\n+        elif event.name == 'left':\n+            game.move('left')\n+        elif event.name == 'right':\n+            game.move('right')\n+        elif event.name == 'up':\n+            game.move('up')\n+        elif event.name == 'down':\n+            game.move('down')\n+\n+if __name__ == \"__main__\":\n+    main()",
+    "extras": {
+      "path": "/workspace/game_2048.py",
+      "prev_exist": false,
+      "old_content": null,
+      "new_content": "import random\nimport keyboard\n\nclass Game2048:\n    def __init__(self):\n        self.board = [[0] * 4 for _ in range(4)]\n        self.score = 0\n        self.spawn_number()\n        self.spawn_number()\n\n    def spawn_number(self):\n        empty_cells = [(i, j) for i in range(4) for j in range(4) if self.board[i][j] == 0]\n        if empty_cells:\n            i, j = random.choice(empty_cells)\n            self.board[i][j] = 2 if random.random() < 0.9 else 4\n\n    def move(self, direction):\n        merged = [[False] * 4 for _ in range(4)]\n        moved = False\n\n        if direction in ['left', 'right']:\n            for i in range(4):\n                row = self.board[i][:]\n                new_row = self.merge_line(row, direction == 'right')\n                if new_row != row:\n                    moved = True\n                    self.board[i] = new_row\n\n        elif direction in ['up', 'down']:\n            for j in range(4):\n                col = [self.board[i][j] for i in range(4)]\n                new_col = self.merge_line(col, direction == 'down')\n                if new_col != col:\n                    moved = True\n                    for i in range(4):\n                        self.board[i][j] = new_col[i]\n\n        if moved:\n            self.spawn_number()\n\n    def merge_line(self, line, reverse=False):\n        if reverse:\n            line = line[::-1]\n\n        new_line = [0] * 4\n        pos = 0\n        \n        # Remove zeros and merge equal numbers\n        numbers = [x for x in line if x != 0]\n        i = 0\n        while i < len(numbers):\n            if i + 1 < len(numbers) and numbers[i] == numbers[i + 1]:\n                new_line[pos] = numbers[i] * 2\n                self.score += numbers[i] * 2\n                i += 2\n            else:\n                new_line[pos] = numbers[i]\n                i += 1\n            pos += 1\n\n        if reverse:\n            new_line = new_line[::-1]\n        \n        return new_line\n\n    def is_game_over(self):\n        # Check if there are any empty cells\n        if any(0 in row for row in self.board):\n            return False\n\n        # Check if there are any possible merges\n        for i in range(4):\n            for j in range(4):\n                current = self.board[i][j]\n                # Check right neighbor\n                if j < 3 and current == self.board[i][j + 1]:\n                    return False\n                # Check bottom neighbor\n                if i < 3 and current == self.board[i + 1][j]:\n                    return False\n        return True\n\n    def display(self):\n        print(f\"\\nScore: {self.score}\\n\")\n        for row in self.board:\n            print(\" \".join(f\"{num:4}\" for num in row))\n        print(\"\\n\")\n\ndef main():\n    game = Game2048()\n    \n    print(\"Use arrow keys to move. Press 'q' to quit.\")\n    \n    while True:\n        game.display()\n        \n        if game.is_game_over():\n            print(\"Game Over!\")\n            break\n            \n        event = keyboard.read_event(suppress=True)\n        if event.event_type != 'down':\n            continue\n            \n        if event.name == 'q':\n            break\n        elif event.name == 'left':\n            game.move('left')\n        elif event.name == 'right':\n            game.move('right')\n        elif event.name == 'up':\n            game.move('up')\n        elif event.name == 'down':\n            game.move('down')\n\nif __name__ == \"__main__\":\n    main()",
+      "impl_source": "oh_aci",
+      "formatted_output_and_error": "File created successfully at: /workspace/game_2048.py"
+    }
+  },
+  {
+    "id": 5,
+    "timestamp": "2025-01-27T22:45:18.405685",
+    "source": "agent",
+    "message": "I've created a simple implementation of the 2048 game with the following features:\n\n1. A 4x4 game board\n2. Arrow key controls for moving tiles\n3. Score tracking\n4. Game over detection\n5. Random spawning of new numbers (2 or 4)\n6. Merging of equal numbers when moving\n\nTo run the game, you would need to:\n1. Install the keyboard package (`pip install keyboard`)\n2. Run the script with sudo privileges (required for keyboard input)\n\nThe game uses the arrow keys for movement and 'q' to quit. The game ends when no more moves are possible.",
+    "action": "finish",
+    "tool_call_metadata": {
+      "function_name": "finish",
+      "tool_call_id": "toolu_01BUKu3B7nY574atNECkBiFk",
+      "model_response": {
+        "id": "chatcmpl-27cc0ad9-6b86-4f1f-9f72-d9c7ab2282f7",
+        "created": 1738046718,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "I've created a simple implementation of the 2048 game with the following features:\n\n1. A 4x4 game board\n2. Arrow key controls for moving tiles\n3. Score tracking\n4. Game over detection\n5. Random spawning of new numbers (2 or 4)\n6. Merging of equal numbers when moving\n\nTo run the game, you would need to:\n1. Install the keyboard package (`pip install keyboard`)\n2. Run the script with sudo privileges (required for keyboard input)\n\nThe game uses the arrow keys for movement and 'q' to quit. The game ends when no more moves are possible.",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{}",
+                    "name": "finish"
+                  },
+                  "id": "toolu_01BUKu3B7nY574atNECkBiFk",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 177,
+          "prompt_tokens": 4646,
+          "total_tokens": 4823,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 3482
+          },
+          "cache_creation_input_tokens": 1158,
+          "cache_read_input_tokens": 3482
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "args": {
+      "outputs": {},
+      "thought": "I've created a simple implementation of the 2048 game with the following features:\n\n1. A 4x4 game board\n2. Arrow key controls for moving tiles\n3. Score tracking\n4. Game over detection\n5. Random spawning of new numbers (2 or 4)\n6. Merging of equal numbers when moving\n\nTo run the game, you would need to:\n1. Install the keyboard package (`pip install keyboard`)\n2. Run the script with sudo privileges (required for keyboard input)\n\nThe game uses the arrow keys for movement and 'q' to quit. The game ends when no more moves are possible."
+    },
+    "timeout": 120
+  }
+]
--- a/tests/runtime/trajs/wrong_initial_state.json
+++ b/tests/runtime/trajs/wrong_initial_state.json
@@ -0,0 +1,454 @@
+[
+  {
+    "id": 0,
+    "timestamp": "2025-01-27T23:16:50.239610",
+    "source": "user",
+    "message": "Please rename game_2048.py to 2048.py",
+    "action": "message",
+    "args": {
+      "content": "Please rename game_2048.py to 2048.py",
+      "image_urls": null,
+      "wait_for_response": false
+    },
+    "timeout": 120
+  },
+  {
+    "id": 3,
+    "timestamp": "2025-01-27T23:16:54.105108",
+    "source": "agent",
+    "message": "Reading file: /workspace",
+    "action": "read",
+    "tool_call_metadata": {
+      "function_name": "str_replace_editor",
+      "tool_call_id": "toolu_01USom24cuiyvAembLhEryrn",
+      "model_response": {
+        "id": "chatcmpl-90c35dea-eb25-484c-aaa7-d9d00048f64a",
+        "created": 1738048613,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "I'll help you rename the file. First, let me check if the file exists and its location.",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{\"command\": \"view\", \"path\": \"/workspace\"}",
+                    "name": "str_replace_editor"
+                  },
+                  "id": "toolu_01USom24cuiyvAembLhEryrn",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 95,
+          "prompt_tokens": 3475,
+          "total_tokens": 3570,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 0
+          },
+          "cache_creation_input_tokens": 3471,
+          "cache_read_input_tokens": 0
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "args": {
+      "path": "/workspace",
+      "start": 0,
+      "end": -1,
+      "thought": "I'll help you rename the file. First, let me check if the file exists and its location.",
+      "impl_source": "oh_aci",
+      "translated_ipython_code": "print(file_editor(**{'command': 'view', 'path': '/workspace'}))"
+    },
+    "timeout": 120
+  },
+  {
+    "id": 4,
+    "timestamp": "2025-01-27T23:16:54.140581",
+    "source": "agent",
+    "message": "I read the file /workspace.",
+    "cause": 3,
+    "observation": "read",
+    "tool_call_metadata": {
+      "function_name": "str_replace_editor",
+      "tool_call_id": "toolu_01USom24cuiyvAembLhEryrn",
+      "model_response": {
+        "id": "chatcmpl-90c35dea-eb25-484c-aaa7-d9d00048f64a",
+        "created": 1738048613,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "I'll help you rename the file. First, let me check if the file exists and its location.",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{\"command\": \"view\", \"path\": \"/workspace\"}",
+                    "name": "str_replace_editor"
+                  },
+                  "id": "toolu_01USom24cuiyvAembLhEryrn",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 95,
+          "prompt_tokens": 3475,
+          "total_tokens": 3570,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 0
+          },
+          "cache_creation_input_tokens": 3471,
+          "cache_read_input_tokens": 0
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "content": "Here's the files and directories up to 2 levels deep in /workspace, excluding hidden items:\n/workspace/\n/workspace/game_2048.py",
+    "extras": {
+      "path": "/workspace",
+      "impl_source": "oh_aci"
+    }
+  },
+  {
+    "id": 5,
+    "timestamp": "2025-01-27T23:16:56.951967",
+    "source": "agent",
+    "message": "Running command: mv /workspace/game_2048.py /workspace/2048.py",
+    "action": "run",
+    "tool_call_metadata": {
+      "function_name": "execute_bash",
+      "tool_call_id": "toolu_01PuY3mXrE6fm3j52GscR4rj",
+      "model_response": {
+        "id": "chatcmpl-1f2bc7b4-848a-45ec-a2d8-aff83e5adbd6",
+        "created": 1738048616,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "Great, I found the file. Now I'll use the bash command `mv` to rename it.",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{\"command\": \"mv /workspace/game_2048.py /workspace/2048.py\", \"is_input\": \"false\"}",
+                    "name": "execute_bash"
+                  },
+                  "id": "toolu_01PuY3mXrE6fm3j52GscR4rj",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 111,
+          "prompt_tokens": 3616,
+          "total_tokens": 3727,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 3471
+          },
+          "cache_creation_input_tokens": 139,
+          "cache_read_input_tokens": 3471
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "args": {
+      "command": "mv /workspace/game_2048.py /workspace/2048.py",
+      "is_input": false,
+      "thought": "Great, I found the file. Now I'll use the bash command `mv` to rename it.",
+      "blocking": false,
+      "hidden": false,
+      "confirmation_state": "confirmed"
+    },
+    "timeout": 120
+  },
+  {
+    "id": 6,
+    "timestamp": "2025-01-27T23:16:57.600730",
+    "source": "agent",
+    "message": "Command `mv /workspace/game_2048.py /workspace/2048.py` executed with exit code 0.",
+    "cause": 5,
+    "observation": "run",
+    "tool_call_metadata": {
+      "function_name": "execute_bash",
+      "tool_call_id": "toolu_01PuY3mXrE6fm3j52GscR4rj",
+      "model_response": {
+        "id": "chatcmpl-1f2bc7b4-848a-45ec-a2d8-aff83e5adbd6",
+        "created": 1738048616,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "Great, I found the file. Now I'll use the bash command `mv` to rename it.",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{\"command\": \"mv /workspace/game_2048.py /workspace/2048.py\", \"is_input\": \"false\"}",
+                    "name": "execute_bash"
+                  },
+                  "id": "toolu_01PuY3mXrE6fm3j52GscR4rj",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 111,
+          "prompt_tokens": 3616,
+          "total_tokens": 3727,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 3471
+          },
+          "cache_creation_input_tokens": 139,
+          "cache_read_input_tokens": 3471
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "content": "",
+    "extras": {
+      "command": "mv /workspace/game_2048.py /workspace/2048.py",
+      "metadata": {
+        "exit_code": 0,
+        "pid": -1,
+        "username": "openhands",
+        "hostname": "0301ebbced1e",
+        "working_dir": "/workspace",
+        "py_interpreter_path": "/openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python",
+        "prefix": "",
+        "suffix": "\n[The command completed with exit code 0.]"
+      },
+      "hidden": false
+    },
+    "success": true
+  },
+  {
+    "id": 7,
+    "timestamp": "2025-01-27T23:16:59.803261",
+    "source": "agent",
+    "message": "Reading file: /workspace",
+    "action": "read",
+    "tool_call_metadata": {
+      "function_name": "str_replace_editor",
+      "tool_call_id": "toolu_01DodVjoeVmEY3gU5ALbqmGj",
+      "model_response": {
+        "id": "chatcmpl-ea298ce5-bc1c-435d-b5f9-6eb6e6e864df",
+        "created": 1738048619,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "Let me verify that the file was renamed successfully:",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{\"command\": \"view\", \"path\": \"/workspace\"}",
+                    "name": "str_replace_editor"
+                  },
+                  "id": "toolu_01DodVjoeVmEY3gU5ALbqmGj",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 85,
+          "prompt_tokens": 3804,
+          "total_tokens": 3889,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 3610
+          },
+          "cache_creation_input_tokens": 188,
+          "cache_read_input_tokens": 3610
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "args": {
+      "path": "/workspace",
+      "start": 0,
+      "end": -1,
+      "thought": "Let me verify that the file was renamed successfully:",
+      "impl_source": "oh_aci",
+      "translated_ipython_code": "print(file_editor(**{'command': 'view', 'path': '/workspace'}))"
+    },
+    "timeout": 120
+  },
+  {
+    "id": 8,
+    "timestamp": "2025-01-27T23:16:59.824084",
+    "source": "agent",
+    "message": "I read the file /workspace.",
+    "cause": 7,
+    "observation": "read",
+    "tool_call_metadata": {
+      "function_name": "str_replace_editor",
+      "tool_call_id": "toolu_01DodVjoeVmEY3gU5ALbqmGj",
+      "model_response": {
+        "id": "chatcmpl-ea298ce5-bc1c-435d-b5f9-6eb6e6e864df",
+        "created": 1738048619,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "Let me verify that the file was renamed successfully:",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{\"command\": \"view\", \"path\": \"/workspace\"}",
+                    "name": "str_replace_editor"
+                  },
+                  "id": "toolu_01DodVjoeVmEY3gU5ALbqmGj",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 85,
+          "prompt_tokens": 3804,
+          "total_tokens": 3889,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 3610
+          },
+          "cache_creation_input_tokens": 188,
+          "cache_read_input_tokens": 3610
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "content": "Here's the files and directories up to 2 levels deep in /workspace, excluding hidden items:\n/workspace/\n/workspace/2048.py",
+    "extras": {
+      "path": "/workspace",
+      "impl_source": "oh_aci"
+    }
+  },
+  {
+    "id": 9,
+    "timestamp": "2025-01-27T23:17:02.182015",
+    "source": "agent",
+    "message": "The file has been successfully renamed from `game_2048.py` to `2048.py`. Is there anything else you need help with?",
+    "action": "finish",
+    "tool_call_metadata": {
+      "function_name": "finish",
+      "tool_call_id": "toolu_01G98EmHEC8HgktRjJBYXqEa",
+      "model_response": {
+        "id": "chatcmpl-fcf8f59b-beb3-4412-94d5-fbe4c02d64a1",
+        "created": 1738048621,
+        "model": "claude-3-5-sonnet-20241022",
+        "object": "chat.completion",
+        "system_fingerprint": null,
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "message": {
+              "content": "The file has been successfully renamed from `game_2048.py` to `2048.py`. Is there anything else you need help with?",
+              "role": "assistant",
+              "tool_calls": [
+                {
+                  "index": 1,
+                  "function": {
+                    "arguments": "{}",
+                    "name": "finish"
+                  },
+                  "id": "toolu_01G98EmHEC8HgktRjJBYXqEa",
+                  "type": "function"
+                }
+              ],
+              "function_call": null
+            }
+          }
+        ],
+        "usage": {
+          "completion_tokens": 67,
+          "prompt_tokens": 3933,
+          "total_tokens": 4000,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": {
+            "audio_tokens": null,
+            "cached_tokens": 3798
+          },
+          "cache_creation_input_tokens": 129,
+          "cache_read_input_tokens": 3798
+        },
+        "service_tier": null
+      },
+      "total_calls_in_response": 1
+    },
+    "args": {
+      "outputs": {},
+      "thought": "The file has been successfully renamed from `game_2048.py` to `2048.py`. Is there anything else you need help with?"
+    },
+    "timeout": 120
+  }
+]